
Commit 0ce38b9

[DalleBart P0] add PretrainedConfig and unit test (PaddlePaddle#5069)
* "add PretrainedConfig and unit test " * "fix bug" * "fix bug" * "fix bug" * retest * retest * retest * retest * ”fix“
1 parent c1307db commit 0ce38b9

File tree

9 files changed, +1125 -401 lines


paddlenlp/transformers/__init__.py (+1)
@@ -181,6 +181,7 @@
 from .artist.configuration import *
 from .dallebart.modeling import *
 from .dallebart.tokenizer import *
+from .dallebart.configuration import *
 from .clip.modeling import *
 from .clip.configuration import *
 from .clip.feature_extraction import *
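
With this wildcard re-export in place, the new configuration class should be importable straight from the top-level package. A minimal sketch, assuming a PaddleNLP build that includes this commit:

from paddlenlp.transformers import DalleBartConfig  # re-exported via the line added above

config = DalleBartConfig()   # defaults mirror the "dalle-mini" preset
print(config.model_type)     # -> "dallebart"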

paddlenlp/transformers/dallebart/configuration.py (new file, +254)
@@ -0,0 +1,254 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DalleBart model configuration"""
from __future__ import annotations

from typing import Dict

from paddlenlp.transformers.configuration_utils import PretrainedConfig

__all__ = ["DALLEBART_PRETRAINED_INIT_CONFIGURATION", "DalleBartConfig", "DALLEBART_PRETRAINED_RESOURCE_FILES_MAP"]

DALLEBART_PRETRAINED_RESOURCE_FILES_MAP = {
    "model_state": {
        "dalle-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mini/model_state.pdparams",
        "dalle-mega-v16": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v16/model_state.pdparams",
        "dalle-mega-v26": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/model_state.pdparams",
        "dalle-mega": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/model_state.pdparams",
    }
}

DALLEBART_PRETRAINED_INIT_CONFIGURATION = {
    "dalle-mini": {
        "text_vocab_size": 50264,
        "image_vocab_size": 16384,
        "bos_token_id": 16384,
        "pad_token_id": 16384,
        "eos_token_id": 16384,
        "max_text_length": 64,
        "max_image_length": 256,
        "decoder_start_token_id": 16384,
        "d_model": 1024,
        "num_encoder_layers": 12,
        "num_decoder_layers": 12,
        "encoder_attention_heads": 16,
        "decoder_attention_heads": 16,
        "encoder_ffn_dim": 2730,
        "decoder_ffn_dim": 2730,
        "dropout": 0.0,
        "activation_function": "gelu",
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "use_bias": False,
        "init_std": 0.02,
    },
    "dalle-mega-v16": {
        "text_vocab_size": 50272,
        "image_vocab_size": 16415,
        "bos_token_id": 16384,
        "pad_token_id": 16384,
        "eos_token_id": 16384,
        "max_text_length": 64,
        "max_image_length": 256,
        "decoder_start_token_id": 16384,
        "d_model": 2048,
        "num_encoder_layers": 24,
        "num_decoder_layers": 24,
        "encoder_attention_heads": 32,
        "decoder_attention_heads": 32,
        "encoder_ffn_dim": 4096,
        "decoder_ffn_dim": 4096,
        "dropout": 0.0,
        "activation_function": "gelu",
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "use_bias": False,
        "init_std": 0.02,
    },
    "dalle-mega-v26": {
        "text_vocab_size": 50272,
        "image_vocab_size": 16415,
        "bos_token_id": 16384,
        "pad_token_id": 16384,
        "eos_token_id": 16384,
        "max_text_length": 64,
        "max_image_length": 256,
        "decoder_start_token_id": 16384,
        "d_model": 2048,
        "num_encoder_layers": 24,
        "num_decoder_layers": 24,
        "encoder_attention_heads": 32,
        "decoder_attention_heads": 32,
        "encoder_ffn_dim": 4096,
        "decoder_ffn_dim": 4096,
        "dropout": 0.0,
        "activation_function": "gelu",
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "use_bias": False,
        "init_std": 0.02,
    },
    "dalle-mega": {
        "text_vocab_size": 50272,
        "image_vocab_size": 16415,
        "bos_token_id": 16384,
        "pad_token_id": 16384,
        "eos_token_id": 16384,
        "max_text_length": 64,
        "max_image_length": 256,
        "decoder_start_token_id": 16384,
        "d_model": 2048,
        "num_encoder_layers": 24,
        "num_decoder_layers": 24,
        "encoder_attention_heads": 32,
        "decoder_attention_heads": 32,
        "encoder_ffn_dim": 4096,
        "decoder_ffn_dim": 4096,
        "dropout": 0.0,
        "activation_function": "gelu",
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "use_bias": False,
        "init_std": 0.02,
    },
}

class DalleBartConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`DalleBartModel`. It is used to
    instantiate a DalleBart model according to the specified arguments, defining the model architecture.
    This class inherits from :class:`~paddlenlp.transformers.configuration_utils.PretrainedConfig`.
    Refer to the superclass documentation for the generic configuration methods.

    Args:
        text_vocab_size (int):
            Vocabulary size of `inputs_ids` in `DalleBartModel`. It is also the size of the text token embedding matrix
            and defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `DalleBartModel`.
        image_vocab_size (int):
            Vocabulary size of `decoder_inputs_ids` in `DalleBartModel`. It is also the size of the image token embedding matrix
            and defines the number of different tokens that can be represented by the `decoder_inputs_ids` passed when calling `DalleBartModel`.
        bos_token_id (int, optional):
            The id of the beginning-of-image-sequence token that was used during pretraining.
            Defaults to `16384`.
        pad_token_id (int, optional):
            The index of the padding token in the image token vocabulary.
            Defaults to `16384`.
        eos_token_id (int, optional):
            The id of the special token representing the end of an image sequence.
            Defaults to `16384`.
        max_text_length (int, optional):
            The maximum dimensionality of the text position encoding, which dictates the maximum supported length of the text
            input sequence. Defaults to `64`.
        max_image_length (int, optional):
            The maximum dimensionality of the image position encoding, which dictates the maximum supported length of the image
            input sequence. Defaults to `256`.
        decoder_start_token_id (int, optional):
            The id indicating the start of the decoded image sequence. Defaults to `16384`.
        d_model (int, optional):
            Dimensionality of the embedding layer, encoder layers and decoder layers. Defaults to `1024`.
        num_encoder_layers (int, optional):
            Number of hidden layers in the :class:`DalleBartEncoder`. Defaults to `12`.
        num_decoder_layers (int, optional):
            Number of hidden layers in the :class:`DalleBartDecoder`. Defaults to `12`.
        encoder_attention_heads (int, optional):
            Number of attention heads for each attention layer in the :class:`DalleBartEncoder`.
            Defaults to `16`.
        decoder_attention_heads (int, optional):
            Number of attention heads for each attention layer in the :class:`DalleBartDecoder`.
            Defaults to `16`.
        encoder_ffn_dim (int, optional):
            Dimensionality of the Gated Linear Units (GLU) layer in the encoder. Input tensors
            to GLU layers are first projected from `d_model` to `encoder_ffn_dim`,
            and then projected back to `d_model`. Typically `encoder_ffn_dim` is larger than `d_model`.
            Defaults to `2730`.
        decoder_ffn_dim (int, optional):
            Dimensionality of the Gated Linear Units (GLU) layer in the decoder. Input tensors
            to GLU layers are first projected from `d_model` to `decoder_ffn_dim`,
            and then projected back to `d_model`. Typically `decoder_ffn_dim` is larger than `d_model`.
            Defaults to `2730`.
        dropout (float, optional):
            The dropout probability used in all fully connected layers (pre-process and post-process of the MHA and FFN sub-layers)
            in the encoder and decoder. Defaults to `0.`.
        activation_function (str, optional):
            The non-linear activation function in the GLU layer.
            ``"gelu"``, ``"relu"`` and any other activation function supported by Paddle are allowed.
            Defaults to `"gelu"`.
        attention_dropout (float, optional):
            The dropout probability used in MultiHeadAttention in all encoder and decoder layers to drop some attention targets.
            Defaults to `0.`.
        activation_dropout (float, optional):
            The dropout probability used after the GLU activation in all encoder and decoder layers.
            Defaults to `0.`.
        use_bias (bool, optional):
            Whether or not to use bias in all linear layers. Defaults to `False`.
        init_std (float, optional):
            The standard deviation of the truncated_normal_initializer used for initializing all weight matrices.
            Defaults to `0.02`.
    """
    pretrained_init_configuration = DALLEBART_PRETRAINED_INIT_CONFIGURATION
    model_type = "dallebart"
    attribute_map: Dict[str, str] = {
        "text_vocab_size": "vocab_size",
    }

    def __init__(
        self,
        vocab_size=50264,
        image_vocab_size=16384,
        bos_token_id=16384,
        pad_token_id=16384,
        eos_token_id=16384,
        max_text_length=64,
        max_image_length=256,
        decoder_start_token_id=16384,
        d_model=1024,
        num_encoder_layers=12,
        num_decoder_layers=12,
        encoder_attention_heads=16,
        decoder_attention_heads=16,
        encoder_ffn_dim=2730,
        decoder_ffn_dim=2730,
        dropout=0.0,
        activation_function="gelu",
        attention_dropout=0.0,
        activation_dropout=0.0,
        use_bias=False,
        init_std=0.02,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.image_vocab_size = image_vocab_size
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.max_text_length = max_text_length
        self.max_image_length = max_image_length
        self.d_model = d_model
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_attention_heads = decoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.decoder_ffn_dim = decoder_ffn_dim
        self.dropout = dropout
        self.activation_function = activation_function
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.use_bias = use_bias
        self.init_std = init_std
        self.pad_token_id = pad_token_id
        self.decoder_start_token_id = decoder_start_token_id
        self.text_pad_token_id = 1  # encoder pad id must be 1
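
As a quick usage sketch of the class above (assuming PaddleNLP's PretrainedConfig resolves attribute_map aliases, so that text_vocab_size reads through to vocab_size; the "tiny" override values below are made up for illustration):

from paddlenlp.transformers import DalleBartConfig

# Defaults correspond to the "dalle-mini" preset defined above.
config = DalleBartConfig()
assert config.d_model == 1024 and config.num_encoder_layers == 12
assert config.pad_token_id == 16384 and config.decoder_start_token_id == 16384

# "text_vocab_size" is an alias: attribute_map stores it under the canonical "vocab_size".
assert config.vocab_size == 50264
assert config.text_vocab_size == config.vocab_size

# Hypothetical smaller config for quick tests; keyword names follow the __init__
# signature above, the values themselves are illustrative only.
tiny = DalleBartConfig(
    d_model=256,
    num_encoder_layers=2,
    num_decoder_layers=2,
    encoder_attention_heads=4,
    decoder_attention_heads=4,
    encoder_ffn_dim=512,
    decoder_ffn_dim=512,
)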
