Update configuration_motif.py
configuration_motif.py  (+1, -75)  CHANGED
@@ -134,7 +134,6 @@ class MotifConfig(PretrainedConfig):
         sliding_window=4096,
         max_window_layers=28,
         attention_dropout=0.0,
-        multi_token_heads: Optional[int] = None,
         **kwargs,
     ):
         """
@@ -165,87 +164,14 @@
         self.rope_scaling = rope_scaling
         self.attention_dropout = attention_dropout

-        ###kwargs
-
-        # some scale factors
-
-        self.scale_emb = getattr(kwargs, "scale_emb", 1)
-        self.init_scale_o = getattr(kwargs, "init_scale_o", 1)
-
-        # muparam
-        self.hidden_states_shrink = 1 / math.sqrt(num_hidden_layers)
-        self.dim_model_base = hidden_size
-        self.dim_model_base_attn = (hidden_size // num_attention_heads)
-        self.dim_model_base_init = hidden_size
-        self.dim_model_base_lr = getattr(kwargs, "dim_model_base_lr", hidden_size//8)
-        self.dim_model_base_lmh = 1
-        self.dim_model_base_logits = hidden_size
-
-        self.muP = getattr(kwargs, "muP", False)
-        # proxy hidden size ( following YuLan-Mini )
-        # reparameterization(wesar_weights)
-        logger.info(kwargs)
-        self.wesar_weights = getattr(kwargs, "wesar_weights", False)
-        logger.info(f'initial wesar reparameterization : {self.wesar_weights}')
-
-        # alpha (scale factor)
-        self.embed_tokens_alpha = getattr(kwargs, "embed_tokens_alpha", None)
-        self.q_proj_alpha = getattr(kwargs, "q_proj_alpha", None)
-        self.k_proj_alpha = getattr(kwargs, "k_proj_alpha", None)
-        self.v_proj_alpha = getattr(kwargs, "v_proj_alpha", None)
-        self.o_proj_alpha = getattr(kwargs, "o_proj_alpha", None)
-        self.down_proj_alpha = getattr(kwargs, "down_proj_alpha", None)
-        self.gate_up_proj_alpha = getattr(kwargs, "gate_up_proj_alpha", None)
-        self.input_layernorm_alpha = getattr(kwargs, "input_layernorm_alpha", None)
-        self.post_attention_layernorm_alpha = getattr(kwargs, "post_attention_layernorm_alpha", None)
-        self.norm_alpha = getattr(kwargs, "norm_alpha", None)
-        self.lm_head_alpha = getattr(kwargs, "lm_head_alpha", None)
-        self.use_norm_alpha = getattr(kwargs, "use_norm_alpha", False)
-        self.use_emb_alpha = getattr(kwargs, "use_emb_alpha", False)
-
         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, move it to 'rope_type'.
         if self.rope_scaling is not None and "type" in self.rope_scaling:
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         rope_config_validation(self)
-
-        self.multi_token_heads = multi_token_heads
-        self.multi_token_config_validation()
-
-
-
-        # moe
-        self.topk_method = getattr(kwargs, "topk_method", None)
-        self.scoring_func = getattr(kwargs, "scoring_func", None)
-        self.routed_scaling_factor = getattr(kwargs, "routed_scaling_factor", None)
-        self.norm_topk_prob = getattr(kwargs, "norm_topk_prob", None)
-        self.seq_aux = getattr(kwargs, "seq_aux", None)
-        self.hidden_act_moe = getattr(kwargs, "hidden_act_moe", None)
-
-
-        self.n_group = getattr(kwargs, "n_group", None)
-        self.n_routed_experts = getattr(kwargs, "n_routed_experts", None)
-        self.moe_intermediate_size = getattr(kwargs, "moe_intermediate_size", None)
-        self.topk_group = getattr(kwargs, "topk_group", None)
-
-
-        self.use_fused_mlp = getattr(kwargs, "use_fused_mlp", None)
-        self.use_moreh_moe = getattr(kwargs, "use_moreh_moe", False)
-        self.continual_training = getattr(kwargs, "continual_training", False)
-
-        # external
-        self.first_expansion = getattr(kwargs, "first_expansion", False)
-        self.moe_layer = getattr(kwargs, "moe_layer", False)
-
-
-
+
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
         logger.info(f' kwargs : {kwargs}')
-        logger.info(f'after wesar reparameterization : {self.wesar_weights}')
-
-    def multi_token_config_validation(self):
-        if self.multi_token_heads is not None:
-            assert isinstance(self.multi_token_heads, int) and self.multi_token_heads >= 1
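
Editor's note on the removed block, offered as an observation rather than a description of the commit: every `getattr(kwargs, ..., default)` call there operates on the `**kwargs` dict, and `getattr` on a dict looks up attributes rather than keys, so those options always resolved to their defaults no matter what the caller passed. A minimal sketch of a dict-based lookup that would behave the way the names suggest (the helper name is hypothetical, reusing a few field names from the removed code for illustration):

# Editor's sketch, not part of the commit: reading optional fields from **kwargs.
# getattr(kwargs, "scale_emb", 1) searches for an *attribute* on the dict object
# and therefore always returns the default; dict lookups need .get()/.pop().
def read_optional_kwargs(kwargs: dict) -> dict:
    return {
        "scale_emb": kwargs.pop("scale_emb", 1),
        "wesar_weights": kwargs.pop("wesar_weights", False),
        "use_moreh_moe": kwargs.pop("use_moreh_moe", False),
    }

opts = read_optional_kwargs({"scale_emb": 12})
assert opts["scale_emb"] == 12                            # dict lookup sees the value
assert getattr({"scale_emb": 12}, "scale_emb", 1) == 1    # the removed pattern does not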
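
For context on what the trimmed constructor now does, a hypothetical usage sketch, assuming the class is imported from this module and that any remaining keyword arguments are simply forwarded to `PretrainedConfig.__init__`, as the diff shows:

# Hypothetical usage sketch; the import path assumes configuration_motif.py is on sys.path.
from configuration_motif import MotifConfig

config = MotifConfig(
    sliding_window=4096,
    max_window_layers=28,
    attention_dropout=0.0,
    tie_word_embeddings=False,
)

print(config.attention_dropout)        # 0.0 -- still an explicit field
print(hasattr(config, "scale_emb"))    # False -- the muP/wesar/MoE fields are no longer set here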