Update modeling_motif.py

modeling_motif.py  CHANGED  (+2 -47)
@@ -409,35 +409,6 @@ class MotifAttention(nn.Module):
         self.num_key_value_heads //= 2
         self.n_rep = self.num_heads // self.num_key_value_heads

-        ##mix attn
-
-        self.mix_attn = config.mix_attn
-
-        if self.mix_attn:
-
-            self.cq, self.ck = 6, 11
-            self.ch = 2
-
-            self.key_query_conv = nn.Conv2d(
-                in_channels=self.num_heads*2,
-                out_channels=self.num_heads*2,
-                kernel_size=(self.cq, self.ck),
-                padding="same",
-                groups=self.num_heads*2
-            )
-
-            self.head_conv = nn.Conv1d(
-                in_channels=self.num_heads,
-                out_channels=self.num_heads,
-                kernel_size=1,
-                padding=0,
-                groups=self.num_heads // self.ch
-            )
-
-            self.group_norm = nn.GroupNorm(num_groups=self.num_heads, num_channels=self.num_heads)
-
-
-
         # re-init projections
         self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
         self.k_proj = nn.Linear(self.hidden_size, self.hidden_size // self.n_rep, bias=False)
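For reference, the block removed in this hunk constructed the optional "mix attn" modules: a depthwise Conv2d over each head's attention score map, a 1x1 grouped Conv1d that mixes heads in small groups, and a per-head GroupNorm. Below is a minimal standalone sketch of those constructors; `num_heads` and the tensor sizes are made-up illustration values, not the repository's config.

```python
import torch
import torch.nn as nn

num_heads = 8        # illustrative only; the model reads this from its config
cq, ck, ch = 6, 11, 2

# Depthwise conv over the 2*num_heads score maps (differential attention keeps two
# maps per head); "same" padding preserves the (q_len, kv_len) spatial size.
key_query_conv = nn.Conv2d(
    in_channels=num_heads * 2,
    out_channels=num_heads * 2,
    kernel_size=(cq, ck),
    padding="same",
    groups=num_heads * 2,
)

# 1x1 grouped conv over the head dimension: with groups = num_heads // ch,
# each group holds ch channels, so heads are mixed within groups of two.
head_conv = nn.Conv1d(
    in_channels=num_heads,
    out_channels=num_heads,
    kernel_size=1,
    padding=0,
    groups=num_heads // ch,
)

# One group per channel: each head's output is normalized independently.
group_norm = nn.GroupNorm(num_groups=num_heads, num_channels=num_heads)

# Example application of the score-map convolution (bsz, 2*heads, q_len, kv_len).
scores = torch.randn(1, num_heads * 2, 32, 32)
print(key_query_conv(scores).shape)  # torch.Size([1, 16, 32, 32])
```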
@@ -516,12 +487,6 @@ class MotifAttention(nn.Module):
         attention_mask = torch.triu(
             torch.full((q_len, kv_seq_len), float("-inf"), dtype=attn_weights.dtype, device=attn_weights.device),
             1 + offset)
-        ##attn weights conv2d, softmax and add attention_mask
-        if self.mix_attn:
-            ## condition mask==0, value : 0
-            attn_weights = attn_weights.masked_fill( attention_mask == 0, 0)
-            attn_weights = self.key_query_conv(attn_weights)
-            attn_weights = attn_weights[:, :, :kv_seq_len, :kv_seq_len]

         ###add attn
         attn_weights = attn_weights + attention_mask
@@ -536,11 +501,6 @@ class MotifAttention(nn.Module):
         lambda_full = lambda_1 - lambda_2 + self.lambda_init
         attn_weights = attn_weights.view(bsz, self.num_heads, 2, q_len, -1)
         attn_weights = attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
-        ##head_conv
-        if self.mix_attn:
-            attn_weights = attn_weights.view(bsz, self.num_heads, -1).contiguous()
-            attn_weights = self.head_conv(attn_weights)
-            attn_weights = attn_weights.view(bsz, self.num_heads, q_len, -1).contiguous()

         ##shape : bsz, #heads, seq, head_dim
         attn_output = torch.matmul(attn_weights, value_states)
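The context lines around this hunk are the differential-attention combination that the commit keeps: the 2*num_heads score maps are viewed as pairs and the second map of each pair is subtracted with weight `lambda_full`. A toy re-enactment of those two lines, with random scores and a fixed scalar `lambda_full` assumed purely for illustration (in the model it is derived from the learned lambda parameters plus `lambda_init`):

```python
import torch

bsz, num_heads, q_len, kv_len = 1, 4, 8, 8   # illustrative sizes
lambda_full = 0.5                            # stands in for lambda_1 - lambda_2 + lambda_init

attn_weights = torch.softmax(torch.randn(bsz, num_heads * 2, q_len, kv_len), dim=-1)

# Pair up the two score maps per head and take their weighted difference,
# mirroring the retained code path above.
attn_weights = attn_weights.view(bsz, num_heads, 2, q_len, -1)
attn_weights = attn_weights[:, :, 0] - lambda_full * attn_weights[:, :, 1]
print(attn_weights.shape)  # torch.Size([1, 4, 8, 8])
```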
@@ -552,9 +512,7 @@ class MotifAttention(nn.Module):
         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim * 2):
             raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                              f" {attn_output.size()}")
-
-        attn_output = self.group_norm(attn_output)
-
+
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

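This hunk drops the `group_norm` call on the attention output. For intuition, `nn.GroupNorm` with as many groups as channels normalizes each head's slice independently over the remaining dimensions; the sketch below shows that removed behaviour in isolation, with made-up shapes:

```python
import torch
import torch.nn as nn

num_heads, q_len, head_dim = 4, 8, 16   # illustrative sizes

# One group per channel, as in the removed
# nn.GroupNorm(num_groups=self.num_heads, num_channels=self.num_heads):
# each head is normalized over its (q_len, 2 * head_dim) slice.
group_norm = nn.GroupNorm(num_groups=num_heads, num_channels=num_heads)

attn_output = torch.randn(1, num_heads, q_len, head_dim * 2)  # (bsz, heads, seq, 2*head_dim)
print(group_norm(attn_output).shape)  # torch.Size([1, 4, 8, 32])
```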
@@ -889,10 +847,7 @@ class MotifDecoderLayer(nn.Module):
             logger.warning_once(
                 f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                 "unexpected results may be encountered.")
-
-            self.self_attn = MOTIF_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-        else:
-            self.self_attn = MOTIF_ATTENTION_CLASSES["eager"](config, layer_idx)
+        self.self_attn = MOTIF_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
         self.mlp = MotifMLP(config)

         RMSNorm = MorehRMSNorm if MorehRMSNorm is not None else MotifRMSNorm
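The final hunk removes the eager fallback in the decoder layer: after this change the attention class is always looked up from `MOTIF_ATTENTION_CLASSES` by `config._attn_implementation`. A minimal sketch of that dispatch pattern, using placeholder classes rather than the real implementations:

```python
# Placeholder classes; the real table maps implementation names to the model's
# attention classes (eager, SDPA, flash-attention, ...).
class EagerAttention: ...
class SdpaAttention: ...

MOTIF_ATTENTION_CLASSES = {
    "eager": EagerAttention,
    "sdpa": SdpaAttention,
}

def build_attention(attn_implementation: str):
    # The configured key is used directly; there is no longer a separate
    # fallback branch that forces "eager".
    return MOTIF_ATTENTION_CLASSES[attn_implementation]()

print(type(build_attention("sdpa")).__name__)  # SdpaAttention
```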