Create JiRackTernaryPyTorch_70b_RopeFix.py
JiRackTernaryPyTorch_70b_RopeFix.py
ADDED

#%%writefile JiRackTernaryPyTorch_70b.py
# =============================================================================
# COPYRIGHT © 2025 Konstantin Vladimirovich Grabko. ALL RIGHTS RESERVED.
# CMS Manhattan JiRack Technology — PATENT PENDING
# =============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Union
from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast
from torch.utils.checkpoint import checkpoint

class JiRackTernaryConfig(PretrainedConfig):
    model_type = "jirack_ternary_70b"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Geometry is pinned to the Llama-3-70B shape (set after
        # super().__init__, so these override anything passed in kwargs).
        self.vocab_size = 128256
        self.hidden_size = 8192
        self.intermediate_size = 28672
        self.num_hidden_layers = 80
        self.num_attention_heads = 64
        self.num_key_value_heads = 8
        self.head_dim = 128
        self.rms_norm_eps = 1e-5
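
# --- Back-of-envelope note (ours, not in the original file): this geometry
# implies roughly 70e9 linear-layer weights. Packed 4 ternary values per byte
# they take about 70e9 / 4 bytes ~= 17.5 GB, versus ~140 GB in fp16.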

class JiRackBitLinear(nn.Module):
    """Linear layer with ternary (1.58-bit) weights packed 4-per-byte."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.register_buffer("packed", None)
        self.register_buffer("scale", None)
        self.register_buffer("orig_shape", None)

    def unpack(self):
        if self.packed is None:
            return None
        p = self.packed.to(torch.int32)
        # Four 2-bit fields per byte, big-endian within the byte.
        b = torch.stack([(p >> 6) & 3, (p >> 4) & 3, (p >> 2) & 3, p & 3], dim=1).view(-1)
        shape = self.orig_shape if self.orig_shape is not None else torch.tensor([self.out_features, self.in_features])
        # Ternary unpacking: {0, 1, 2} -> {-1, 0, +1}
        w = (b[:shape.numel()].to(torch.float16) - 1.0)
        return w.view(int(shape[0]), int(shape[1])) * self.scale

    def forward(self, x):
        w = self.unpack()
        if w is None:
            return F.linear(x, torch.zeros(self.out_features, self.in_features, device=x.device, dtype=x.dtype))

        # Activation quantization (BitNet style): center, scale to [-1, 1],
        # then undo the scaling after the matmul.
        x_norm = x - x.mean(dim=-1, keepdim=True)
        x_scale = x_norm.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        return F.linear((x_norm / x_scale).to(w.dtype), w) * (x_scale * 0.67)
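
# --- Illustrative helper (ours, not part of the original file): the inverse of
# JiRackBitLinear.unpack(). A minimal sketch assuming weights are already
# ternarized to {-1, 0, +1}; the name pack_ternary is hypothetical.
def pack_ternary(w_ternary: torch.Tensor) -> torch.Tensor:
    # Shift {-1, 0, +1} -> {0, 1, 2} so each value fits in 2 bits.
    v = w_ternary.flatten().to(torch.int32) + 1
    # Pad to a multiple of 4 values (4 x 2 bits = 1 byte); unpack() trims the
    # padding via b[:shape.numel()].
    if v.numel() % 4:
        v = torch.cat([v, v.new_zeros(4 - v.numel() % 4)])
    v = v.view(-1, 4)
    # Big-endian within the byte, matching unpack's (>>6, >>4, >>2, >>0) order.
    return ((v[:, 0] << 6) | (v[:, 1] << 4) | (v[:, 2] << 2) | v[:, 3]).to(torch.uint8)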

class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps, self.weight = eps, nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Normalize in float32 for numerical stability, then cast back.
        v = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
        return (x.to(torch.float32) * torch.rsqrt(v + self.eps) * self.weight.to(torch.float32)).to(x.dtype)

def apply_rotary_emb(x, freqs):
    # x:     [bsz, heads, seq, head_dim]
    # freqs: [seq, head_dim / 2]
    cos = freqs.cos().view(1, 1, freqs.shape[0], freqs.shape[1])
    sin = freqs.sin().view(1, 1, freqs.shape[0], freqs.shape[1])

    # Stable RoPE: split the head dim in half and rotate the pairs.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]

    # Math: x * cos + rotate_half(x) * sin
    rotated_x = torch.cat([-x2, x1], dim=-1)
    # Broadcast cos/sin across the full head_dim.
    cos_full = torch.cat([cos, cos], dim=-1)
    sin_full = torch.cat([sin, sin], dim=-1)

    return (x * cos_full) + (rotated_x * sin_full)
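
# --- Illustrative sanity check (ours, not in the original file): rotate-half
# RoPE is a pure rotation, so position 0 is the identity (cos=1, sin=0) and
# per-position norms are preserved. Call manually to verify the RoPE fix.
def _rope_sanity_check():
    x = torch.randn(1, 1, 4, 128)
    inv = 1.0 / (500000.0 ** (torch.arange(0, 128, 2).float() / 128))
    freqs = torch.outer(torch.arange(4, dtype=torch.float32), inv)
    r = apply_rotary_emb(x, freqs)
    assert torch.allclose(r[:, :, 0], x[:, :, 0], atol=1e-5)          # identity at position 0
    assert torch.allclose(r.norm(dim=-1), x.norm(dim=-1), atol=1e-4)  # rotation preserves norms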

class JiRackAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = JiRackBitLinear(8192, 8192)   # 64 query heads x 128
        self.k_proj = JiRackBitLinear(8192, 1024)   # 8 KV heads x 128
        self.v_proj = JiRackBitLinear(8192, 1024)
        self.o_proj = JiRackBitLinear(8192, 8192)

    def forward(self, x, freqs):
        bsz, q_len, _ = x.shape
        q = self.q_proj(x).view(bsz, q_len, 64, 128).transpose(1, 2)
        k = self.k_proj(x).view(bsz, q_len, 8, 128).transpose(1, 2)
        v = self.v_proj(x).view(bsz, q_len, 8, 128).transpose(1, 2)

        q, k = apply_rotary_emb(q, freqs), apply_rotary_emb(k, freqs)

        # GQA (grouped-query attention) for the 70B geometry: replicate each
        # of the 8 KV heads 8x to match the 64 query heads.
        k = k.repeat_interleave(8, dim=1)
        v = v.repeat_interleave(8, dim=1)

        attn = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        return self.o_proj(attn.transpose(1, 2).reshape(bsz, q_len, -1))
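
# --- Optional variant (ours, assumes PyTorch >= 2.5): scaled_dot_product_attention
# can broadcast the 8 KV heads itself via enable_gqa=True, which avoids the
# repeat_interleave copies above:
#   attn = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=True)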

class JiRackDecoderLayer(nn.Module):
    def __init__(self, layer_idx):
        super().__init__()
        self.self_attn = JiRackAttention()
        self.gate_proj = JiRackBitLinear(8192, 28672)
        self.up_proj = JiRackBitLinear(8192, 28672)
        self.down_proj = JiRackBitLinear(28672, 8192)
        self.input_layernorm = RMSNorm(8192)
        self.post_attention_layernorm = RMSNorm(8192)

    def forward(self, x, freqs):
        h = self.self_attn(self.input_layernorm(x), freqs)
        # Damped residual: each sublayer contributes with weight 0.4.
        x = x + h * 0.4
        # SwiGLU MLP (compute the post-attention norm once, not twice)
        mlp_in = self.post_attention_layernorm(x)
        mlp_res = F.silu(self.gate_proj(mlp_in)) * self.up_proj(mlp_in)
        return x + self.down_proj(mlp_res) * 0.4

class JiRackTernary70B(PreTrainedModel, GenerationMixin):
    config_class = JiRackTernaryConfig

    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(128256, 8192)
        self.layers = nn.ModuleList([JiRackDecoderLayer(i) for i in range(80)])
        self.norm = RMSNorm(8192)
        self.lm_head = nn.Linear(8192, 128256, bias=False)

        # RoPE inverse frequencies (Llama 3 rope base = 500000)
        inv_freq = 1.0 / (500000.0 ** (torch.arange(0, 128, 2).float() / 128))
        self.register_buffer("inv_freq", inv_freq)
        self.use_gc = False  # gradient checkpointing toggle

    def forward(self, input_ids, **kwargs):
        x = self.embed_tokens(input_ids)
        # Compute the rotary frequencies in float32 for precision.
        t = torch.arange(x.shape[1], device=x.device, dtype=torch.float32)
        freqs = torch.outer(t, self.inv_freq.to(torch.float32))

        for layer in self.layers:
            if self.training and self.use_gc:
                x = checkpoint(layer, x, freqs, use_reentrant=False)
            else:
                x = layer(x, freqs)

        # Final logit scaling of 0.8 (the baseline setting for the 70B).
        return CausalLMOutputWithPast(logits=self.lm_head(self.norm(x)) * 0.8)
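
# --- Illustrative smoke test (ours, not in the original file): the model is
# 70B-scale, so build it on the "meta" device to check the wiring without
# allocating real weights.
if __name__ == "__main__":
    cfg = JiRackTernaryConfig()
    with torch.device("meta"):
        model = JiRackTernary70B(cfg)
    # Only embeddings, norms and lm_head are nn.Parameters; the ternary weights
    # live in packed uint8 buffers, so this count excludes them.
    n_params = sum(p.numel() for p in model.parameters())
    print(f"dense parameters: {n_params / 1e9:.2f}B")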