"""
BEATRIX FLOW-MATCHING - CIFAR-10 (T5 Text Encoder)
===================================================

SD 1.5 VAE + Flan-T5-Large text encoder
Dual tower collectives: vision towers + text towers

Text prompts for CIFAR-10 classes:
    "a photo of an airplane"
    "a photo of an automobile"
    etc.

Requirements:
    pip install transformers diffusers torchvision tqdm
    pip install git+https://github.com/AbstractEyes/geofractal

Currently running like a turtle, will optimize tomorrow.

apache 2.0 license
"""

from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Dict, Tuple, Optional, List
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from torchvision.utils import make_grid, save_image
from huggingface_hub import HfApi, upload_file, create_repo
import json
from tqdm import tqdm

# =============================================================================
# GEOFRACTAL IMPORTS
# =============================================================================

from geofractal.router.wide_router import WideRouter
from geofractal.router.prefab.agatha.beatrix_tension_oscillator import (
    BeatrixOscillator,
    ScheduleType,
)
from geofractal.router.prefab.geometric_tower_builder import (
    TowerConfig,
    FusionType,
    ConfigurableCollective,
    build_tower_collective,
    preset_pos_neg_pairs,
)
from geofractal.router.prefab.geometric_conv_tower_builder import (
    ConvTowerConfig,
    ConvTowerCollective,
    build_conv_collective,
    preset_conv_pos_neg,
)


# =============================================================================
# CIFAR-10 CLASS PROMPTS
# =============================================================================

CIFAR10_PROMPTS = [
    "a photo of an airplane",
    "a photo of an automobile",
    "a photo of a bird",
    "a photo of a cat",
    "a photo of a deer",
    "a photo of a dog",
    "a photo of a frog",
    "a photo of a horse",
    "a photo of a ship",
    "a photo of a truck",
]


# =============================================================================
# SD 1.5 VAE
# =============================================================================

class SD15VAE(nn.Module):
    def __init__(self, freeze: bool = True):
        super().__init__()
        from diffusers import AutoencoderKL
        
        self.vae = AutoencoderKL.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            subfolder="vae",
            torch_dtype=torch.float32,
        )
        
        if freeze:
            self.vae.eval()
            for p in self.vae.parameters():
                p.requires_grad = False
        
        self.scale_factor = 0.18215
    
    @torch.no_grad()
    def encode(self, x: Tensor) -> Tensor:
        return self.vae.encode(x).latent_dist.sample() * self.scale_factor
    
    @torch.no_grad()
    def decode(self, z: Tensor) -> Tensor:
        return self.vae.decode(z / self.scale_factor).sample


# =============================================================================
# FLAN-T5-LARGE TEXT ENCODER
# =============================================================================

class T5TextEncoder(nn.Module):
    """Flan-T5 encoder with bottleneck projection."""
    
    def __init__(
        self, 
        model_name: str = "google/flan-t5-xl", 
        freeze: bool = True, 
        max_length: int = 77,
        bottleneck_dim: int = 256,
    ):
        super().__init__()
        from transformers import T5EncoderModel, T5Tokenizer
        
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        self.max_length = max_length
        self.raw_dim = self.encoder.config.d_model  # 2048 for XL
        self.output_dim = bottleneck_dim
        
        # Bottleneck projection
        self.bottleneck = nn.Sequential(
            nn.Linear(self.raw_dim, bottleneck_dim),
            nn.GELU(),
            nn.Linear(bottleneck_dim, bottleneck_dim),
        )
        
        if freeze:
            self.encoder.eval()
            for p in self.encoder.parameters():
                p.requires_grad = False
        # Note: bottleneck stays trainable during cache build, but we detach outputs
    
    @torch.no_grad()
    def forward(self, texts: List[str], device: torch.device) -> Tuple[Tensor, Tensor]:
        """
        Encode text prompts with bottleneck.
        
        Returns:
            sequence: [B, L, bottleneck_dim] - compressed sequence embeddings
            pooled: [B, bottleneck_dim] - compressed mean pooled embedding
        """
        tokens = self.tokenizer(
            texts,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        
        input_ids = tokens.input_ids.to(device)
        attention_mask = tokens.attention_mask.to(device)
        
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence_raw = outputs.last_hidden_state  # [B, L, raw_dim]
        
        # Apply bottleneck
        sequence = self.bottleneck(sequence_raw)  # [B, L, bottleneck_dim]
        
        # Mean pool over non-padding tokens
        mask_expanded = attention_mask.unsqueeze(-1).float()
        pooled = (sequence * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1)
        
        return sequence, pooled
    
    @torch.no_grad()
    def encode_raw(self, texts: List[str], device: torch.device) -> Tuple[Tensor, Tensor]:
        """
        Encode text prompts WITHOUT bottleneck (for caching raw embeddings).
        
        Returns:
            sequence: [B, L, raw_dim] - raw T5 embeddings
            pooled: [B, raw_dim] - raw mean pooled embedding
        """
        tokens = self.tokenizer(
            texts,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        
        input_ids = tokens.input_ids.to(device)
        attention_mask = tokens.attention_mask.to(device)
        
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence = outputs.last_hidden_state  # [B, L, raw_dim]
        
        # Mean pool over non-padding tokens
        mask_expanded = attention_mask.unsqueeze(-1).float()
        pooled = (sequence * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1)
        
        return sequence, pooled


# =============================================================================
# CACHED DATASET (VAE latents + T5 text embeddings per class)
# =============================================================================

class CachedCIFAR10T5(Dataset):
    """
    Pre-cached CIFAR-10 with VAE latents.
    T5 embeddings are computed per-class (not per-image).
    """
    
    T5_MODEL = "google/flan-t5-xl"  # Change this to use different T5 variant
    
    def __init__(
        self,
        train: bool = True,
        image_size: int = 256,
        cache_dir: str = "./cache",
        device: str = "cuda",
    ):
        self.train = train
        # Include T5 model name in cache path
        t5_suffix = self.T5_MODEL.replace("/", "_")
        self.cache_path = Path(cache_dir) / f"cifar10_{t5_suffix}_{'train' if train else 'val'}_{image_size}.pt"
        
        if self.cache_path.exists():
            print(f"Loading cache: {self.cache_path}")
            cache = torch.load(self.cache_path, weights_only=False)
            self.latents = cache['latents']
            self.labels = cache['labels']
            self.text_sequence = cache['text_sequence']  # [10, L, dim]
            self.text_pooled = cache['text_pooled']      # [10, dim]
            self.text_dim = cache.get('text_dim', self.text_pooled.shape[-1])
        else:
            print(f"Building cache for {'train' if train else 'val'} set...")
            self._build_cache(image_size, device)
    
    def _build_cache(self, image_size: int, device: str):
        # Load encoders
        print("  Loading VAE...")
        vae = SD15VAE(freeze=True).to(device)
        print(f"  Loading T5 ({self.T5_MODEL})...")
        t5 = T5TextEncoder(model_name=self.T5_MODEL, freeze=True).to(device)
        
        # Encode class prompts - save RAW embeddings (bottleneck is in model)
        print(f"  Encoding text prompts (T5 raw_dim={t5.raw_dim})...")
        text_seq, text_pool = t5.encode_raw(CIFAR10_PROMPTS, device)
        self.text_sequence = text_seq.cpu()  # [10, L, raw_dim]
        self.text_pooled = text_pool.cpu()   # [10, raw_dim]
        self.text_dim = t5.raw_dim  # Store raw dim for bottleneck sizing
        
        # Encode images
        transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ])
        
        dataset = datasets.CIFAR10('./data', train=self.train, download=True, transform=transform)
        loader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
        
        all_latents, all_labels = [], []
        
        print("  Encoding images...")
        with torch.no_grad():
            for images, labels in tqdm(loader, desc="  Caching", leave=False):
                images = images.to(device)
                all_latents.append(vae.encode(images).cpu())
                all_labels.append(labels)
        
        self.latents = torch.cat(all_latents, dim=0)
        self.labels = torch.cat(all_labels, dim=0)
        
        del vae, t5
        torch.cuda.empty_cache()
        
        # Save
        self.cache_path.parent.mkdir(parents=True, exist_ok=True)
        torch.save({
            'latents': self.latents,
            'labels': self.labels,
            'text_sequence': self.text_sequence,
            'text_pooled': self.text_pooled,
            'text_dim': self.text_dim,
        }, self.cache_path)
        print(f"  Saved: {self.cache_path}")
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        label = self.labels[idx]
        return (
            self.latents[idx],
            self.text_sequence[label],  # [L, raw_dim]
            self.text_pooled[label],    # [raw_dim]
            label,
        )


# =============================================================================
# SINUSOIDAL EMBEDDING
# =============================================================================

class SinusoidalEmbed(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.dim = dim

    def forward(self, t: Tensor) -> Tensor:
        half = self.dim // 2
        freqs = torch.exp(-math.log(10000) * torch.arange(half, device=t.device) / half)
        args = t.unsqueeze(-1) * freqs
        return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)


# =============================================================================
# CONFIG
# =============================================================================

@dataclass
class FlowConfig:
    image_size: int = 256
    num_classes: int = 10
    latent_channels: int = 4
    latent_size: int = 32
    
    # T5 dimensions
    text_raw_dim: int = 2048  # Raw T5-XL output, overridden by dataset
    text_seq_len: int = 77
    bottleneck_dim: int = 256  # Compressed text dim
    
    # Tower collective (transformer-based)
    tower_dim: int = 256
    tower_depth: int = 2
    num_heads: int = 8
    geometric_types: Tuple[str, ...] = ('cantor', 'beatrix', 'helix', 'simplex')
    
    # Conv tower types (convolutional)
    conv_types: Tuple[str, ...] = ('wide_resnet', 'frequency', 'bottleneck', 'squeeze_excite')
    conv_spatial_size: int = 8  # Spatial size for conv towers
    
    # Oscillator
    manifold_dim: int = 1024  # Projected manifold (smaller than latent)
    num_tower_pairs: int = 16  # 32 towers / 2
    osc_steps: int = 50  # For sampling only
    fingerprint_dim: int = 64
    
    # Flow
    num_flow_steps: int = 50
    sigma_min: float = 0.001
    
    # Training
    batch_size: int = 64
    lr: float = 1e-4
    weight_decay: float = 0.01
    num_epochs: int = 100
    
    cache_dir: str = "./cache"
    device: str = "cuda"
    output_dir: str = "./beatrix_cifar_t5"
    
    @property
    def latent_flat_dim(self) -> int:
        """Full flattened latent size: 4 × 32 × 32 = 4096"""
        return self.latent_channels * self.latent_size * self.latent_size


# =============================================================================
# BEATRIX FLOW MODEL (Vision + Text Towers)
# =============================================================================

class BeatrixFlowT5(WideRouter):
    """
    Flow model with dual tower collectives per modality:
    
    Vision side:
        - Geometric towers (transformer): cantor, beatrix, helix, simplex (pos/neg)
        - Conv towers: wide_resnet, frequency, bottleneck, squeeze_excite (pos/neg)
        
    Text side (mirrored):
        - Geometric towers (transformer): cantor, beatrix, helix, simplex (pos/neg)
        - Conv towers: wide_resnet, frequency, bottleneck, squeeze_excite (pos/neg)
    
    All towers output opinions that combine for velocity prediction.
    """
    
    def __init__(self, cfg: FlowConfig):
        super().__init__(name='beatrix_flow_t5', strict=False, auto_discover=False)
        self.objects['cfg'] = cfg
        
        # =================================================================
        # TEXT BOTTLENECK (trainable)
        # =================================================================
        self.attach('text_bottleneck_seq', nn.Sequential(
            nn.Linear(cfg.text_raw_dim, cfg.bottleneck_dim),
            nn.GELU(),
            nn.Linear(cfg.bottleneck_dim, cfg.bottleneck_dim),
        ))
        self.attach('text_bottleneck_pool', nn.Sequential(
            nn.Linear(cfg.text_raw_dim, cfg.bottleneck_dim),
            nn.GELU(),
            nn.Linear(cfg.bottleneck_dim, cfg.bottleneck_dim),
        ))
        
        # =================================================================
        # VISION GEOMETRIC TOWERS (pos/neg pairs)
        # =================================================================
        vision_geo_configs = preset_pos_neg_pairs(list(cfg.geometric_types))
        
        vision_geo_collective = build_tower_collective(
            configs=vision_geo_configs,
            dim=cfg.tower_dim,
            default_depth=cfg.tower_depth,
            num_heads=cfg.num_heads,
            ffn_mult=4.0,
            dropout=0.1,
            fingerprint_dim=cfg.fingerprint_dim,
            fusion_type='adaptive',
            name='vision_geo',
        )
        self.attach('vision_geo', vision_geo_collective)
        
        # =================================================================
        # VISION CONV TOWERS (pos/neg pairs)
        # =================================================================
        vision_conv_configs = preset_conv_pos_neg(list(cfg.conv_types))
        
        vision_conv_collective = build_conv_collective(
            configs=vision_conv_configs,
            dim=cfg.tower_dim,
            default_depth=cfg.tower_depth,
            fingerprint_dim=cfg.fingerprint_dim,
            spatial_size=cfg.conv_spatial_size,
            name='vision_conv',
        )
        self.attach('vision_conv', vision_conv_collective)
        
        # =================================================================
        # TEXT GEOMETRIC TOWERS (pos/neg pairs) - MIRRORED
        # =================================================================
        text_geo_configs = preset_pos_neg_pairs(list(cfg.geometric_types))
        
        text_geo_collective = build_tower_collective(
            configs=text_geo_configs,
            dim=cfg.tower_dim,
            default_depth=cfg.tower_depth,
            num_heads=cfg.num_heads,
            ffn_mult=4.0,
            dropout=0.1,
            fingerprint_dim=cfg.fingerprint_dim,
            fusion_type='adaptive',
            name='text_geo',
        )
        self.attach('text_geo', text_geo_collective)
        
        # =================================================================
        # TEXT CONV TOWERS (pos/neg pairs) - MIRRORED
        # =================================================================
        text_conv_configs = preset_conv_pos_neg(list(cfg.conv_types))
        
        text_conv_collective = build_conv_collective(
            configs=text_conv_configs,
            dim=cfg.tower_dim,
            default_depth=cfg.tower_depth,
            fingerprint_dim=cfg.fingerprint_dim,
            spatial_size=cfg.conv_spatial_size,
            name='text_conv',
        )
        self.attach('text_conv', text_conv_collective)
        
        # =================================================================
        # PROJECTIONS
        # =================================================================
        # Latent patchifier
        patch_size = 4
        num_patches = (cfg.latent_size // patch_size) ** 2
        patch_dim = cfg.latent_channels * patch_size * patch_size
        
        self.attach('patch_proj', nn.Linear(patch_dim, cfg.tower_dim))
        self.patch_pos_embed = nn.Parameter(torch.randn(1, num_patches, cfg.tower_dim) * 0.02)
        self.objects['patch_size'] = patch_size
        self.objects['num_patches'] = num_patches
        
        # Text already at bottleneck_dim (256) = tower_dim, no extra projection needed
        
        # =================================================================
        # OSCILLATOR (for sampling)
        # =================================================================
        # Total towers: (4 geo + 4 conv) × pos/neg × 2 modalities = 32 towers
        num_geo_towers = len(vision_geo_configs)
        num_conv_towers = len(vision_conv_configs)
        total_towers = (num_geo_towers + num_conv_towers) * 2  # × 2 for vision + text
        
        oscillator = BeatrixOscillator(
            name='oscillator',
            manifold_dim=cfg.manifold_dim,
            tower_dim=cfg.tower_dim,
            num_tower_pairs=total_towers // 2,
            num_theta_probes=4,
            fingerprint_dim=cfg.fingerprint_dim,
            kappa_schedule=ScheduleType.TESLA_369,
            use_intrinsic_tension=True,
        )
        self.attach('oscillator', oscillator)
        
        # =================================================================
        # CONDITIONING
        # =================================================================
        # Time embedding
        time_embed = nn.Sequential(
            SinusoidalEmbed(256),
            nn.Linear(256, cfg.tower_dim),
            nn.GELU(),
            nn.Linear(cfg.tower_dim, cfg.tower_dim),
        )
        self.attach('time_embed', time_embed)
        
        # Bottlenecked text -> reference anchor
        self.attach('text_to_ref', nn.Sequential(
            nn.Linear(cfg.bottleneck_dim, cfg.manifold_dim),
            nn.GELU(),
            nn.Linear(cfg.manifold_dim, cfg.manifold_dim),
        ))
        
        # Time modulation for reference
        self.attach('time_to_ref', nn.Linear(cfg.tower_dim, cfg.manifold_dim))
        
        # =================================================================
        # LATENT PROJECTION (4096 <-> manifold_dim)
        # =================================================================
        self.attach('latent_down', nn.Linear(cfg.latent_flat_dim, cfg.manifold_dim))
        self.attach('latent_up', nn.Linear(cfg.manifold_dim, cfg.latent_flat_dim))
        
        # Learnable velocity mixing
        self.velocity_mix = nn.Parameter(torch.tensor(0.5))
    
    def patchify(self, z: Tensor) -> Tensor:
        """[B, 4, 32, 32] -> [B, num_patches, tower_dim]"""
        B, C, H, W = z.shape
        p = self.objects['patch_size']
        
        z = z.unfold(2, p, p).unfold(3, p, p)
        z = z.permute(0, 2, 3, 1, 4, 5).contiguous()
        z = z.view(B, -1, C * p * p)
        
        return self['patch_proj'](z) + self.patch_pos_embed
    
    def get_tower_outputs(self, z: Tensor, text_seq: Tensor) -> List[Tensor]:
        """
        Run all four tower collectives.
        Returns list of tower opinions [B, tower_dim] (32 total).
        """
        patches = self.patchify(z)
        text_bottlenecked = self['text_bottleneck_seq'](text_seq)
        
        # Run all collectives
        vision_geo = self['vision_geo'](patches)
        vision_conv_fused, vision_conv_ops = self['vision_conv'](patches)
        text_geo = self['text_geo'](text_bottlenecked)
        text_conv_fused, text_conv_ops = self['text_conv'](text_bottlenecked)
        
        # Collect opinions - use list comprehension (faster than append loop)
        return (
            [op.opinion for op in vision_geo.opinions.values()] +
            list(vision_conv_ops.values()) +
            [op.opinion for op in text_geo.opinions.values()] +
            list(text_conv_ops.values())
        )
    
    def forward(
        self,
        z_0: Tensor,
        text_seq: Tensor,
        text_pooled: Tensor,
        labels: Tensor,
        t: Optional[Tensor] = None,
    ) -> Dict[str, Tensor]:
        """Training forward - single step velocity prediction."""
        cfg = self.objects['cfg']
        B = z_0.shape[0]
        device = z_0.device
        
        if t is None:
            t = torch.rand(B, device=device)
        
        # Flatten latent [B, 4, 32, 32] -> [B, 4096]
        z_0_flat = z_0.flatten(1)
        
        # Noise + interpolate in full latent space
        eps = torch.randn_like(z_0)
        eps_flat = eps.flatten(1)
        t_exp = t.view(B, 1, 1, 1)
        z_t = (1 - t_exp) * z_0 + t_exp * eps
        z_t_flat = z_t.flatten(1)
        
        # Target velocity (in full latent space)
        v_target = eps_flat - z_0_flat
        
        # === PROJECT TO SMALLER MANIFOLD ===
        z_t_proj = self['latent_down'](z_t_flat)  # [B, 4096] -> [B, manifold_dim]
        
        # Bottleneck pooled text for reference
        text_pooled_bn = self['text_bottleneck_pool'](text_pooled)
        
        # Reference from bottlenecked text + time (in manifold space)
        time_emb = self['time_embed'](t)
        x_ref = self['text_to_ref'](text_pooled_bn) + self['time_to_ref'](time_emb)
        
        # Get all tower outputs (text_seq bottlenecked inside get_tower_outputs)
        tower_outputs = self.get_tower_outputs(z_t, text_seq)
        
        # Compute forces in manifold space
        osc = self['oscillator']
        tower_force, _ = osc.force_generator(z_t_proj, tower_outputs, state_fingerprint=None)
        spring_force = x_ref - z_t_proj
        
        # Velocity prediction in manifold space
        tau = torch.sigmoid(self.velocity_mix)
        v_pred_proj = (1 - tau) * spring_force + tau * tower_force
        
        # === PROJECT BACK TO FULL LATENT ===
        v_pred = self['latent_up'](v_pred_proj)  # [B, manifold_dim] -> [B, 4096]
        
        loss = F.mse_loss(v_pred, v_target)
        
        return {'loss': loss, 'tau': tau.detach()}
    
    @torch.no_grad()
    def sample(
        self,
        text_seq: Tensor,
        text_pooled: Tensor,
        vae: SD15VAE,
        num_steps: Optional[int] = None,
    ) -> Tensor:
        """Generate samples from text conditioning."""
        cfg = self.objects['cfg']
        B = text_seq.shape[0]
        device = text_seq.device
        num_steps = num_steps or cfg.num_flow_steps
        
        # Bottleneck pooled text once
        text_pooled_bn = self['text_bottleneck_pool'](text_pooled)
        
        # Start from noise
        z = torch.randn(B, cfg.latent_channels, cfg.latent_size, cfg.latent_size, device=device)
        
        dt = 1.0 / num_steps
        
        for step in range(num_steps):
            t_val = 1.0 - step * dt
            t = torch.full((B,), t_val, device=device)
            
            time_emb = self['time_embed'](t)
            x_ref = self['text_to_ref'](text_pooled_bn) + self['time_to_ref'](time_emb)
            
            z_flat = z.flatten(1)
            
            # Project to manifold
            z_proj = self['latent_down'](z_flat)
            
            tower_outputs = self.get_tower_outputs(z, text_seq)
            
            osc = self['oscillator']
            tower_force, _ = osc.force_generator(z_proj, tower_outputs, state_fingerprint=None)
            spring_force = x_ref - z_proj
            
            tau = torch.sigmoid(self.velocity_mix)
            v_pred_proj = (1 - tau) * spring_force + tau * tower_force
            
            # Project back and update
            v_pred = self['latent_up'](v_pred_proj)
            z_flat = z_flat - dt * v_pred
            z = z_flat.view(B, cfg.latent_channels, cfg.latent_size, cfg.latent_size)
        
        return vae.decode(z)


# =============================================================================
# TRAINER
# =============================================================================

class Trainer:
    def __init__(self, cfg: FlowConfig):
        self.cfg = cfg
        self.device = torch.device(cfg.device if torch.cuda.is_available() else "cpu")
        self.output_dir = Path(cfg.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        if torch.cuda.is_available():
            torch.backends.cudnn.benchmark = True
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
        
        self.scaler = torch.amp.GradScaler('cuda')
        
        # Dataset
        print("\n=== Building Cached Datasets ===")
        self.train_dataset = CachedCIFAR10T5(train=True, image_size=cfg.image_size, cache_dir=cfg.cache_dir, device=cfg.device)
        self.val_dataset = CachedCIFAR10T5(train=False, image_size=cfg.image_size, cache_dir=cfg.cache_dir, device=cfg.device)
        
        # Update config with actual T5 raw dimension from cache
        cfg.text_raw_dim = self.train_dataset.text_dim
        print(f"T5 raw dimension: {cfg.text_raw_dim} → bottleneck: {cfg.bottleneck_dim}")
        
        self.train_loader = DataLoader(self.train_dataset, batch_size=cfg.batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True)
        self.val_loader = DataLoader(self.val_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=0, pin_memory=True)
        
        # Store raw text embeddings for sampling (bottleneck applied in model)
        self.text_sequence = self.train_dataset.text_sequence.to(self.device)  # [10, L, raw_dim]
        self.text_pooled = self.train_dataset.text_pooled.to(self.device)  # [10, raw_dim]
        
        # Model
        print("\n=== Building Model (Vision + Text Towers) ===")
        self.model = BeatrixFlowT5(cfg).to(self.device)
        
        # Compile
        if hasattr(torch, 'compile'):
            print("Compiling with WideRouter.prepare_and_compile()...")
            self.model = self.model.prepare_and_compile(
                mode="reduce-overhead",
                fullgraph=False,
            )
        
        num_params = sum(p.numel() for p in self.model.parameters())
        print(f"Trainable parameters: {num_params:,}")
        
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=cfg.num_epochs * len(self.train_loader))
        
        # Load most recent checkpoint if exists
        self.start_epoch = 0
        self.hf_repo = "AbstractPhil/beatrix-diffusion-proto"
        self._load_latest_checkpoint()
        
        self._vae = None
        
        # HuggingFace Hub setup
        self._setup_hf_repo()
    
    def _setup_hf_repo(self):
        """Create HF repo if needed and save initial config."""
        try:
            self.hf_api = HfApi()
            create_repo(self.hf_repo, exist_ok=True, repo_type="model")
            print(f"HF repo: {self.hf_repo}")
            
            # Save config
            config_dict = {
                'image_size': self.cfg.image_size,
                'num_classes': self.cfg.num_classes,
                'latent_channels': self.cfg.latent_channels,
                'latent_size': self.cfg.latent_size,
                'text_raw_dim': self.cfg.text_raw_dim,
                'bottleneck_dim': self.cfg.bottleneck_dim,
                'tower_dim': self.cfg.tower_dim,
                'tower_depth': self.cfg.tower_depth,
                'num_heads': self.cfg.num_heads,
                'geometric_types': self.cfg.geometric_types,
                'conv_types': self.cfg.conv_types,
                'conv_spatial_size': self.cfg.conv_spatial_size,
                'manifold_dim': self.cfg.manifold_dim,
                'fingerprint_dim': self.cfg.fingerprint_dim,
                'num_flow_steps': self.cfg.num_flow_steps,
            }
            config_path = self.output_dir / "config.json"
            with open(config_path, 'w') as f:
                json.dump(config_dict, f, indent=2)
            
            upload_file(
                path_or_fileobj=str(config_path),
                path_in_repo="config.json",
                repo_id=self.hf_repo,
            )
        except Exception as e:
            print(f"HF setup warning: {e}")
            self.hf_api = None
    
    def _upload_to_hf(self, epoch: int, sample_path: Path, metrics: dict = None):
        """Upload checkpoint, samples, and metrics to HuggingFace."""
        if self.hf_api is None:
            return
        
        try:
            # Upload checkpoint
            ckpt_path = self.output_dir / "ckpt_latest.pt"
            if ckpt_path.exists():
                upload_file(
                    path_or_fileobj=str(ckpt_path),
                    path_in_repo="ckpt_latest.pt",
                    repo_id=self.hf_repo,
                )
            
            # Upload samples
            if sample_path.exists():
                upload_file(
                    path_or_fileobj=str(sample_path),
                    path_in_repo=f"samples/epoch_{epoch:03d}.png",
                    repo_id=self.hf_repo,
                )
                # Also as latest
                upload_file(
                    path_or_fileobj=str(sample_path),
                    path_in_repo="samples/latest.png",
                    repo_id=self.hf_repo,
                )
            
            # Upload metrics log
            if metrics:
                metrics_path = self.output_dir / "metrics.jsonl"
                with open(metrics_path, 'a') as f:
                    f.write(json.dumps({'epoch': epoch, **metrics}) + '\n')
                upload_file(
                    path_or_fileobj=str(metrics_path),
                    path_in_repo="metrics.jsonl",
                    repo_id=self.hf_repo,
                )
            
            print(f"  → Uploaded to HF")
        except Exception as e:
            print(f"  → HF upload failed: {e}")
    
    def _load_latest_checkpoint(self):
        """Load most recent checkpoint if available (local or HF)."""
        latest_path = self.output_dir / "ckpt_latest.pt"
        
        # Try local first
        if latest_path.exists():
            print(f"Resuming from local ckpt_latest.pt...")
            ckpt = torch.load(latest_path, weights_only=False)
        else:
            # Fall back to numbered checkpoints
            ckpts = sorted(self.output_dir.glob("ckpt_epoch*.pt"))
            if ckpts:
                latest_path = ckpts[-1]
                print(f"Resuming from {latest_path.name}...")
                ckpt = torch.load(latest_path, weights_only=False)
            else:
                # Try downloading from HuggingFace
                try:
                    from huggingface_hub import hf_hub_download
                    print(f"Checking HF for checkpoint...")
                    hf_path = hf_hub_download(
                        repo_id=self.hf_repo,
                        filename="ckpt_latest.pt",
                        local_dir=str(self.output_dir),
                    )
                    print(f"Downloaded checkpoint from HF")
                    ckpt = torch.load(hf_path, weights_only=False)
                except Exception as e:
                    print(f"No checkpoint found (local or HF): {e}")
                    return
        
        self.model.load_state_dict(ckpt['model'])
        self.optimizer.load_state_dict(ckpt['optimizer'])
        self.scheduler.load_state_dict(ckpt['scheduler'])
        self.start_epoch = ckpt['epoch']
        print(f"  Resumed at epoch {self.start_epoch}")
    
    def _load_vae(self):
        """Load VAE for sampling (temporary)."""
        print("Loading VAE for sampling...")
        return SD15VAE(freeze=True).to(self.device)
    
    def _unload_vae(self, vae):
        """Unload VAE after sampling."""
        del vae
        torch.cuda.empty_cache()
    
    def train_epoch(self, epoch: int) -> Dict[str, float]:
        self.model.train()
        total_loss, total_tau, n = 0.0, 0.0, 0
        
        pbar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.cfg.num_epochs}", leave=False)
        for latents, text_seq, text_pooled, labels in pbar:
            latents = latents.to(self.device)
            text_seq = text_seq.to(self.device)
            text_pooled = text_pooled.to(self.device)
            labels = labels.to(self.device)
            
            with torch.amp.autocast('cuda'):
                out = self.model(latents, text_seq, text_pooled, labels)
                loss = out['loss']
            
            self.optimizer.zero_grad()
            self.scaler.scale(loss).backward()
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.scheduler.step()
            
            total_loss += loss.item()
            total_tau += out['tau'].item()
            n += 1
            
            pbar.set_postfix(loss=f"{loss.item():.4f}", τ=f"{out['tau'].item():.2f}")
        
        return {'loss': total_loss / n, 'tau': total_tau / n}
    
    @torch.no_grad()
    def validate(self) -> Dict[str, float]:
        self.model.eval()
        total_loss, n = 0.0, 0
        
        for latents, text_seq, text_pooled, labels in self.val_loader:
            latents = latents.to(self.device)
            text_seq = text_seq.to(self.device)
            text_pooled = text_pooled.to(self.device)
            labels = labels.to(self.device)
            
            with torch.amp.autocast('cuda'):
                out = self.model(latents, text_seq, text_pooled, labels)
            total_loss += out['loss'].item()
            n += 1
        
        return {'val_loss': total_loss / n}
    
    @torch.no_grad()
    def sample_images(self, n_per_class: int = 10) -> Tensor:
        """Generate samples for each class (memory-efficient batched)."""
        self.model.eval()
        torch.cuda.empty_cache()
        
        # Load VAE temporarily
        vae = self._load_vae()
        
        all_samples = []
        batch_size = 10  # Generate 10 images at a time
        
        for class_idx in range(10):
            # Generate n_per_class images for this class
            for batch_start in range(0, n_per_class, batch_size):
                batch_n = min(batch_size, n_per_class - batch_start)
                
                text_seq = self.text_sequence[class_idx:class_idx+1].expand(batch_n, -1, -1)
                text_pooled = self.text_pooled[class_idx:class_idx+1].expand(batch_n, -1)
                
                with torch.amp.autocast('cuda'):
                    samples = self.model.sample(text_seq, text_pooled, vae)
                
                all_samples.append(samples.cpu())
        
        # Unload VAE
        self._unload_vae(vae)
        
        samples = torch.cat(all_samples, dim=0).to(self.device)
        return ((samples + 1) / 2).clamp(0, 1)
    
    def save_checkpoint(self, epoch: int, milestone: bool = False):
        ckpt = {
            'epoch': epoch,
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
        }
        # Always save latest (for resume)
        torch.save(ckpt, self.output_dir / "ckpt_latest.pt")
        # Save milestone checkpoints
        if milestone:
            torch.save(ckpt, self.output_dir / f"ckpt_epoch{epoch:03d}.pt")
    
    def train(self):
        num_geo = len(self.cfg.geometric_types) * 2  # pos/neg
        num_conv = len(self.cfg.conv_types) * 2
        total_towers = (num_geo + num_conv) * 2  # × 2 modalities
        
        print(f"\n{'='*60}")
        print("BEATRIX FLOW - Dual Geometric + Conv Towers (Bottlenecked)")
        print(f"{'='*60}")
        print(f"Device: {self.device}")
        print(f"Geometric towers: {self.cfg.geometric_types} (pos/neg)")
        print(f"Conv towers: {self.cfg.conv_types} (pos/neg)")
        print(f"Tower dim: {self.cfg.tower_dim}")
        print(f"T5 raw → bottleneck: {self.cfg.text_raw_dim} → {self.cfg.bottleneck_dim}")
        print(f"Latent → manifold: {self.cfg.latent_flat_dim} → {self.cfg.manifold_dim}")
        print(f"Total towers: {total_towers}")
        print(f"Batch size: {self.cfg.batch_size}")
        print(f"Epochs: {self.start_epoch}/{self.cfg.num_epochs}")
        print(f"{'='*60}\n")
        
        for epoch in range(self.start_epoch, self.cfg.num_epochs):
            train_metrics = self.train_epoch(epoch)
            val_metrics = self.validate()
            
            lr = self.scheduler.get_last_lr()[0]
            print(f"Epoch {epoch+1:3d} │ loss={train_metrics['loss']:.4f} │ val={val_metrics['val_loss']:.4f} │ τ={train_metrics['tau']:.2f} │ lr={lr:.2e}")
            
            # Sample every epoch to track progress
            samples = self.sample_images(10)
            grid = make_grid(samples, nrow=10, padding=2)
            sample_path = self.output_dir / f"samples_epoch{epoch+1:03d}.png"
            save_image(grid, sample_path)
            print(f"  → Saved samples")
            
            # Checkpoint every epoch (latest), milestone every 10
            self.save_checkpoint(epoch + 1, milestone=((epoch + 1) % 10 == 0))
            
            # Upload to HuggingFace
            metrics = {
                'loss': train_metrics['loss'],
                'val_loss': val_metrics['val_loss'],
                'tau': train_metrics['tau'],
                'lr': lr,
            }
            self._upload_to_hf(epoch + 1, sample_path, metrics)
        
        samples = self.sample_images(10)
        grid = make_grid(samples, nrow=10, padding=2)
        final_path = self.output_dir / "samples_final.png"
        save_image(grid, final_path)
        self.save_checkpoint(self.cfg.num_epochs, milestone=True)
        self._upload_to_hf(self.cfg.num_epochs, final_path)
        print(f"\nTraining complete!")


# =============================================================================
# MAIN
# =============================================================================

def main():
    # Lightweight config - 16 towers instead of 32
    cfg = FlowConfig(
        image_size=256,
        tower_dim=256,
        tower_depth=2,
        num_heads=8,
        geometric_types=('cantor', 'beatrix'),  # 2 types × pos/neg = 4 per modality
        conv_types=('wide_resnet', 'squeeze_excite'),  # 2 types × pos/neg = 4 per modality
        conv_spatial_size=8,
        bottleneck_dim=256,
        manifold_dim=512,  # Smaller manifold
        batch_size=64,
        num_epochs=100,
        cache_dir="./cache",
        output_dir="./beatrix_cifar_t5",
    )
    
    trainer = Trainer(cfg)
    trainer.train()


def main_full():
    """Full 32-tower configuration."""
    cfg = FlowConfig(
        image_size=256,
        tower_dim=256,
        tower_depth=2,
        num_heads=8,
        geometric_types=('cantor', 'beatrix', 'helix', 'simplex'),
        conv_types=('wide_resnet', 'frequency', 'bottleneck', 'squeeze_excite'),
        conv_spatial_size=8,
        bottleneck_dim=256,
        manifold_dim=1024,
        batch_size=64,
        num_epochs=100,
        cache_dir="./cache",
        output_dir="./beatrix_cifar_t5",
    )
    
    trainer = Trainer(cfg)
    trainer.train()


if __name__ == "__main__":
    main()