#!/usr/bin/env python3
"""
Example usage script for Apollo Astralis 2
Demonstrates loading and inference with the model
"""

import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, Mistral3ForConditionalGeneration
from peft import PeftModel


def load_apollo_astralis_v2(model_path="vanta-research/apollo-astralis-2"):
    """
    Load Apollo Astralis 2 model with 4-bit quantization.

    Args:
        model_path: Path to the model (HuggingFace repo or local path)

    Returns:
        model, tokenizer: Loaded model and tokenizer
    """
    print("Loading Apollo Astralis 2...")

    # Configure 4-bit quantization for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Load base model with quantization
    base_model = Mistral3ForConditionalGeneration.from_pretrained(
        "Ministral-3-8B-Reasoning-2512",
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )

    # Load LoRA adapter
    model = PeftModel.from_pretrained(base_model, model_path)
    model.eval()

    print("Model loaded successfully!")
    return model, tokenizer


def generate_response(model, tokenizer, prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """
    Generate a response from Apollo Astralis 2.

    Args:
        model: The loaded model
        tokenizer: The loaded tokenizer
        prompt: User prompt/question
        max_new_tokens: Maximum tokens to generate
        temperature: Sampling temperature (0.0 = deterministic, 1.0 = random)
        top_p: Nucleus sampling parameter

    Returns:
        str: Generated response
    """
    # Format prompt with chat template
    messages = [{"role": "user", "content": prompt}]
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            top_p=top_p if temperature > 0 else None,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode response (excluding the input prompt)
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )

    return response


def main():
    """
    Example usage demonstrating various capabilities of Apollo Astralis 2
    """
    # Load model
    model, tokenizer = load_apollo_astralis_v2()

    # Example 1: Logical reasoning
    print("\n" + "=" * 80)
    print("EXAMPLE 1: Logical Reasoning")
    print("=" * 80)
    prompt1 = "Analyze this argument: If it rains, the streets get wet. The streets are wet. Therefore, it must have rained. Is this reasoning valid?"
    print(f"\nPrompt: {prompt1}")
    print(f"\nResponse:\n{generate_response(model, tokenizer, prompt1)}")

    # Example 2: Mathematical problem solving
    print("\n" + "=" * 80)
    print("EXAMPLE 2: Mathematical Problem Solving")
    print("=" * 80)
    prompt2 = """
    A train travels at 60 mph for 2 hours, then 80 mph for 3 hours.
    What is the average speed for the entire journey?
    """
    print(f"\nPrompt: {prompt2.strip()}")
    print(f"\nResponse:\n{generate_response(model, tokenizer, prompt2)}")

    # Example 3: Commonsense reasoning
    print("\n" + "=" * 80)
    print("EXAMPLE 3: Commonsense Reasoning")
    print("=" * 80)
    prompt3 = """
    You need to keep food cold but your refrigerator is broken.
    What are some practical solutions?
    """
    print(f"\nPrompt: {prompt3.strip()}")
    print(f"\nResponse:\n{generate_response(model, tokenizer, prompt3)}")

    # Example 4: Physical commonsense
    print("\n" + "=" * 80)
    print("EXAMPLE 4: Physical Commonsense")
    print("=" * 80)
    prompt4 = """
    You have a jar with a tight lid that won't open.
    What are effective ways to open it?
    """
    print(f"\nPrompt: {prompt4.strip()}")
    print(f"\nResponse:\n{generate_response(model, tokenizer, prompt4)}")

    print("\n" + "=" * 80)
    print("Examples completed!")
    print("=" * 80)


if __name__ == "__main__":
    main()