import os
import pickle
import sys

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Build script: reads the prompt/code dataset from CSV, fits a TF-IDF
# vectorizer on the prompts, and pickles everything needed for later lookup
# into a single file.

# 1. Load the dataset.
# The CSV must provide a 'prompt' column (the text that gets indexed) and a
# 'code' column (the answer stored alongside each prompt); both are read below.
print("👻 Loading Rosetta Stone Dataset...")
try:
    df = pd.read_csv("rosetta_code_dataset.csv")
except FileNotFoundError:
    # Only a genuinely missing file is reported this way; any other failure
    # (malformed CSV, permissions, Ctrl-C) propagates with its own traceback
    # instead of being mislabelled as "not found".
    print("Error: Could not find rosetta_code_dataset.csv")
    sys.exit(1)
print(f" -> Loaded {len(df)} examples.")

# Fail early with a readable message rather than a KeyError further down.
missing_columns = [name for name in ("prompt", "code") if name not in df.columns]
if missing_columns:
    print(
        "Error: rosetta_code_dataset.csv is missing required column(s): "
        + ", ".join(missing_columns)
    )
    sys.exit(1)

# An empty dataset cannot be vectorized (there is no vocabulary to learn).
if df.empty:
    print("Error: rosetta_code_dataset.csv contains no examples")
    sys.exit(1)

# 2. Train the Brain (TF-IDF Vectorizer)
# This converts English text ("fibonacci in java") into numeric vectors.
print("🧠Training the Ghost Engine...")
vectorizer = TfidfVectorizer()
# astype('U') coerces every prompt to a unicode string so the vectorizer never
# receives a non-string cell (a missing value becomes the literal text "nan").
tfidf_matrix = vectorizer.fit_transform(df['prompt'].values.astype('U'))

# 3. Save the Brain file
# The pickle holds a 3-tuple: the fitted vectorizer (translator), the TF-IDF
# matrix of all prompts (memory), and the array of code snippets (answers).
output_file = "ghost_brain.pkl"
with open(output_file, "wb") as f:
    pickle.dump((vectorizer, tfidf_matrix, df['code'].values), f)

print(f"✅ SUCCESS! Brain saved as '{output_file}'")
print(f" Size: {os.path.getsize(output_file) / 1024:.2f} KB (Tiny!)")
print(" Copy this file + ghost_coder.py to your USB stick.")