ShiroOnigami23
/

Ghost-Codex-Offline

code-generation

Model card Files Files and versions

Ghost-Codex-Offline / train.py

ShiroOnigami23's picture

Upload train.py with huggingface_hub

d533652 verified 5 days ago

history blame contribute delete

1.06 kB

	import os
	import pickle

	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Training script: fits a TF-IDF vectorizer on the dataset's 'prompt' column
	# and pickles (vectorizer, prompt matrix, code answers) into a single file.

	# 1. Load your 5000 samples
	dataset_file = "rosetta_code_dataset.csv"
	print("👻 Loading Rosetta Stone Dataset...")
	try:
	    # Only the read itself is guarded so unrelated failures are not
	    # misreported as a missing dataset.
	    df = pd.read_csv(dataset_file)
	except FileNotFoundError:
	    print(f"Error: Could not find {dataset_file}")
	    # Non-zero status so shells/CI can see the failure; SystemExit is used
	    # instead of exit(), which is only injected by the `site` module.
	    raise SystemExit(1)
	except Exception as err:
	    # Top-level boundary: unreadable or malformed CSV. Report the real cause
	    # instead of claiming the file is missing.
	    print(f"Error: Could not read {dataset_file}: {err}")
	    raise SystemExit(1)
	print(f" -> Loaded {len(df)} examples.")

	# Both columns are required below; fail early with a clear message rather
	# than a KeyError traceback halfway through training.
	missing_columns = [col for col in ("prompt", "code") if col not in df.columns]
	if missing_columns:
	    print(f"Error: {dataset_file} is missing column(s): {', '.join(missing_columns)}")
	    raise SystemExit(1)

	# 2. Train the Brain (TF-IDF Vectorizer)
	# This converts English text ("fibonacci in java") into Math Numbers
	print("🧠 Training the Ghost Engine...")
	vectorizer = TfidfVectorizer()
	# astype('U') forces every prompt to a unicode string before vectorizing.
	tfidf_matrix = vectorizer.fit_transform(df['prompt'].values.astype('U'))

	# 3. Save the Brain file
	# We save the Vectorizer (translator), Matrix (memory), and Code (answers)
	# NOTE: unpickling executes code from the file; only load a ghost_brain.pkl
	# that you produced yourself or otherwise trust.
	output_file = "ghost_brain.pkl"
	with open(output_file, "wb") as f:
	    pickle.dump((vectorizer, tfidf_matrix, df['code'].values), f)

	print(f"✅ SUCCESS! Brain saved as '{output_file}'")
	# Size is read after the `with` block so the file is flushed and closed.
	print(f" Size: {os.path.getsize(output_file) / 1024:.2f} KB (Tiny!)")
	print(" Copy this file + ghost_coder.py to your USB stick.")