Spaces:

Jcalemcg
/

zephyr-cybersecurity-trainer

Paused

App Files Files Community

Jcalemcg committed 11 days ago

Commit

d7872a8

verified ·

1 Parent(s): e737baa

Upload train.py with huggingface_hub

Browse files

Files changed (1) hide show

train.py +213 -0

train.py ADDED Viewed

	@@ -0,0 +1,213 @@

+#!/usr/bin/env python3
+"""
+Fine-tune Zephyr 7B on CyberSecurity Dataset Collection
+Runs on Hugging Face Spaces infrastructure
+"""
+import os
+import torch
+from datasets import load_dataset, concatenate_datasets
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling
+)
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+# Configuration
+# Hub id of the base checkpoint that setup_model() loads and fine-tunes.
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# Hub repository id that main() pushes the trained model and tokenizer to.
+OUTPUT_MODEL_NAME = "Jcalemcg/zephyr-7b-cybersecurity-finetuned"
+# CyberSecurity datasets from thelordofweb collection
+# Each entry is a Hub dataset id. load_datasets() tries them in order and
+# skips (with a printed message) any whose loading or formatting raises.
+CYBERSECURITY_DATASETS = [
+    "AlicanKiraz0/All-CVE-Records-Training-Dataset",
+    "AlicanKiraz0/Cybersecurity-Dataset-v1",
+    "Bouquets/Cybersecurity-LLM-CVE",
+    "CyberNative/CyberSecurityEval",
+    "Mohabahmed03/Alpaca_Dataset_CyberSecurity_Smaller",
+    "CyberNative/github_cybersecurity_READMEs",
+    "AlicanKiraz0/Cybersecurity-Dataset-Heimdall-v1.1",
+    "jcordon5/cybersecurity-rules",
+    "Bouquets/DeepSeek-V3-Distill-Cybersecurity-en",
+    "Seerene/cybersecurity_dataset",
+    "ahmedds10/finetuning_alpaca_Cybersecurity",
+    "Tiamz/cybersecurity-instruction-dataset",
+    "OhWayTee/Cybersecurity-News_3",
+    "Trendyol/All-CVE-Chat-MultiTurn-1999-2025-Dataset",
+    "Vanessasml/cyber-reports-news-analysis-llama2-3k",
+    "Vanessasml/cybersecurity_32k_instruction_input_output",
+    "Vanessasml/enisa_cyber_news_dataset",
+    "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset"
+]
+def format_instruction(example):
+    """Format examples into Zephyr chat format"""
+    if "instruction" in example and "output" in example:
+        prompt = f"<|user|>\n{example['instruction']}"
+        if example.get("input", "").strip():
+            prompt += f"\n{example['input']}"
+        prompt += f"</s>\n<|assistant|>\n{example['output']}</s>"
+        return {"text": prompt}
+    elif "question" in example and "answer" in example:
+        return {"text": f"<|user|>\n{example['question']}</s>\n<|assistant|>\n{example['answer']}</s>"}
+    elif "prompt" in example and "completion" in example:
+        return {"text": f"<|user|>\n{example['prompt']}</s>\n<|assistant|>\n{example['completion']}</s>"}
+    elif "text" in example:
+        return {"text": example["text"]}
+    elif "messages" in example:
+        formatted_text = ""
+        for msg in example["messages"]:
+            role = msg.get("role", "")
+            content = msg.get("content", "")
+            if role == "user":
+                formatted_text += f"<|user|>\n{content}</s>\n"
+            elif role == "assistant":
+                formatted_text += f"<|assistant|>\n{content}</s>\n"
+        return {"text": formatted_text}
+    return {"text": str(example)}
+def load_datasets():
+    """Load and prepare cybersecurity datasets"""
+    print("=" * 70)
+    print("LOADING CYBERSECURITY DATASETS")
+    print("=" * 70)
+    all_datasets = []
+    for dataset_name in CYBERSECURITY_DATASETS:
+        try:
+            print(f"\nLoading: {dataset_name}")
+            dataset = load_dataset(dataset_name, split="train", trust_remote_code=True)
+            formatted = dataset.map(
+                format_instruction,
+                remove_columns=dataset.column_names,
+                desc="Formatting"
+            )
+            if len(formatted) > 10000:
+                formatted = formatted.shuffle(seed=42).select(range(10000))
+            all_datasets.append(formatted)
+            print(f"✓ {len(formatted)} examples loaded")
+        except Exception as e:
+            print(f"✗ Failed: {e}")
+    combined = concatenate_datasets(all_datasets)
+    print(f"\n{'='*70}")
+    print(f"TOTAL DATASET SIZE: {len(combined):,} examples")
+    print(f"{'='*70}\n")
+    combined = combined.shuffle(seed=42)
+    return combined.train_test_split(test_size=0.05, seed=42)
+def setup_model():
+    """Setup model with QLoRA"""
+    print("Setting up Zephyr 7B with QLoRA...")
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    model = prepare_model_for_kbit_training(model)
+    lora_config = LoraConfig(
+        r=16,
+        lora_alpha=32,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        lora_dropout=0.05,
+        bias="none",
+        task_type="CAUSAL_LM"
+    )
+    model = get_peft_model(model, lora_config)
+    model.print_trainable_parameters()
+    return model, tokenizer
+def main():
+    print("\n" + "=" * 70)
+    print("ZEPHYR 7B CYBERSECURITY FINE-TUNING")
+    print("=" * 70 + "\n")
+    # Load data
+    datasets = load_datasets()
+    train_data = datasets["train"]
+    eval_data = datasets["test"]
+    # Setup model
+    model, tokenizer = setup_model()
+    # Tokenize
+    print("\nTokenizing datasets...")
+    def tokenize(examples):
+        return tokenizer(examples["text"], truncation=True, max_length=2048, padding="max_length")
+    train_data = train_data.map(tokenize, batched=True, remove_columns=train_data.column_names)
+    eval_data = eval_data.map(tokenize, batched=True, remove_columns=eval_data.column_names)
+    # Training config
+    training_args = TrainingArguments(
+        output_dir="./output",
+        num_train_epochs=3,
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        gradient_accumulation_steps=4,
+        learning_rate=2e-4,
+        fp16=True,
+        save_strategy="steps",
+        save_steps=500,
+        eval_strategy="steps",
+        eval_steps=500,
+        logging_steps=50,
+        warmup_steps=100,
+        lr_scheduler_type="cosine",
+        optim="paged_adamw_8bit",
+        save_total_limit=3,
+        load_best_model_at_end=True,
+        push_to_hub=True,
+        hub_model_id=OUTPUT_MODEL_NAME,
+        hub_strategy="every_save",
+        report_to="tensorboard",
+    )
+    # Train
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_data,
+        eval_dataset=eval_data,
+        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
+    )
+    print("\n" + "=" * 70)
+    print("STARTING TRAINING")
+    print("=" * 70 + "\n")
+    trainer.train()
+    print("\nSaving model...")
+    trainer.save_model()
+    model.push_to_hub(OUTPUT_MODEL_NAME)
+    tokenizer.push_to_hub(OUTPUT_MODEL_NAME)
+    print("\n" + "=" * 70)
+    print("✓ TRAINING COMPLETE")
+    print(f"✓ Model: {OUTPUT_MODEL_NAME}")
+    print("=" * 70)
+# Start the training pipeline only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()