| import gradio as gr |
| import pandas as pd |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from datasets import load_dataset |
| import json |
| from datetime import datetime |
| import re |
|
|
| |
| DATASETS_METADATA = [ |
| {"name": "ethanolivertroy/nist-cybersecurity-training", "downloads": 8000, "likes": 48, "size": "100K-1M", "language": "en", "tags": ["cybersecurity", "nist", "compliance", "security-controls", "zero-trust", "privacy"], "category": "compliance", "description": "NIST Cybersecurity Training Dataset v1.1 - The largest open-source NIST cybersecurity training dataset for fine-tuning LLMs"}, |
| {"name": "clydeiii/cybersecurity", "downloads": 4000, "likes": 6, "size": "100K-1M", "language": "unknown", "tags": ["APT", "threat-intelligence"], "category": "offensive", "description": "APT notes dataset from GitHub"}, |
| {"name": "vinitvek/cybersecurityattacks", "downloads": 2300, "likes": 5, "size": "10K-100K", "language": "en", "tags": ["attacks", "security"], "category": "offensive", "description": "Cybersecurity attacks dataset"}, |
| {"name": "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 786, "likes": 78, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning", "threat-intelligence", "incident-response", "security-operations"], "category": "defensive", "description": "53,202 meticulously curated system/user/assistant instruction-tuning examples covering defensive security"}, |
| {"name": "AlicanKiraz0/Cybersecurity-Dataset-Fenrir-v2.0", "downloads": 353, "likes": 10, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning"], "category": "defensive", "description": "83,920 high-quality system/user/assistant triples for defensive cybersecurity"}, |
| {"name": "AlicanKiraz0/Cybersecurity-Dataset-Heimdall-v1.1", "downloads": 192, "likes": 13, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning"], "category": "defensive", "description": "21,258 high-quality system/user/assistant triples for training alignment-safe, defensive-cybersecurity LLMs"}, |
| {"name": "Chemically-motivated/CyberSecurityDataset", "downloads": 180, "likes": 3, "size": "<1K", "language": "en", "tags": ["cybersecurity", "machine learning", "pentesting", "exploits"], "category": "offensive", "description": "Curated data points related to penetration testing, known exploits, and vulnerabilities"}, |
| {"name": "ChaoticNeutrals/Cybersecurity-ShareGPT", "downloads": 175, "likes": 15, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "ShareGPT"], "category": "ai", "description": "Converted, deslopped, min-hash deduplicated cybersecurity ShareGPT dataset"}, |
| {"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_Smaller_2.0", "downloads": 145, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["Cyber", "Security", "Cybersecurity", "LLM", "Pentest", "RedTeam", "BlueTeam"], "category": "ai", "description": "Alpaca format cybersecurity dataset"}, |
| {"name": "Bouquets/DeepSeek-V3-Distill-Cybersecurity-en", "downloads": 140, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["cybersecurity", "penetration-testing", "distilled"], "category": "offensive", "description": "High-quality distilled dataset specialized in cybersecurity penetration testing domain"}, |
| {"name": "Druva-S-Kumar/cybersecurity-qa-dataset", "downloads": 123, "likes": 0, "size": "<1K", "language": "en", "tags": ["qa", "cybersecurity"], "category": "ai", "description": "Cybersecurity Q&A dataset"}, |
| {"name": "Rowden/CybersecurityQAA", "downloads": 119, "likes": 4, "size": "1K-10K", "language": "en", "tags": ["cybersecurity", "QAA"], "category": "ai", "description": "Cybersecurity Question-Answer-Assertion (QAA) Dataset designed to evaluate LLM capabilities"}, |
| {"name": "luckwa/cybersecurity-dataset", "downloads": 119, "likes": 1, "size": "1K-10K", "language": "en", "tags": ["cybersecurity"], "category": "defensive", "description": "General cybersecurity dataset"}, |
| {"name": "Vanessasml/cybersecurity_32k_instruction_input_output", "downloads": 114, "likes": 17, "size": "10K-100K", "language": "en", "tags": ["NIST", "ITC EBA", "threat-identification"], "category": "compliance", "description": "Q&As focused on identification of cyber threats, and text classification under NIST taxonomy"}, |
| {"name": "AlicanKiraz0/Cybersecurity-Dataset-v1", "downloads": 98, "likes": 12, "size": "1K-10K", "language": "en", "tags": ["cybersecurity"], "category": "defensive", "description": "2,500 high-quality instruction-response pairs focused on defensive cybersecurity education"}, |
| {"name": "mariiazhiv/cybersecurity_qa", "downloads": 97, "likes": 1, "size": "<1K", "language": "en", "tags": ["question-answering", "cybersecurity"], "category": "ai", "description": "Instruction-response pairs focused on cybersecurity concepts"}, |
| {"name": "CyberNative/CyberSecurityEval", "downloads": 84, "likes": 19, "size": "<1K", "language": "en", "tags": ["cybersecurity", "infosec", "IT", "evaluation"], "category": "ai", "description": "CyberNative AI for CyberSecurity Q/A Evaluation - NOT FOR TRAINING"}, |
| {"name": "whybe-choi/kovidore-v2-cybersecurity-beir", "downloads": 80, "likes": 1, "size": "1K-10K", "language": "ko", "tags": ["Visual Retrieving", "Industrial RAG"], "category": "defensive", "description": "Corpus of technical reports on cyber threat trends and security incident responses in Korea"}, |
| {"name": "Canstralian/Purple-Team-Cybersecurity-Dataset", "downloads": 73, "likes": 9, "size": "10K-100K", "language": "en", "tags": ["purple-team", "code"], "category": "defensive", "description": "Synthetic collection designed to simulate collaborative cybersecurity exercises"}, |
| {"name": "Bouquets/Cybersecurity-LLM-CVE", "downloads": 46, "likes": 15, "size": "100K-1M", "language": "en", "tags": ["CVE", "vulnerabilities"], "category": "defensive", "description": "CVE vulnerability database for cybersecurity"}, |
| {"name": "theResearchNinja/benchmarkResults_violentUTF_cybersecurityBehavior", "downloads": 37, "likes": 1, "size": "100K-1M", "language": "en", "tags": ["benchmark", "results"], "category": "ai", "description": "Interdependent cybersecurity benchmark results"}, |
| {"name": "schooly/Cyber-Security-Breaches", "downloads": 36, "likes": 11, "size": "1K-10K", "language": "en", "tags": ["breaches", "incidents"], "category": "offensive", "description": "Cyber security breaches dataset"}, |
| {"name": "jcordon5/cybersecurity-rules", "downloads": 36, "likes": 9, "size": "<1K", "language": "en", "tags": ["SIGMA", "YARA", "Suricata", "detection-rules"], "category": "defensive", "description": "950 detection rules from official SIGMA, YARA, and Suricata repositories"}, |
| {"name": "Tiamz/cybersecurity-instruction-dataset", "downloads": 33, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["instruction", "cybersecurity"], "category": "ai", "description": "Cybersecurity instruction dataset"}, |
| {"name": "zeroshot/cybersecurity-corpus", "downloads": 29, "likes": 9, "size": "1K-10K", "language": "en", "tags": ["corpus"], "category": "ai", "description": "Cybersecurity corpus for training"}, |
| {"name": "mteb/kovidore-v2-cybersecurity-mteb", "downloads": 29, "likes": 0, "size": "1K-10K", "language": "ko", "tags": ["MTEB", "retrieval"], "category": "ai", "description": "MTEB cybersecurity retrieval dataset in Korean"}, |
| {"name": "electricsheepafrica/nigerian-telecom-cybersecurity-incident-logs", "downloads": 27, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["telecom", "cybersecurity", "incident", "logs"], "category": "defensive", "description": "Security events including intrusions, DDoS attacks, and malware on telecom infrastructure"}, |
| {"name": "CyberNative/github_cybersecurity_READMEs", "downloads": 26, "likes": 14, "size": "1K-10K", "language": "en", "tags": ["github", "README"], "category": "ai", "description": "GitHub cybersecurity README files"}, |
| {"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_2.0", "downloads": 26, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["Cyber", "Security", "Pentest", "Cybersecurity", "LLM", "BlueTeam"], "category": "ai", "description": "Alpaca format cybersecurity dataset v2.0"}, |
| {"name": "hcnote/Cybersecurity-Dataset", "downloads": 26, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["code", "question-answering"], "category": "ai", "description": "High-quality cybersecurity dataset"}, |
| {"name": "Zeo6/CyberSecurity-FineTune", "downloads": 25, "likes": 0, "size": "unknown", "language": "en", "tags": ["finetune"], "category": "ai", "description": "Cybersecurity fine-tuning dataset"}, |
| {"name": "ystemsrx/Cybersecurity-ShareGPT-Chinese", "downloads": 24, "likes": 21, "size": "10K-100K", "language": "zh", "tags": ["code", "Chinese"], "category": "ai", "description": "Chinese cybersecurity dataset in ShareGPT format"}, |
| {"name": "whybe-choi/kovidore-v2-cybersecurity-mteb", "downloads": 24, "likes": 0, "size": "1K-10K", "language": "ko", "tags": ["MTEB", "retrieval"], "category": "ai", "description": "MTEB cybersecurity retrieval dataset"}, |
| {"name": "princemaxp/cybersecurity-keywords", "downloads": 22, "likes": 1, "size": "<1K", "language": "en", "tags": ["cybersecurity", "keywords"], "category": "ai", "description": "Common cybersecurity keywords list"}, |
| {"name": "madox81/cybersecurity_attack_conversational_dataset", "downloads": 20, "likes": 0, "size": "unknown", "language": "en", "tags": ["conversational", "attacks"], "category": "offensive", "description": "Conversational cybersecurity attack dataset"}, |
| {"name": "safouene99999/Cybersecurity_QA", "downloads": 19, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["QA"], "category": "ai", "description": "Cybersecurity Q&A dataset"}, |
| {"name": "hcnote/High-quality-cybersecurity-datasets", "downloads": 19, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["high-quality"], "category": "ai", "description": "277,707 high-quality cybersecurity records with AI annotation"}, |
| {"name": "theResearchNinja/violentutf_cybersecurityBehavior", "downloads": 18, "likes": 3, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "cognitive behavioral psychology", "benchmark"], "category": "ai", "description": "LLM cybersecurity behavior benchmark dataset"}, |
| {"name": "GotThatData/nist-cybersecurity-framework", "downloads": 18, "likes": 7, "size": "1K-10K", "language": "en", "tags": ["NIST", "Cybersecurity", "Framework"], "category": "compliance", "description": "NIST Cybersecurity Publications Dataset"}, |
| {"name": "Mohabahmed03/Alpaca_Dataset_General_CyberSecurity", "downloads": 18, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["General", "Alpaca", "CyberSecurity"], "category": "ai", "description": "General Alpaca format cybersecurity dataset"}, |
| {"name": "vnovaai19/CYBERSECURITY_JSONL_V1", "downloads": 18, "likes": 0, "size": "<1K", "language": "en", "tags": ["cybersecurity", "synthetic-data", "safety", "phishing", "fraud-detection"], "category": "defensive", "description": "100 synthetic cybersecurity threat scenarios with educational AI responses"}, |
| {"name": "Mattimax/Cybersecurity-ShareGPT-Italian", "downloads": 18, "likes": 0, "size": "1K-10K", "language": "it", "tags": ["Italian", "ShareGPT"], "category": "ai", "description": "Italian cybersecurity ShareGPT dataset"}, |
| {"name": "olgazigbeehub/cybersecurity-news-dataset-english-3000", "downloads": 18, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["news", "cybersecurity", "media-analysis"], "category": "defensive", "description": "3,000 English-language cybersecurity news metadata rows"}, |
| {"name": "hcnote/Cybersecurity-High-Quality-Dataset", "downloads": 17, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["high-quality"], "category": "ai", "description": "270,271 high-quality Chinese-English Q&A cybersecurity dataset"}, |
| {"name": "ScoutieAutoML/cybersecurity_news_telegram_dataset", "downloads": 16, "likes": 2, "size": "10K-100K", "language": "ru", "tags": ["russia", "cybersecurity", "media", "news"], "category": "defensive", "description": "Russian-language Telegram news channels on cybersecurity"}, |
| {"name": "savaniDhruv/Cybersecurity_Attack_Dataset", "downloads": 16, "likes": 2, "size": "10K-100K", "language": "en", "tags": ["attacks"], "category": "offensive", "description": "Cybersecurity attack dataset"}, |
| {"name": "pyToshka/cyber-security-events", "downloads": 16, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "honeypot", "threat-intelligence"], "category": "defensive", "description": "Cybersecurity events collected from honeypot infrastructure"}, |
| {"name": "ahmadkaab/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 16, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning"], "category": "defensive", "description": "53,202 defensive security instruction-tuning examples"}, |
| {"name": "MCP-1st-Birthday/smoltrace-cybersecurity-tasks", "downloads": 15, "likes": 0, "size": "<1K", "language": "en", "tags": ["smoltrace", "synthetic-data", "agent-evaluation"], "category": "ai", "description": "SMOLTRACE synthetic dataset for agent evaluation"}, |
| {"name": "ErebusTN/The-Ultimate-CyberSecurity-Dataset-Collection", "downloads": 14, "likes": 1, "size": "unknown", "language": "en", "tags": ["collection"], "category": "ai", "description": "Ultimate cybersecurity dataset collection"}, |
| {"name": "NewsDataHub/cybersecurity-news-dataset-english-3000", "downloads": 14, "likes": 1, "size": "1K-10K", "language": "en", "tags": ["news", "cybersecurity"], "category": "defensive", "description": "3,000 English cybersecurity news metadata rows"}, |
| {"name": "AYI-NEDJIMI/ai-cybersecurity-en", "downloads": 14, "likes": 0, "size": "<1K", "language": "en", "tags": ["artificial-intelligence", "cybersecurity", "offensive-ai", "defensive-ai", "deepfake"], "category": "ai", "description": "AI in Offensive and Defensive Cybersecurity - English Dataset"}, |
| {"name": "AR2021/cybersecurity-corpus-llama2-1k", "downloads": 13, "likes": 1, "size": "<1K", "language": "en", "tags": ["llama2"], "category": "ai", "description": "Cybersecurity corpus for Llama2"}, |
| {"name": "boapro/Purple-Team-Cybersecurity-Dataset", "downloads": 13, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["code", "purple-team"], "category": "defensive", "description": "Synthetic purple team cybersecurity exercises"}, |
| {"name": "tuandunghcmut/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 13, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "security", "cyber-defense", "conversational"], "category": "defensive", "description": "GPT format conversational cybersecurity dataset"}, |
| {"name": "AYI-NEDJIMI/ai-cybersecurity-fr", "downloads": 13, "likes": 0, "size": "<1K", "language": "fr", "tags": ["artificial-intelligence", "cybersecurity", "offensive-ai", "defensive-ai"], "category": "ai", "description": "AI in Offensive and Defensive Cybersecurity - French Dataset"}, |
| {"name": "pki/autonlp-data-cybersecurity", "downloads": 12, "likes": 0, "size": "unknown", "language": "en", "tags": ["autonlp"], "category": "ai", "description": "AutoNLP cybersecurity data"}, |
| {"name": "Hadihilman/cybersecurity-dataset", "downloads": 12, "likes": 0, "size": "<1K", "language": "en", "tags": ["images"], "category": "defensive", "description": "Cybersecurity image dataset"}, |
| {"name": "AnodeAI/Elite_quality_cybersecurity", "downloads": 12, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["legal", "finance"], "category": "ai", "description": "Elite quality cybersecurity dataset"}, |
| {"name": "hcnote/Cybersecurity-bigDataset", "downloads": 12, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["large-scale"], "category": "ai", "description": "Global first open-source mega-scale cybersecurity dataset"}, |
| {"name": "bnsapa/cybersecurity-ner", "downloads": 11, "likes": 2, "size": "1K-10K", "language": "en", "tags": ["token-classification", "NER"], "category": "ai", "description": "Cybersecurity named entity recognition dataset"}, |
| {"name": "baig31/Cybersecurity_penetration_testing_books", "downloads": 11, "likes": 12, "size": "unknown", "language": "en", "tags": ["books", "penetration-testing"], "category": "offensive", "description": "Cybersecurity penetration testing books"}, |
| {"name": "beldua/english-cybersecurity-basics-30", "downloads": 11, "likes": 0, "size": "<1K", "language": "en", "tags": ["basics"], "category": "ai", "description": "English cybersecurity basics"}, |
| {"name": "ahmedds10/finetuning_cybersecurity", "downloads": 10, "likes": 0, "size": "<1K", "language": "en", "tags": ["finetuning"], "category": "ai", "description": "Cybersecurity fine-tuning dataset"}, |
| {"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_Smaller", "downloads": 10, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["CyberSecurity", "Finetune"], "category": "ai", "description": "Smaller Alpaca cybersecurity dataset"}, |
| {"name": "ChavyvAkvar/Trendyol-Cybersecurity-Instruction-Tuning-Dataset-Converted", "downloads": 10, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["converted"], "category": "defensive", "description": "Converted Trendyol cybersecurity dataset"}, |
| {"name": "tandevllc/cybersecurity-atom-rss-feeds-2025", "downloads": 10, "likes": 1, "size": "unknown", "language": "en", "tags": ["news", "rss", "feeds"], "category": "defensive", "description": "Cybersecurity Atom/RSS feeds 2025"}, |
| {"name": "tandevllc/cybersecurity-wiki-slices", "downloads": 10, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["wikipedia", "cybersecurity"], "category": "ai", "description": "Curated collection of English Wikipedia pages covering cybersecurity"}, |
| {"name": "antitheft159/CybersecurityAttacks", "downloads": 9, "likes": 1, "size": "unknown", "language": "en", "tags": ["attacks"], "category": "offensive", "description": "Cybersecurity attacks dataset"}, |
| {"name": "Tiamz/cybersecurity-raw-json-datasets", "downloads": 9, "likes": 0, "size": "unknown", "language": "en", "tags": ["raw", "json"], "category": "ai", "description": "Raw JSON cybersecurity datasets"}, |
| {"name": "burpsuite/Cybersecurity-Dataset-v1", "downloads": 9, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["cybersecurity"], "category": "defensive", "description": "2,500 defensive cybersecurity instruction-response pairs"}, |
| {"name": "Deshaune/Global-Cybersecurity-Threats-2015_2024", "downloads": 8, "likes": 1, "size": "1K-10K", "language": "en", "tags": ["global-threats"], "category": "defensive", "description": "Global cybersecurity threats from 2015-2024"}, |
| {"name": "oceancharcoal/Cybersecurity_attack_dataset", "downloads": 8, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["attacks"], "category": "offensive", "description": "Cybersecurity attack dataset"}, |
| {"name": "pyToshka/cyber-security-events-full", "downloads": 8, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["cybersecurity", "honeypot", "threat-intelligence"], "category": "defensive", "description": "Full cybersecurity events from honeypot infrastructure"}, |
| {"name": "dattaraj/rag_eval_cybersecurity", "downloads": 7, "likes": 0, "size": "<1K", "language": "en", "tags": ["RAG", "evaluation"], "category": "ai", "description": "RAG evaluation for cybersecurity"}, |
| {"name": "lianghsun/tw-cybersecurity", "downloads": 7, "likes": 0, "size": "1K-10K", "language": "zh", "tags": ["Taiwan", "cybersecurity", "ISO-27001"], "category": "compliance", "description": "Taiwan cybersecurity dataset with ISO/IEC 27001"}, |
| {"name": "mariiazhiv/Cybersecurity_messages", "downloads": 7, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["messages"], "category": "ai", "description": "Cybersecurity messages dataset"}, |
| {"name": "MichaelPrimez/cybersecurity-questionaire", "downloads": 6, "likes": 0, "size": "<1K", "language": "en", "tags": ["questionnaire", "synthetic", "distilabel"], "category": "ai", "description": "Cybersecurity questionnaire dataset"}, |
| {"name": "lianghsun/tw-cybersecurity-chat", "downloads": 5, "likes": 0, "size": "1K-10K", "language": "zh", "tags": ["Taiwan", "cybersecurity", "chat"], "category": "ai", "description": "Taiwan cybersecurity chat dataset"}, |
| {"name": "WhoIsShe/CyberSecurity-big", "downloads": 5, "likes": 1, "size": "1M-10M", "language": "en", "tags": ["large-scale"], "category": "ai", "description": "Large-scale cybersecurity dataset"}, |
| ] |
|
|
def create_dataframe():
    """Build a fresh DataFrame of the dataset catalogue, one row per entry.

    Each key of the ``DATASETS_METADATA`` records becomes a column, and a
    ``url`` column pointing at the dataset's HuggingFace page is appended.
    """
    catalogue = pd.DataFrame(DATASETS_METADATA)
    catalogue['url'] = [
        f"https://huggingface.co/datasets/{repo_id}"
        for repo_id in catalogue['name']
    ]
    return catalogue
|
|
def get_dataset_stats():
    """Summarise the catalogue as a dict of headline figures.

    The returned mapping holds, in order: the number of datasets, the summed
    downloads and likes (as strings with thousands separators), and the
    number of distinct languages and categories.
    """
    catalogue = create_dataframe()
    download_total = catalogue['downloads'].sum()
    like_total = catalogue['likes'].sum()
    return {
        "Total Datasets": len(catalogue),
        "Total Downloads": f"{download_total:,}",
        "Total Likes": f"{like_total:,}",
        # dropna=False keeps the count identical to len(series.unique()).
        "Languages": catalogue['language'].nunique(dropna=False),
        "Categories": catalogue['category'].nunique(dropna=False),
    }
|
|
def filter_datasets(keyword, language, category, min_downloads, min_likes):
    """Return the catalogue rows that satisfy every active filter.

    Args:
        keyword: Text searched case-insensitively in each dataset's name,
            description and tags. It is matched as a literal substring, so
            characters such as ``+``, ``(`` or ``.`` carry no special meaning.
            A falsy value disables this filter.
        language: Value the ``language`` column must equal. A falsy value or
            ``"All"`` disables this filter.
        category: Value the ``category`` column must equal. A falsy value or
            ``"All"`` disables this filter.
        min_downloads: Inclusive lower bound on ``downloads``. A falsy value
            (``0`` or ``None``) disables this filter.
        min_likes: Inclusive lower bound on ``likes``. A falsy value disables
            this filter.

    Returns:
        A DataFrame with the rows of ``create_dataframe()`` that pass all
        active filters, in their original order.
    """
    df = create_dataframe()

    if keyword:
        needle = keyword.lower()
        # regex=False: the keyword is free text typed by a user. With the
        # pandas default (regex=True) input like "c++" or "(" raises
        # re.error, and "." or "|" match more than intended. Tags were
        # already matched literally, so this also makes the three fields agree.
        mask = (
            df['name'].str.contains(keyword, case=False, na=False, regex=False)
            | df['description'].str.contains(keyword, case=False, na=False, regex=False)
            | df['tags'].apply(lambda tags: any(needle in tag.lower() for tag in tags))
        )
        df = df[mask]

    if language and language != "All":
        df = df[df['language'] == language]

    if category and category != "All":
        df = df[df['category'] == category]

    if min_downloads:
        df = df[df['downloads'] >= min_downloads]

    if min_likes:
        df = df[df['likes'] >= min_likes]

    return df
|
|
def search_datasets(keyword, language, category, min_downloads, min_likes):
    """Run the filters and shape the matches for the results table.

    Returns a ``(table, message)`` pair: the matching rows restricted to the
    display columns under human-readable headers, and a one-line summary of
    how many datasets matched.
    """
    matches = filter_datasets(keyword, language, category, min_downloads, min_likes)

    # Source column -> header shown in the UI; dict order is the column order.
    headers = {
        'name': 'Dataset Name',
        'downloads': 'Downloads',
        'likes': 'Likes',
        'size': 'Size',
        'language': 'Language',
        'category': 'Category',
        'description': 'Description',
    }
    table = matches[list(headers)].rename(columns=headers)

    return table, f"Found {len(matches)} datasets matching your criteria"
|
|
def get_dataset_details(dataset_name):
    """Describe one catalogue entry for the details tab.

    Args:
        dataset_name: Full ``owner/name`` identifier to look up.

    Returns:
        A ``(markdown, preview, url)`` tuple. ``markdown`` is a formatted
        summary of the entry, ``preview`` is a fixed placeholder DataFrame
        (not real dataset content) and ``url`` is the HuggingFace link.
        When no name is given or the name is unknown, the tuple is
        ``(message, None, None)``.
    """
    if not dataset_name:
        return "Please select a dataset from the list above", None, None

    catalogue = create_dataframe()
    matches = catalogue[catalogue['name'] == dataset_name]
    if matches.empty:
        return "Dataset not found", None, None

    record = matches.iloc[0]

    details = f"""
## {record['name']}

**Description:** {record['description']}

**Statistics:**
- Downloads: {record['downloads']:,}
- Likes: {record['likes']}
- Size: {record['size']}
- Language: {record['language']}
- Category: {record['category']}

**Tags:** {', '.join(record['tags'])}

**HuggingFace URL:** [{record['url']}]({record['url']})

---

*Note: To preview dataset samples, you would need to load the actual dataset using the HuggingFace datasets library.
This demo shows metadata only. For full dataset access, click the URL above.*
"""

    # Placeholder rows only: the same table is returned for every dataset.
    preview_df = pd.DataFrame({
        "Column": ["Feature 1", "Feature 2", "Feature 3"],
        "Type": ["text", "text", "category"],
        "Sample": ["Sample data...", "Sample data...", "Sample category..."],
    })

    return details, preview_df, record['url']
|
|
def create_category_chart():
    """Build a donut chart of how many datasets fall in each category.

    Returns the Plotly figure, with transparent backgrounds and white text.
    """
    per_category = create_dataframe()['category'].value_counts()

    figure = px.pie(
        names=per_category.index,
        values=per_category.values,
        hole=0.3,
        title='Datasets by Category',
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    transparent = 'rgba(0,0,0,0)'
    # update_layout mutates the figure in place and hands it back.
    return figure.update_layout(
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
        font={'color': 'white'},
    )
|
|
def create_language_chart():
    """Build a bar chart of the ten most frequent dataset languages.

    Bars are coloured by their own height on the Viridis scale. Returns the
    Plotly figure, with transparent backgrounds and white text.
    """
    per_language = create_dataframe()['language'].value_counts().head(10)
    counts = per_language.values

    figure = px.bar(
        x=per_language.index,
        y=counts,
        color=counts,
        color_continuous_scale='Viridis',
        title='Top 10 Languages',
        labels={'x': 'Language', 'y': 'Number of Datasets'},
    )
    transparent = 'rgba(0,0,0,0)'
    return figure.update_layout(
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
        font={'color': 'white'},
        showlegend=False,
    )
|
|
def create_downloads_chart():
    """Build a horizontal bar chart of the 15 most-downloaded datasets.

    Each bar is labelled with the part of the dataset name after the last
    ``/``, cut to at most 30 characters. Returns the Plotly figure, 600 px
    tall, with transparent backgrounds and white text.
    """
    ranked = create_dataframe().nlargest(15, 'downloads')[['name', 'downloads']]
    # Drop the owner prefix and cap the length so axis labels stay short.
    ranked = ranked.assign(
        short_name=[repo_id.rpartition('/')[2][:30] for repo_id in ranked['name']]
    )

    figure = px.bar(
        ranked,
        x='downloads',
        y='short_name',
        orientation='h',
        color='downloads',
        color_continuous_scale='Plasma',
        title='Top 15 Datasets by Downloads',
        labels={'downloads': 'Downloads', 'short_name': 'Dataset'},
    )
    transparent = 'rgba(0,0,0,0)'
    return figure.update_layout(
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
        font={'color': 'white'},
        height=600,
        showlegend=False,
    )
|
|
def create_size_distribution_chart():
    """Build a bar chart of how many datasets fall in each size bucket.

    Buckets appear in ``value_counts`` order (most frequent first). Returns
    the Plotly figure, with transparent backgrounds and white text.
    """
    per_size = create_dataframe()['size'].value_counts()
    counts = per_size.values

    figure = px.bar(
        x=per_size.index,
        y=counts,
        color=counts,
        color_continuous_scale='Cividis',
        title='Dataset Size Distribution',
        labels={'x': 'Size Category', 'y': 'Number of Datasets'},
    )
    transparent = 'rgba(0,0,0,0)'
    return figure.update_layout(
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
        font={'color': 'white'},
        showlegend=False,
    )
|
|
def export_to_csv(keyword, language, category, min_downloads, min_likes):
    """Write the filtered catalogue to a CSV file and return its path.

    The arguments are passed unchanged to ``filter_datasets``. Each call
    writes into its own freshly created temporary directory, so concurrent
    requests cannot overwrite each other's export and the code does not
    depend on a ``/tmp`` directory existing. The directory is not removed
    here; cleanup is left to the operating system's temp-file policy.

    Returns:
        Path (``str``) of the written ``cybersecurity_datasets.csv`` file.
    """
    import os
    import tempfile

    df = filter_datasets(keyword, language, category, min_downloads, min_likes)
    # A unique directory with a fixed file name inside it keeps the download
    # name stable while still isolating each request.
    export_dir = tempfile.mkdtemp(prefix="cybersecurity_datasets_")
    output_path = os.path.join(export_dir, "cybersecurity_datasets.csv")
    df.to_csv(output_path, index=False)
    return output_path
|
|
def export_to_json(keyword, language, category, min_downloads, min_likes):
    """Write the filtered catalogue to a JSON file and return its path.

    The arguments are passed unchanged to ``filter_datasets``. The file
    holds a JSON array with one object per dataset, indented by two spaces.
    Each call writes into its own freshly created temporary directory, so
    concurrent requests cannot overwrite each other's export and the code
    does not depend on a ``/tmp`` directory existing. The directory is not
    removed here; cleanup is left to the operating system's temp-file policy.

    Returns:
        Path (``str``) of the written ``cybersecurity_datasets.json`` file.
    """
    import os
    import tempfile

    df = filter_datasets(keyword, language, category, min_downloads, min_likes)
    # A unique directory with a fixed file name inside it keeps the download
    # name stable while still isolating each request.
    export_dir = tempfile.mkdtemp(prefix="cybersecurity_datasets_")
    output_path = os.path.join(export_dir, "cybersecurity_datasets.json")
    df.to_json(output_path, orient='records', indent=2)
    return output_path
|
|
| |
# Build the Gradio interface: a header, a row of headline statistics, four
# tabs (search, details, charts, about) and a closing tip.
# NOTE(review): the nesting below was reconstructed from the statement order;
# confirm that the export rows belong in the right-hand results column.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple").set(
    body_background_fill='*primary_900',
    body_background_fill_dark='*primary_950',
    block_background_fill='*primary_800',
    block_background_fill_dark='*primary_900',
    block_border_color='*primary_600',
    input_background_fill='*primary_700',
    button_primary_background_fill='*primary_600',
    button_primary_background_fill_hover='*primary_500',
)) as demo:

    gr.Markdown("""
    # 🔐 Cybersecurity Dataset Explorer

    Explore and analyze 80+ cybersecurity datasets from HuggingFace

    **Features:**
    - Search by keyword, language, category
    - View detailed statistics and visualizations
    - Export datasets metadata to CSV/JSON
    - Preview dataset information
    - Direct links to HuggingFace repositories
    """)

    # Headline figures, computed once when the UI is built.
    with gr.Row():
        stats = get_dataset_stats()
        for key, value in stats.items():
            gr.Markdown(f"### {key}\n## {value}")

    with gr.Tabs():

        # --- Tab 1: keyword / language / category / popularity search ---
        with gr.Tab("🔍 Search & Filter"):
            with gr.Row():
                with gr.Column(scale=1):
                    keyword_input = gr.Textbox(
                        label="Search Keyword",
                        placeholder="Enter keyword (name, description, or tags)...",
                        lines=1
                    )
                    language_dropdown = gr.Dropdown(
                        label="Language",
                        # Offer exactly the languages present in the catalogue.
                        choices=["All"] + sorted({d['language'] for d in DATASETS_METADATA}),
                        value="All"
                    )
                    category_dropdown = gr.Dropdown(
                        label="Category",
                        choices=["All", "ai", "defensive", "offensive", "compliance"],
                        value="All"
                    )
                    min_downloads_slider = gr.Slider(
                        label="Minimum Downloads",
                        minimum=0,
                        maximum=10000,
                        value=0,
                        step=100
                    )
                    min_likes_slider = gr.Slider(
                        label="Minimum Likes",
                        minimum=0,
                        maximum=100,
                        value=0,
                        step=1
                    )
                    search_btn = gr.Button("🔍 Search Datasets", variant="primary")

                with gr.Column(scale=3):
                    result_text = gr.Textbox(label="Search Results", lines=1)
                    results_table = gr.Dataframe(
                        label="Datasets",
                        wrap=True,
                        interactive=False
                    )

                    with gr.Row():
                        export_csv_btn = gr.Button("📥 Export to CSV")
                        export_json_btn = gr.Button("📥 Export to JSON")

                    with gr.Row():
                        csv_file = gr.File(label="CSV Download")
                        json_file = gr.File(label="JSON Download")

            # Search and both exports read the same five filter controls, so
            # an export always reflects the filters currently on screen.
            search_btn.click(
                fn=search_datasets,
                inputs=[keyword_input, language_dropdown, category_dropdown, min_downloads_slider, min_likes_slider],
                outputs=[results_table, result_text]
            )

            export_csv_btn.click(
                fn=export_to_csv,
                inputs=[keyword_input, language_dropdown, category_dropdown, min_downloads_slider, min_likes_slider],
                outputs=csv_file
            )

            export_json_btn.click(
                fn=export_to_json,
                inputs=[keyword_input, language_dropdown, category_dropdown, min_downloads_slider, min_likes_slider],
                outputs=json_file
            )

        # --- Tab 2: details for a single dataset ---
        with gr.Tab("📋 Dataset Details"):
            dataset_selector = gr.Dropdown(
                label="Select Dataset",
                choices=[d['name'] for d in DATASETS_METADATA],
                value=DATASETS_METADATA[0]['name'] if DATASETS_METADATA else None
            )
            view_details_btn = gr.Button("View Details", variant="primary")

            dataset_details = gr.Markdown(label="Dataset Information")
            preview_table = gr.Dataframe(label="Preview (Mock Data)")
            dataset_link = gr.Textbox(label="HuggingFace URL")

            view_details_btn.click(
                fn=get_dataset_details,
                inputs=dataset_selector,
                outputs=[dataset_details, preview_table, dataset_link]
            )

        # --- Tab 3: catalogue-wide charts ---
        with gr.Tab("📊 Statistics & Charts"):
            gr.Markdown("## Dataset Analytics Dashboard")

            with gr.Row():
                category_chart = gr.Plot(label="Category Distribution")
                language_chart = gr.Plot(label="Language Distribution")

            with gr.Row():
                downloads_chart = gr.Plot(label="Top Downloads")

            with gr.Row():
                size_chart = gr.Plot(label="Size Distribution")

            refresh_charts_btn = gr.Button("🔄 Refresh Charts", variant="primary")

            def refresh_all_charts():
                """Rebuild all four figures, in the order of the plot outputs."""
                return (
                    create_category_chart(),
                    create_language_chart(),
                    create_downloads_chart(),
                    create_size_distribution_chart()
                )

            refresh_charts_btn.click(
                fn=refresh_all_charts,
                outputs=[category_chart, language_chart, downloads_chart, size_chart]
            )

            # Also draw the charts when the page loads, so the tab is not
            # empty before the refresh button is pressed.
            demo.load(
                fn=refresh_all_charts,
                outputs=[category_chart, language_chart, downloads_chart, size_chart]
            )

        # --- Tab 4: static description of the app ---
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About Dataset Explorer

            This application provides a comprehensive interface to explore 80 cybersecurity datasets from HuggingFace.

            ### Features:

            1. **Search & Filter**: Find datasets by keyword, language, category, popularity
            2. **Dataset Details**: View comprehensive information about each dataset
            3. **Statistics**: Visual analytics with interactive charts
            4. **Export**: Download filtered results as CSV or JSON
            5. **Direct Links**: Access to HuggingFace repositories

            ### Categories:

            - **AI**: Datasets for training AI/ML models
            - **Defensive**: Blue team, threat detection, incident response
            - **Offensive**: Red team, penetration testing, exploits
            - **Compliance**: NIST, ISO 27001, regulatory frameworks

            ### Data Sources:

            All datasets are publicly available on HuggingFace Hub. This explorer provides
            metadata and filtering capabilities. To access the actual dataset content,
            click the HuggingFace URL for any dataset.

            ### Technologies:

            - **Gradio**: Interactive web interface
            - **Pandas**: Data manipulation
            - **Plotly**: Interactive visualizations
            - **HuggingFace Datasets**: Dataset metadata

            ---

            **Created by:** AYI-NEDJIMI
            **Version:** 1.0
            **Last Updated:** February 2026
            """)

    gr.Markdown("""
    ---
    💡 **Tip**: Use the search feature to find datasets by specific topics like "NIST", "penetration testing", "threat intelligence", etc.
    """)
|
|
# Start the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()
|
|