# AYI-NEDJIMI
# Initial commit: Dataset Explorer v1.0
# 14b051b
import json
import os
import re
import tempfile
from datetime import datetime

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset
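# Hard-coded snapshot of dataset metadata gathered from a HuggingFace search.
# Each record carries: name ("owner/dataset" id), downloads, likes, size
# (bucket label), language, tags (list of strings), category and description.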
DATASETS_METADATA = [
{"name": "ethanolivertroy/nist-cybersecurity-training", "downloads": 8000, "likes": 48, "size": "100K-1M", "language": "en", "tags": ["cybersecurity", "nist", "compliance", "security-controls", "zero-trust", "privacy"], "category": "compliance", "description": "NIST Cybersecurity Training Dataset v1.1 - The largest open-source NIST cybersecurity training dataset for fine-tuning LLMs"},
{"name": "clydeiii/cybersecurity", "downloads": 4000, "likes": 6, "size": "100K-1M", "language": "unknown", "tags": ["APT", "threat-intelligence"], "category": "offensive", "description": "APT notes dataset from GitHub"},
{"name": "vinitvek/cybersecurityattacks", "downloads": 2300, "likes": 5, "size": "10K-100K", "language": "en", "tags": ["attacks", "security"], "category": "offensive", "description": "Cybersecurity attacks dataset"},
{"name": "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 786, "likes": 78, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning", "threat-intelligence", "incident-response", "security-operations"], "category": "defensive", "description": "53,202 meticulously curated system/user/assistant instruction-tuning examples covering defensive security"},
{"name": "AlicanKiraz0/Cybersecurity-Dataset-Fenrir-v2.0", "downloads": 353, "likes": 10, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning"], "category": "defensive", "description": "83,920 high-quality system/user/assistant triples for defensive cybersecurity"},
{"name": "AlicanKiraz0/Cybersecurity-Dataset-Heimdall-v1.1", "downloads": 192, "likes": 13, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning"], "category": "defensive", "description": "21,258 high-quality system/user/assistant triples for training alignment-safe, defensive-cybersecurity LLMs"},
{"name": "Chemically-motivated/CyberSecurityDataset", "downloads": 180, "likes": 3, "size": "<1K", "language": "en", "tags": ["cybersecurity", "machine learning", "pentesting", "exploits"], "category": "offensive", "description": "Curated data points related to penetration testing, known exploits, and vulnerabilities"},
{"name": "ChaoticNeutrals/Cybersecurity-ShareGPT", "downloads": 175, "likes": 15, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "ShareGPT"], "category": "ai", "description": "Converted, deslopped, min-hash deduplicated cybersecurity ShareGPT dataset"},
{"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_Smaller_2.0", "downloads": 145, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["Cyber", "Security", "Cybersecurity", "LLM", "Pentest", "RedTeam", "BlueTeam"], "category": "ai", "description": "Alpaca format cybersecurity dataset"},
{"name": "Bouquets/DeepSeek-V3-Distill-Cybersecurity-en", "downloads": 140, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["cybersecurity", "penetration-testing", "distilled"], "category": "offensive", "description": "High-quality distilled dataset specialized in cybersecurity penetration testing domain"},
{"name": "Druva-S-Kumar/cybersecurity-qa-dataset", "downloads": 123, "likes": 0, "size": "<1K", "language": "en", "tags": ["qa", "cybersecurity"], "category": "ai", "description": "Cybersecurity Q&A dataset"},
{"name": "Rowden/CybersecurityQAA", "downloads": 119, "likes": 4, "size": "1K-10K", "language": "en", "tags": ["cybersecurity", "QAA"], "category": "ai", "description": "Cybersecurity Question-Answer-Assertion (QAA) Dataset designed to evaluate LLM capabilities"},
{"name": "luckwa/cybersecurity-dataset", "downloads": 119, "likes": 1, "size": "1K-10K", "language": "en", "tags": ["cybersecurity"], "category": "defensive", "description": "General cybersecurity dataset"},
{"name": "Vanessasml/cybersecurity_32k_instruction_input_output", "downloads": 114, "likes": 17, "size": "10K-100K", "language": "en", "tags": ["NIST", "ITC EBA", "threat-identification"], "category": "compliance", "description": "Q&As focused on identification of cyber threats, and text classification under NIST taxonomy"},
{"name": "AlicanKiraz0/Cybersecurity-Dataset-v1", "downloads": 98, "likes": 12, "size": "1K-10K", "language": "en", "tags": ["cybersecurity"], "category": "defensive", "description": "2,500 high-quality instruction-response pairs focused on defensive cybersecurity education"},
{"name": "mariiazhiv/cybersecurity_qa", "downloads": 97, "likes": 1, "size": "<1K", "language": "en", "tags": ["question-answering", "cybersecurity"], "category": "ai", "description": "Instruction-response pairs focused on cybersecurity concepts"},
{"name": "CyberNative/CyberSecurityEval", "downloads": 84, "likes": 19, "size": "<1K", "language": "en", "tags": ["cybersecurity", "infosec", "IT", "evaluation"], "category": "ai", "description": "CyberNative AI for CyberSecurity Q/A Evaluation - NOT FOR TRAINING"},
{"name": "whybe-choi/kovidore-v2-cybersecurity-beir", "downloads": 80, "likes": 1, "size": "1K-10K", "language": "ko", "tags": ["Visual Retrieving", "Industrial RAG"], "category": "defensive", "description": "Corpus of technical reports on cyber threat trends and security incident responses in Korea"},
{"name": "Canstralian/Purple-Team-Cybersecurity-Dataset", "downloads": 73, "likes": 9, "size": "10K-100K", "language": "en", "tags": ["purple-team", "code"], "category": "defensive", "description": "Synthetic collection designed to simulate collaborative cybersecurity exercises"},
{"name": "Bouquets/Cybersecurity-LLM-CVE", "downloads": 46, "likes": 15, "size": "100K-1M", "language": "en", "tags": ["CVE", "vulnerabilities"], "category": "defensive", "description": "CVE vulnerability database for cybersecurity"},
{"name": "theResearchNinja/benchmarkResults_violentUTF_cybersecurityBehavior", "downloads": 37, "likes": 1, "size": "100K-1M", "language": "en", "tags": ["benchmark", "results"], "category": "ai", "description": "Interdependent cybersecurity benchmark results"},
{"name": "schooly/Cyber-Security-Breaches", "downloads": 36, "likes": 11, "size": "1K-10K", "language": "en", "tags": ["breaches", "incidents"], "category": "offensive", "description": "Cyber security breaches dataset"},
{"name": "jcordon5/cybersecurity-rules", "downloads": 36, "likes": 9, "size": "<1K", "language": "en", "tags": ["SIGMA", "YARA", "Suricata", "detection-rules"], "category": "defensive", "description": "950 detection rules from official SIGMA, YARA, and Suricata repositories"},
{"name": "Tiamz/cybersecurity-instruction-dataset", "downloads": 33, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["instruction", "cybersecurity"], "category": "ai", "description": "Cybersecurity instruction dataset"},
{"name": "zeroshot/cybersecurity-corpus", "downloads": 29, "likes": 9, "size": "1K-10K", "language": "en", "tags": ["corpus"], "category": "ai", "description": "Cybersecurity corpus for training"},
{"name": "mteb/kovidore-v2-cybersecurity-mteb", "downloads": 29, "likes": 0, "size": "1K-10K", "language": "ko", "tags": ["MTEB", "retrieval"], "category": "ai", "description": "MTEB cybersecurity retrieval dataset in Korean"},
{"name": "electricsheepafrica/nigerian-telecom-cybersecurity-incident-logs", "downloads": 27, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["telecom", "cybersecurity", "incident", "logs"], "category": "defensive", "description": "Security events including intrusions, DDoS attacks, and malware on telecom infrastructure"},
{"name": "CyberNative/github_cybersecurity_READMEs", "downloads": 26, "likes": 14, "size": "1K-10K", "language": "en", "tags": ["github", "README"], "category": "ai", "description": "GitHub cybersecurity README files"},
{"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_2.0", "downloads": 26, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["Cyber", "Security", "Pentest", "Cybersecurity", "LLM", "BlueTeam"], "category": "ai", "description": "Alpaca format cybersecurity dataset v2.0"},
{"name": "hcnote/Cybersecurity-Dataset", "downloads": 26, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["code", "question-answering"], "category": "ai", "description": "High-quality cybersecurity dataset"},
{"name": "Zeo6/CyberSecurity-FineTune", "downloads": 25, "likes": 0, "size": "unknown", "language": "en", "tags": ["finetune"], "category": "ai", "description": "Cybersecurity fine-tuning dataset"},
{"name": "ystemsrx/Cybersecurity-ShareGPT-Chinese", "downloads": 24, "likes": 21, "size": "10K-100K", "language": "zh", "tags": ["code", "Chinese"], "category": "ai", "description": "Chinese cybersecurity dataset in ShareGPT format"},
{"name": "whybe-choi/kovidore-v2-cybersecurity-mteb", "downloads": 24, "likes": 0, "size": "1K-10K", "language": "ko", "tags": ["MTEB", "retrieval"], "category": "ai", "description": "MTEB cybersecurity retrieval dataset"},
{"name": "princemaxp/cybersecurity-keywords", "downloads": 22, "likes": 1, "size": "<1K", "language": "en", "tags": ["cybersecurity", "keywords"], "category": "ai", "description": "Common cybersecurity keywords list"},
{"name": "madox81/cybersecurity_attack_conversational_dataset", "downloads": 20, "likes": 0, "size": "unknown", "language": "en", "tags": ["conversational", "attacks"], "category": "offensive", "description": "Conversational cybersecurity attack dataset"},
{"name": "safouene99999/Cybersecurity_QA", "downloads": 19, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["QA"], "category": "ai", "description": "Cybersecurity Q&A dataset"},
{"name": "hcnote/High-quality-cybersecurity-datasets", "downloads": 19, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["high-quality"], "category": "ai", "description": "277,707 high-quality cybersecurity records with AI annotation"},
{"name": "theResearchNinja/violentutf_cybersecurityBehavior", "downloads": 18, "likes": 3, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "cognitive behavioral psychology", "benchmark"], "category": "ai", "description": "LLM cybersecurity behavior benchmark dataset"},
{"name": "GotThatData/nist-cybersecurity-framework", "downloads": 18, "likes": 7, "size": "1K-10K", "language": "en", "tags": ["NIST", "Cybersecurity", "Framework"], "category": "compliance", "description": "NIST Cybersecurity Publications Dataset"},
{"name": "Mohabahmed03/Alpaca_Dataset_General_CyberSecurity", "downloads": 18, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["General", "Alpaca", "CyberSecurity"], "category": "ai", "description": "General Alpaca format cybersecurity dataset"},
{"name": "vnovaai19/CYBERSECURITY_JSONL_V1", "downloads": 18, "likes": 0, "size": "<1K", "language": "en", "tags": ["cybersecurity", "synthetic-data", "safety", "phishing", "fraud-detection"], "category": "defensive", "description": "100 synthetic cybersecurity threat scenarios with educational AI responses"},
{"name": "Mattimax/Cybersecurity-ShareGPT-Italian", "downloads": 18, "likes": 0, "size": "1K-10K", "language": "it", "tags": ["Italian", "ShareGPT"], "category": "ai", "description": "Italian cybersecurity ShareGPT dataset"},
{"name": "olgazigbeehub/cybersecurity-news-dataset-english-3000", "downloads": 18, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["news", "cybersecurity", "media-analysis"], "category": "defensive", "description": "3,000 English-language cybersecurity news metadata rows"},
{"name": "hcnote/Cybersecurity-High-Quality-Dataset", "downloads": 17, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["high-quality"], "category": "ai", "description": "270,271 high-quality Chinese-English Q&A cybersecurity dataset"},
{"name": "ScoutieAutoML/cybersecurity_news_telegram_dataset", "downloads": 16, "likes": 2, "size": "10K-100K", "language": "ru", "tags": ["russia", "cybersecurity", "media", "news"], "category": "defensive", "description": "Russian-language Telegram news channels on cybersecurity"},
{"name": "savaniDhruv/Cybersecurity_Attack_Dataset", "downloads": 16, "likes": 2, "size": "10K-100K", "language": "en", "tags": ["attacks"], "category": "offensive", "description": "Cybersecurity attack dataset"},
{"name": "pyToshka/cyber-security-events", "downloads": 16, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "honeypot", "threat-intelligence"], "category": "defensive", "description": "Cybersecurity events collected from honeypot infrastructure"},
{"name": "ahmadkaab/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 16, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "defensive-security", "instruction-tuning"], "category": "defensive", "description": "53,202 defensive security instruction-tuning examples"},
{"name": "MCP-1st-Birthday/smoltrace-cybersecurity-tasks", "downloads": 15, "likes": 0, "size": "<1K", "language": "en", "tags": ["smoltrace", "synthetic-data", "agent-evaluation"], "category": "ai", "description": "SMOLTRACE synthetic dataset for agent evaluation"},
{"name": "ErebusTN/The-Ultimate-CyberSecurity-Dataset-Collection", "downloads": 14, "likes": 1, "size": "unknown", "language": "en", "tags": ["collection"], "category": "ai", "description": "Ultimate cybersecurity dataset collection"},
{"name": "NewsDataHub/cybersecurity-news-dataset-english-3000", "downloads": 14, "likes": 1, "size": "1K-10K", "language": "en", "tags": ["news", "cybersecurity"], "category": "defensive", "description": "3,000 English cybersecurity news metadata rows"},
{"name": "AYI-NEDJIMI/ai-cybersecurity-en", "downloads": 14, "likes": 0, "size": "<1K", "language": "en", "tags": ["artificial-intelligence", "cybersecurity", "offensive-ai", "defensive-ai", "deepfake"], "category": "ai", "description": "AI in Offensive and Defensive Cybersecurity - English Dataset"},
{"name": "AR2021/cybersecurity-corpus-llama2-1k", "downloads": 13, "likes": 1, "size": "<1K", "language": "en", "tags": ["llama2"], "category": "ai", "description": "Cybersecurity corpus for Llama2"},
{"name": "boapro/Purple-Team-Cybersecurity-Dataset", "downloads": 13, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["code", "purple-team"], "category": "defensive", "description": "Synthetic purple team cybersecurity exercises"},
{"name": "tuandunghcmut/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 13, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["cybersecurity", "security", "cyber-defense", "conversational"], "category": "defensive", "description": "GPT format conversational cybersecurity dataset"},
{"name": "AYI-NEDJIMI/ai-cybersecurity-fr", "downloads": 13, "likes": 0, "size": "<1K", "language": "fr", "tags": ["artificial-intelligence", "cybersecurity", "offensive-ai", "defensive-ai"], "category": "ai", "description": "AI in Offensive and Defensive Cybersecurity - French Dataset"},
{"name": "pki/autonlp-data-cybersecurity", "downloads": 12, "likes": 0, "size": "unknown", "language": "en", "tags": ["autonlp"], "category": "ai", "description": "AutoNLP cybersecurity data"},
{"name": "Hadihilman/cybersecurity-dataset", "downloads": 12, "likes": 0, "size": "<1K", "language": "en", "tags": ["images"], "category": "defensive", "description": "Cybersecurity image dataset"},
{"name": "AnodeAI/Elite_quality_cybersecurity", "downloads": 12, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["legal", "finance"], "category": "ai", "description": "Elite quality cybersecurity dataset"},
{"name": "hcnote/Cybersecurity-bigDataset", "downloads": 12, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["large-scale"], "category": "ai", "description": "Global first open-source mega-scale cybersecurity dataset"},
{"name": "bnsapa/cybersecurity-ner", "downloads": 11, "likes": 2, "size": "1K-10K", "language": "en", "tags": ["token-classification", "NER"], "category": "ai", "description": "Cybersecurity named entity recognition dataset"},
{"name": "baig31/Cybersecurity_penetration_testing_books", "downloads": 11, "likes": 12, "size": "unknown", "language": "en", "tags": ["books", "penetration-testing"], "category": "offensive", "description": "Cybersecurity penetration testing books"},
{"name": "beldua/english-cybersecurity-basics-30", "downloads": 11, "likes": 0, "size": "<1K", "language": "en", "tags": ["basics"], "category": "ai", "description": "English cybersecurity basics"},
{"name": "ahmedds10/finetuning_cybersecurity", "downloads": 10, "likes": 0, "size": "<1K", "language": "en", "tags": ["finetuning"], "category": "ai", "description": "Cybersecurity fine-tuning dataset"},
{"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_Smaller", "downloads": 10, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["CyberSecurity", "Finetune"], "category": "ai", "description": "Smaller Alpaca cybersecurity dataset"},
{"name": "ChavyvAkvar/Trendyol-Cybersecurity-Instruction-Tuning-Dataset-Converted", "downloads": 10, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["converted"], "category": "defensive", "description": "Converted Trendyol cybersecurity dataset"},
{"name": "tandevllc/cybersecurity-atom-rss-feeds-2025", "downloads": 10, "likes": 1, "size": "unknown", "language": "en", "tags": ["news", "rss", "feeds"], "category": "defensive", "description": "Cybersecurity Atom/RSS feeds 2025"},
{"name": "tandevllc/cybersecurity-wiki-slices", "downloads": 10, "likes": 1, "size": "10K-100K", "language": "en", "tags": ["wikipedia", "cybersecurity"], "category": "ai", "description": "Curated collection of English Wikipedia pages covering cybersecurity"},
{"name": "antitheft159/CybersecurityAttacks", "downloads": 9, "likes": 1, "size": "unknown", "language": "en", "tags": ["attacks"], "category": "offensive", "description": "Cybersecurity attacks dataset"},
{"name": "Tiamz/cybersecurity-raw-json-datasets", "downloads": 9, "likes": 0, "size": "unknown", "language": "en", "tags": ["raw", "json"], "category": "ai", "description": "Raw JSON cybersecurity datasets"},
{"name": "burpsuite/Cybersecurity-Dataset-v1", "downloads": 9, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["cybersecurity"], "category": "defensive", "description": "2,500 defensive cybersecurity instruction-response pairs"},
{"name": "Deshaune/Global-Cybersecurity-Threats-2015_2024", "downloads": 8, "likes": 1, "size": "1K-10K", "language": "en", "tags": ["global-threats"], "category": "defensive", "description": "Global cybersecurity threats from 2015-2024"},
{"name": "oceancharcoal/Cybersecurity_attack_dataset", "downloads": 8, "likes": 0, "size": "10K-100K", "language": "en", "tags": ["attacks"], "category": "offensive", "description": "Cybersecurity attack dataset"},
{"name": "pyToshka/cyber-security-events-full", "downloads": 8, "likes": 0, "size": "100K-1M", "language": "en", "tags": ["cybersecurity", "honeypot", "threat-intelligence"], "category": "defensive", "description": "Full cybersecurity events from honeypot infrastructure"},
{"name": "dattaraj/rag_eval_cybersecurity", "downloads": 7, "likes": 0, "size": "<1K", "language": "en", "tags": ["RAG", "evaluation"], "category": "ai", "description": "RAG evaluation for cybersecurity"},
{"name": "lianghsun/tw-cybersecurity", "downloads": 7, "likes": 0, "size": "1K-10K", "language": "zh", "tags": ["Taiwan", "cybersecurity", "ISO-27001"], "category": "compliance", "description": "Taiwan cybersecurity dataset with ISO/IEC 27001"},
{"name": "mariiazhiv/Cybersecurity_messages", "downloads": 7, "likes": 0, "size": "1K-10K", "language": "en", "tags": ["messages"], "category": "ai", "description": "Cybersecurity messages dataset"},
{"name": "MichaelPrimez/cybersecurity-questionaire", "downloads": 6, "likes": 0, "size": "<1K", "language": "en", "tags": ["questionnaire", "synthetic", "distilabel"], "category": "ai", "description": "Cybersecurity questionnaire dataset"},
{"name": "lianghsun/tw-cybersecurity-chat", "downloads": 5, "likes": 0, "size": "1K-10K", "language": "zh", "tags": ["Taiwan", "cybersecurity", "chat"], "category": "ai", "description": "Taiwan cybersecurity chat dataset"},
{"name": "WhoIsShe/CyberSecurity-big", "downloads": 5, "likes": 1, "size": "1M-10M", "language": "en", "tags": ["large-scale"], "category": "ai", "description": "Large-scale cybersecurity dataset"},
]
def create_dataframe():
    """Build the metadata table, one row per catalogued dataset.

    Returns:
        pandas.DataFrame: the records of ``DATASETS_METADATA`` plus a ``url``
        column holding each dataset's HuggingFace page address.
    """
    url_template = "https://huggingface.co/datasets/{}"
    frame = pd.DataFrame(DATASETS_METADATA)
    # The hub link is derived from the dataset's "name" field.
    frame['url'] = frame['name'].map(url_template.format)
    return frame
def get_dataset_stats():
    """Summarise the catalogue as a handful of headline figures.

    Returns:
        dict: display label -> value. The download and like totals are
        strings formatted with thousands separators; the other entries are
        integer counts (rows, distinct languages, distinct categories).
    """
    frame = create_dataframe()
    total_downloads = frame['downloads'].sum()
    total_likes = frame['likes'].sum()
    return {
        "Total Datasets": len(frame),
        "Total Downloads": f"{total_downloads:,}",
        "Total Likes": f"{total_likes:,}",
        # dropna=False keeps the count equal to len(unique()), which
        # also counts a missing value as one distinct entry.
        "Languages": frame['language'].nunique(dropna=False),
        "Categories": frame['category'].nunique(dropna=False),
    }
def filter_datasets(keyword, language, category, min_downloads, min_likes):
    """Return the catalogue rows that satisfy every active filter.

    Args:
        keyword: Text searched case-insensitively, as a literal substring, in
            the dataset name, description and tags. Surrounding whitespace is
            ignored; an empty or None value disables this filter.
        language: Exact language value to keep; "All" or empty disables.
        category: Exact category value to keep; "All" or empty disables.
        min_downloads: Inclusive lower bound on downloads; 0/None disables.
        min_likes: Inclusive lower bound on likes; 0/None disables.

    Returns:
        pandas.DataFrame: the matching rows of ``create_dataframe()``.
    """
    df = create_dataframe()

    # Filter by keyword
    keyword = (keyword or "").strip()
    if keyword:
        needle = keyword.lower()

        def tags_match(tags):
            # Tolerate records whose tags are missing or not a sequence.
            if not isinstance(tags, (list, tuple)):
                return False
            return any(needle in str(tag).lower() for tag in tags)

        # regex=False: the keyword is free-form user input, so characters such
        # as "(" or "+" must be matched literally instead of being parsed as a
        # regular expression (which raises on invalid patterns). This also
        # makes name/description matching agree with the literal tag matching.
        mask = (
            df['name'].str.contains(keyword, case=False, na=False, regex=False)
            | df['description'].str.contains(keyword, case=False, na=False, regex=False)
            | df['tags'].apply(tags_match)
        )
        df = df[mask]

    # Filter by language
    if language and language != "All":
        df = df[df['language'] == language]

    # Filter by category
    if category and category != "All":
        df = df[df['category'] == category]

    # Filter by downloads (a falsy bound means "no minimum")
    if min_downloads:
        df = df[df['downloads'] >= min_downloads]

    # Filter by likes (a falsy bound means "no minimum")
    if min_likes:
        df = df[df['likes'] >= min_likes]

    return df
def search_datasets(keyword, language, category, min_downloads, min_likes):
    """Run the filters and shape the outcome for the results table.

    Args:
        keyword, language, category, min_downloads, min_likes: forwarded
            unchanged to ``filter_datasets``.

    Returns:
        tuple: ``(table, summary)`` where ``table`` is a DataFrame restricted
        to the displayed columns with human-readable headers, and ``summary``
        is a one-line message stating how many datasets matched.
    """
    matches = filter_datasets(keyword, language, category, min_downloads, min_likes)
    # Source column -> header shown in the UI; insertion order is the
    # column order of the resulting table.
    headers = {
        'name': 'Dataset Name',
        'downloads': 'Downloads',
        'likes': 'Likes',
        'size': 'Size',
        'language': 'Language',
        'category': 'Category',
        'description': 'Description',
    }
    table = matches[list(headers)].rename(columns=headers)
    summary = f"Found {len(matches)} datasets matching your criteria"
    return table, summary
def get_dataset_details(dataset_name):
    """Produce the detail view for a single dataset.

    Args:
        dataset_name: Value compared for equality against the catalogue's
            ``name`` column.

    Returns:
        tuple: ``(markdown, preview, url)``. When no name is given, or the
        name is not in the catalogue, ``markdown`` is a short message and the
        other two items are ``None``. ``preview`` is a fixed placeholder
        table, not real dataset samples.
    """
    catalogue = create_dataframe()
    if not dataset_name:
        return "Please select a dataset from the list above", None, None

    hits = catalogue.loc[catalogue['name'] == dataset_name]
    if hits.empty:
        return "Dataset not found", None, None

    row = hits.iloc[0]
    link = row['url']
    tag_list = ', '.join(row['tags'])
    details = f"""
## {row['name']}
**Description:** {row['description']}
**Statistics:**
- Downloads: {row['downloads']:,}
- Likes: {row['likes']}
- Size: {row['size']}
- Language: {row['language']}
- Category: {row['category']}
**Tags:** {tag_list}
**HuggingFace URL:** [{link}]({link})
---
*Note: To preview dataset samples, you would need to load the actual dataset using the HuggingFace datasets library.
This demo shows metadata only. For full dataset access, click the URL above.*
"""
    # Placeholder rows only: the actual dataset is never loaded here.
    placeholder = pd.DataFrame({
        "Column": ["Feature 1", "Feature 2", "Feature 3"],
        "Type": ["text", "text", "category"],
        "Sample": ["Sample data...", "Sample data...", "Sample category..."],
    })
    return details, placeholder, link
def create_category_chart():
    """Draw a donut chart of how many datasets fall in each category.

    Returns:
        The Plotly figure, styled with transparent backgrounds and a white
        font colour.
    """
    per_category = create_dataframe()['category'].value_counts()
    transparent = 'rgba(0,0,0,0)'
    figure = px.pie(
        names=per_category.index,
        values=per_category.values,
        hole=0.3,
        title='Datasets by Category',
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    figure.update_layout(
        font={'color': 'white'},
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
    )
    return figure
def create_language_chart():
    """Draw a bar chart of the ten most frequent dataset languages.

    Returns:
        The Plotly figure; bars are coloured by their count on the Viridis
        scale, with transparent backgrounds and a white font colour.
    """
    per_language = create_dataframe()['language'].value_counts().head(10)
    counts = per_language.values
    transparent = 'rgba(0,0,0,0)'
    figure = px.bar(
        x=per_language.index,
        y=counts,
        color=counts,
        color_continuous_scale='Viridis',
        labels={'x': 'Language', 'y': 'Number of Datasets'},
        title='Top 10 Languages',
    )
    figure.update_layout(
        showlegend=False,
        font={'color': 'white'},
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
    )
    return figure
def create_downloads_chart():
    """Create a horizontal bar chart of the 15 most-downloaded datasets.

    Returns:
        The Plotly figure. Bars are labelled with the part of the dataset
        name after the last "/", truncated to 30 characters, and are listed
        with the most-downloaded dataset at the top.
    """
    df = create_dataframe()
    # assign() returns a new frame, so the extra column is never written into
    # a slice of another frame (which triggers SettingWithCopyWarning).
    top_downloads = df.nlargest(15, 'downloads')[['name', 'downloads']].assign(
        short_name=lambda frame: frame['name'].apply(lambda x: x.split('/')[-1][:30])
    )
    fig = px.bar(
        top_downloads,
        x='downloads',
        y='short_name',
        orientation='h',
        title='Top 15 Datasets by Downloads',
        labels={'downloads': 'Downloads', 'short_name': 'Dataset'},
        color='downloads',
        color_continuous_scale='Plasma'
    )
    fig.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='white'),
        height=600,
        showlegend=False,
        # Plotly lays out categories bottom-to-top; reverse the axis so the
        # first (largest) row of the ranking appears at the top.
        yaxis=dict(autorange='reversed')
    )
    return fig
def create_size_distribution_chart():
    """Draw a bar chart of how many datasets fall in each size bucket.

    Returns:
        The Plotly figure; bars are coloured by their count on the Cividis
        scale, with transparent backgrounds and a white font colour.
    """
    per_size = create_dataframe()['size'].value_counts()
    counts = per_size.values
    transparent = 'rgba(0,0,0,0)'
    figure = px.bar(
        x=per_size.index,
        y=counts,
        color=counts,
        color_continuous_scale='Cividis',
        labels={'x': 'Size Category', 'y': 'Number of Datasets'},
        title='Dataset Size Distribution',
    )
    figure.update_layout(
        showlegend=False,
        font={'color': 'white'},
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
    )
    return figure
def export_to_csv(keyword, language, category, min_downloads, min_likes):
    """Export the filtered datasets to a CSV file.

    Args:
        keyword, language, category, min_downloads, min_likes: forwarded
            unchanged to ``filter_datasets``.

    Returns:
        str: path of the written file, named ``cybersecurity_datasets.csv``.
    """
    df = filter_datasets(keyword, language, category, min_downloads, min_likes)
    # Write into a fresh per-call directory under the system temp location: a
    # single fixed path would be shared by all concurrent requests (one export
    # could overwrite another) and "/tmp" does not exist on every platform.
    # The directory is intentionally left behind because the caller still has
    # to read the file after this function returns.
    export_dir = tempfile.mkdtemp(prefix="dataset_explorer_")
    output_path = os.path.join(export_dir, "cybersecurity_datasets.csv")
    df.to_csv(output_path, index=False)
    return output_path
def export_to_json(keyword, language, category, min_downloads, min_likes):
    """Export the filtered datasets to a JSON file (one object per record).

    Args:
        keyword, language, category, min_downloads, min_likes: forwarded
            unchanged to ``filter_datasets``.

    Returns:
        str: path of the written file, named ``cybersecurity_datasets.json``.
    """
    df = filter_datasets(keyword, language, category, min_downloads, min_likes)
    # Write into a fresh per-call directory under the system temp location: a
    # single fixed path would be shared by all concurrent requests (one export
    # could overwrite another) and "/tmp" does not exist on every platform.
    # The directory is intentionally left behind because the caller still has
    # to read the file after this function returns.
    export_dir = tempfile.mkdtemp(prefix="dataset_explorer_")
    output_path = os.path.join(export_dir, "cybersecurity_datasets.json")
    df.to_json(output_path, orient='records', indent=2)
    return output_path
# Create Gradio interface.
# Builds the whole UI at import time: a header, a row of headline statistics,
# four tabs (search/export, dataset details, charts, about) and a footer.
# Emoji in labels are written as real UTF-8 characters.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple").set(
    body_background_fill='*primary_900',
    body_background_fill_dark='*primary_950',
    block_background_fill='*primary_800',
    block_background_fill_dark='*primary_900',
    block_border_color='*primary_600',
    input_background_fill='*primary_700',
    button_primary_background_fill='*primary_600',
    button_primary_background_fill_hover='*primary_500',
)) as demo:
    gr.Markdown("""
# 🔍 Cybersecurity Dataset Explorer
Explore and analyze 80+ cybersecurity datasets from HuggingFace
**Features:**
- Search by keyword, language, category
- View detailed statistics and visualizations
- Export datasets metadata to CSV/JSON
- Preview dataset information
- Direct links to HuggingFace repositories
""")

    # Statistics overview: one Markdown tile per headline figure.
    with gr.Row():
        stats = get_dataset_stats()
        for key, value in stats.items():
            gr.Markdown(f"### {key}\n## {value}")

    # Main tabs
    with gr.Tabs():
        # Search & Filter Tab
        with gr.Tab("🔍 Search & Filter"):
            with gr.Row():
                # Left column: filter controls.
                with gr.Column(scale=1):
                    keyword_input = gr.Textbox(
                        label="Search Keyword",
                        placeholder="Enter keyword (name, description, or tags)...",
                        lines=1
                    )
                    language_dropdown = gr.Dropdown(
                        label="Language",
                        # Offer exactly the languages present in the metadata.
                        choices=["All"] + sorted({d['language'] for d in DATASETS_METADATA}),
                        value="All"
                    )
                    category_dropdown = gr.Dropdown(
                        label="Category",
                        choices=["All", "ai", "defensive", "offensive", "compliance"],
                        value="All"
                    )
                    min_downloads_slider = gr.Slider(
                        label="Minimum Downloads",
                        minimum=0,
                        maximum=10000,
                        value=0,
                        step=100
                    )
                    min_likes_slider = gr.Slider(
                        label="Minimum Likes",
                        minimum=0,
                        maximum=100,
                        value=0,
                        step=1
                    )
                    search_btn = gr.Button("🔍 Search Datasets", variant="primary")
                # Right column: results and export controls.
                with gr.Column(scale=3):
                    result_text = gr.Textbox(label="Search Results", lines=1)
                    results_table = gr.Dataframe(
                        label="Datasets",
                        wrap=True,
                        interactive=False
                    )
                    with gr.Row():
                        export_csv_btn = gr.Button("📥 Export to CSV")
                        export_json_btn = gr.Button("📥 Export to JSON")
                    with gr.Row():
                        csv_file = gr.File(label="CSV Download")
                        json_file = gr.File(label="JSON Download")

            # Search and both exports read the same five filter inputs.
            search_btn.click(
                fn=search_datasets,
                inputs=[keyword_input, language_dropdown, category_dropdown, min_downloads_slider, min_likes_slider],
                outputs=[results_table, result_text]
            )
            export_csv_btn.click(
                fn=export_to_csv,
                inputs=[keyword_input, language_dropdown, category_dropdown, min_downloads_slider, min_likes_slider],
                outputs=csv_file
            )
            export_json_btn.click(
                fn=export_to_json,
                inputs=[keyword_input, language_dropdown, category_dropdown, min_downloads_slider, min_likes_slider],
                outputs=json_file
            )

        # Dataset Details Tab
        with gr.Tab("📊 Dataset Details"):
            dataset_selector = gr.Dropdown(
                label="Select Dataset",
                choices=[d['name'] for d in DATASETS_METADATA],
                value=DATASETS_METADATA[0]['name'] if DATASETS_METADATA else None
            )
            view_details_btn = gr.Button("View Details", variant="primary")
            dataset_details = gr.Markdown(label="Dataset Information")
            preview_table = gr.Dataframe(label="Preview (Mock Data)")
            dataset_link = gr.Textbox(label="HuggingFace URL")
            view_details_btn.click(
                fn=get_dataset_details,
                inputs=dataset_selector,
                outputs=[dataset_details, preview_table, dataset_link]
            )

        # Statistics & Visualizations Tab
        with gr.Tab("📈 Statistics & Charts"):
            gr.Markdown("## Dataset Analytics Dashboard")
            with gr.Row():
                category_chart = gr.Plot(label="Category Distribution")
                language_chart = gr.Plot(label="Language Distribution")
            with gr.Row():
                downloads_chart = gr.Plot(label="Top Downloads")
            with gr.Row():
                size_chart = gr.Plot(label="Size Distribution")
            refresh_charts_btn = gr.Button("🔄 Refresh Charts", variant="primary")

            def refresh_all_charts():
                """Rebuild the four figures, in the order of the plot outputs."""
                return (
                    create_category_chart(),
                    create_language_chart(),
                    create_downloads_chart(),
                    create_size_distribution_chart()
                )

            refresh_charts_btn.click(
                fn=refresh_all_charts,
                outputs=[category_chart, language_chart, downloads_chart, size_chart]
            )
            # Load charts on startup
            demo.load(
                fn=refresh_all_charts,
                outputs=[category_chart, language_chart, downloads_chart, size_chart]
            )

        # About Tab
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
## About Dataset Explorer
This application provides a comprehensive interface to explore 80 cybersecurity datasets from HuggingFace.
### Features:
1. **Search & Filter**: Find datasets by keyword, language, category, popularity
2. **Dataset Details**: View comprehensive information about each dataset
3. **Statistics**: Visual analytics with interactive charts
4. **Export**: Download filtered results as CSV or JSON
5. **Direct Links**: Access to HuggingFace repositories
### Categories:
- **AI**: Datasets for training AI/ML models
- **Defensive**: Blue team, threat detection, incident response
- **Offensive**: Red team, penetration testing, exploits
- **Compliance**: NIST, ISO 27001, regulatory frameworks
### Data Sources:
All datasets are publicly available on HuggingFace Hub. This explorer provides
metadata and filtering capabilities. To access the actual dataset content,
click the HuggingFace URL for any dataset.
### Technologies:
- **Gradio**: Interactive web interface
- **Pandas**: Data manipulation
- **Plotly**: Interactive visualizations
- **HuggingFace Datasets**: Dataset metadata
---
**Created by:** AYI-NEDJIMI
**Version:** 1.0
**Last Updated:** February 2026
""")

    # Footer
    gr.Markdown("""
---
💡 **Tip**: Use the search feature to find datasets by specific topics like "NIST", "penetration testing", "threat intelligence", etc.
""")

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()