davanstrien (HF Staff) and Claude Opus 4.5 committed on
Commit 72431fa · 1 Parent(s): 7622a26

Add MVP implementation for dataset card drafter


- app.py: WebhooksServer + Gradio UI for webhook handling
- description_generator.py: LLM-based description generation
- requirements.txt: Dependencies (gradio, huggingface_hub, datasets-server-py)

Features:
- Watches davanstrien/* datasets via webhooks
- Uses DatasetCard for YAML-aware README handling
- Generates descriptions with GLM-4.6V via InferenceClient
- Opens PRs with card.push_to_hub(create_pr=True)
- Manual test and trigger UI tabs
- JSON persistence in /data for Space

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
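
For reference, a minimal sketch of how the webhook feeding this Space could be registered via `huggingface_hub`. Assumptions: the Space URL is a placeholder, the `/webhooks/dataset_update` path mirrors the route defined in `app.py` below, and the webhook could equally be created from the Hub settings UI instead of programmatically.

```python
# Hypothetical registration of the Hub webhook that triggers this Space.
# The Space URL and secret are placeholders; adjust to the deployed Space.
from huggingface_hub import HfApi

api = HfApi()  # uses the locally configured HF token
api.create_webhook(
    url="https://davanstrien-dataset-card-drafter.hf.space/webhooks/dataset_update",
    watched=[{"type": "user", "name": "davanstrien"}],  # covers davanstrien/* repos
    domains=["repo"],  # fire on repo content changes (new/updated datasets)
    secret="<same value as the Space's WEBHOOK_SECRET>",
)
```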

Files changed (5)
  1. .beads/issues.jsonl +1 -0
  2. .gitignore +26 -0
  3. app.py +221 -0
  4. description_generator.py +173 -0
  5. requirements.txt +3 -0
.beads/issues.jsonl ADDED
@@ -0,0 +1 @@
+ {"id":"dataset-card-drafter-wbd","title":"MVP implementation: WebhooksServer + DatasetCard + InferenceClient","description":"","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-15T17:24:36.365733Z","updated_at":"2025-12-15T17:28:21.127763Z","closed_at":"2025-12-15T17:28:21.127763Z","close_reason":"MVP implemented with WebhooksServer, DatasetCard, and InferenceClient"}
.gitignore ADDED
@@ -0,0 +1,26 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ .venv/
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Local data (for development)
+ local_data/
+
+ # Environment
+ .env
+ .env.local
+
+ # OS
+ .DS_Store
+ Thumbs.db
app.py ADDED
@@ -0,0 +1,221 @@
+ """Dataset Card Drafter - MVP Space.
+
+ Watches davanstrien/* datasets and opens PRs with auto-generated descriptions.
+ """
+
+ import json
+ import os
+ from datetime import datetime
+ from pathlib import Path
+
+ import gradio as gr
+ from huggingface_hub import DatasetCard, WebhookPayload, WebhooksServer
+
+ from description_generator import generate_description
+
+ # Configuration
+ WATCHED_PREFIXES = ["davanstrien/"]  # Repos to watch
+ MIN_DESCRIPTION_LENGTH = 100  # Chars below which we generate
+
+ # Persistence directory
+ DATA_DIR = Path("/data") if Path("/data").exists() else Path("./local_data")
+ DATA_DIR.mkdir(exist_ok=True)
+ PROCESSED_FILE = DATA_DIR / "processed.json"
+
+
+ def load_processed() -> dict:
+     """Load processed datasets from persistence."""
+     if PROCESSED_FILE.exists():
+         return json.loads(PROCESSED_FILE.read_text())
+     return {}
+
+
+ def save_processed(data: dict) -> None:
+     """Save processed datasets to persistence."""
+     PROCESSED_FILE.write_text(json.dumps(data, indent=2))
+
+
+ def is_watched_repo(repo_name: str) -> bool:
+     """Check if a repo is in our watched list."""
+     return any(repo_name.startswith(prefix) for prefix in WATCHED_PREFIXES)
+
+
+ def should_generate(card: DatasetCard) -> bool:
+     """Check if a dataset card needs a description."""
+     if not card.text:
+         return True
+     return len(card.text.strip()) < MIN_DESCRIPTION_LENGTH
+
+
+ async def process_dataset(dataset_id: str, hf_token: str) -> dict:
+     """Process a single dataset: check, generate, and open PR.
+
+     Returns a status dict with results.
+     """
+     # Load current card
+     try:
+         card = DatasetCard.load(dataset_id)
+     except Exception as e:
+         return {"status": "error", "reason": f"card load failed: {e}"}
+
+     # Check if description needed
+     if not should_generate(card):
+         return {"status": "skipped", "reason": "description exists"}
+
+     # Generate description
+     try:
+         description = generate_description(dataset_id, hf_token)
+     except Exception as e:
+         return {"status": "error", "reason": f"generation failed: {e}"}
+
+     if not description:
+         return {"status": "error", "reason": "empty description generated"}
+
+     # Update card and push as PR
+     card.text = description
+
+     try:
+         commit_info = card.push_to_hub(
+             repo_id=dataset_id,
+             repo_type="dataset",
+             commit_message="Add dataset description",
+             create_pr=True,
+             token=hf_token,
+         )
+         pr_url = getattr(commit_info, "pr_url", str(commit_info))
+     except Exception as e:
+         return {"status": "error", "reason": f"PR creation failed: {e}"}
+
+     return {"status": "pr_created", "pr_url": pr_url, "description": description}
+
+
+ # Gradio UI
+ with gr.Blocks(title="Dataset Card Drafter") as demo:
+     gr.Markdown("# Dataset Card Drafter MVP")
+     gr.Markdown(
+         f"Watching datasets matching: `{'`, `'.join(WATCHED_PREFIXES)}`\n\n"
+         f"Triggers when description < {MIN_DESCRIPTION_LENGTH} characters."
+     )
+
+     with gr.Tab("Status"):
+         status_display = gr.JSON(label="Processed Datasets", value=load_processed)
+         refresh_btn = gr.Button("Refresh")
+         refresh_btn.click(fn=load_processed, outputs=status_display)
+
+     with gr.Tab("Manual Test"):
+         gr.Markdown(
+             "Test description generation without opening a PR.\n\n"
+             "**Note:** This requires `HF_TOKEN` to be set."
+         )
+         test_input = gr.Textbox(
+             label="Dataset ID",
+             placeholder="davanstrien/test-dataset",
+         )
+         test_btn = gr.Button("Generate Description (Preview)")
+         test_output = gr.Textbox(label="Generated Description", lines=5)
+         test_status = gr.JSON(label="Status")
+
+         def test_generate(dataset_id: str):
+             if not dataset_id:
+                 return "", {"status": "error", "reason": "no dataset ID provided"}
+
+             hf_token = os.getenv("HF_TOKEN")
+             if not hf_token:
+                 return "", {"status": "error", "reason": "HF_TOKEN not set"}
+
+             try:
+                 description = generate_description(dataset_id, hf_token)
+                 return description, {"status": "success", "length": len(description)}
+             except Exception as e:
+                 return "", {"status": "error", "reason": str(e)}
+
+         test_btn.click(
+             fn=test_generate,
+             inputs=test_input,
+             outputs=[test_output, test_status],
+         )
+
+     with gr.Tab("Trigger PR"):
+         gr.Markdown(
+             "Manually trigger description generation and PR creation.\n\n"
+             "**Warning:** This will open a real PR!"
+         )
+         trigger_input = gr.Textbox(
+             label="Dataset ID",
+             placeholder="davanstrien/test-dataset",
+         )
+         trigger_btn = gr.Button("Generate & Open PR", variant="primary")
+         trigger_output = gr.JSON(label="Result")
+
+         async def trigger_pr(dataset_id: str):
+             if not dataset_id:
+                 return {"status": "error", "reason": "no dataset ID provided"}
+
+             hf_token = os.getenv("HF_TOKEN")
+             if not hf_token:
+                 return {"status": "error", "reason": "HF_TOKEN not set"}
+
+             result = await process_dataset(dataset_id, hf_token)
+
+             # Save to processed log
+             if result.get("status") == "pr_created":
+                 processed = load_processed()
+                 processed[dataset_id] = {
+                     "pr_url": result.get("pr_url"),
+                     "timestamp": datetime.now().isoformat(),
+                     "status": "pr_created",
+                     "trigger": "manual",
+                 }
+                 save_processed(processed)
+
+             return result
+
+         trigger_btn.click(
+             fn=trigger_pr,
+             inputs=trigger_input,
+             outputs=trigger_output,
+         )
+
+
+ # WebhooksServer with automatic secret verification
+ app = WebhooksServer(ui=demo, webhook_secret=os.getenv("WEBHOOK_SECRET"))
+
+
+ @app.add_webhook("/dataset_update")
+ async def handle_dataset_webhook(payload: WebhookPayload) -> dict:
+     """Handle dataset creation/update webhooks."""
+     # Filter for datasets only
+     if payload.repo.type != "dataset":
+         return {"status": "skipped", "reason": "not a dataset"}
+
+     # Filter for watched repos
+     if not is_watched_repo(payload.repo.name):
+         return {"status": "skipped", "reason": "not in watched list"}
+
+     dataset_id = payload.repo.name
+
+     # Get token
+     hf_token = os.getenv("HF_TOKEN")
+     if not hf_token:
+         return {"status": "error", "reason": "HF_TOKEN not configured"}
+
+     # Process the dataset
+     result = await process_dataset(dataset_id, hf_token)
+
+     # Save to processed log
+     processed = load_processed()
+     processed[dataset_id] = {
+         "pr_url": result.get("pr_url"),
+         "timestamp": datetime.now().isoformat(),
+         "status": result.get("status"),
+         "reason": result.get("reason"),
+         "trigger": "webhook",
+         "event": payload.event.action if payload.event else None,
+     }
+     save_processed(processed)
+
+     return result
+
+
+ if __name__ == "__main__":
+     app.launch()
description_generator.py ADDED
@@ -0,0 +1,173 @@
+ """Generate dataset descriptions using an LLM with a single prompt."""
+
+ import json
+ import re
+
+ from datasets_server import DatasetsServerClient
+ from huggingface_hub import InferenceClient
+
+ DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"
+
+
+ def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
+     """Gather all dataset information upfront from the Datasets Viewer API."""
+     client = DatasetsServerClient(token=hf_token)
+
+     info = {"dataset": dataset}
+
+     # Get validity and splits
+     try:
+         validity = client.is_valid(dataset)
+         info["validity"] = {
+             "viewer": validity.viewer,
+             "preview": validity.preview,
+             "search": validity.search,
+             "filter": validity.filter,
+             "statistics": validity.statistics,
+         }
+     except Exception as e:
+         info["validity_error"] = str(e)
+         return info  # Can't continue without validity
+
+     # Get splits
+     splits = []  # Default so the checks below don't fail if list_splits errors
+     try:
+         splits = client.list_splits(dataset)
+         info["splits"] = [{"config": s.config, "split": s.split} for s in splits]
+
+         size = client.get_size(dataset)
+         info["size"] = size.size.get("dataset", {}) if size.size else {}
+     except Exception as e:
+         info["splits_error"] = str(e)
+
+     # Get features and sample rows
+     if splits:
+         first_split = splits[0]
+         try:
+             preview = client.preview(dataset, first_split.config, first_split.split)
+             info["features"] = preview.features[:10]  # Limit features
+         except Exception as e:
+             info["features_error"] = str(e)
+
+         try:
+             samples = client.sample_rows(
+                 dataset,
+                 first_split.config,
+                 first_split.split,
+                 n_samples=15,
+                 seed=42,
+                 max_requests=10,
+             )
+             # Truncate long values, tracking truncation
+             rows = []
+             truncation_occurred = False
+             for row_data in samples.rows:
+                 row = row_data.get("row", {})
+                 processed = {}
+                 for k, v in row.items():
+                     v_str = str(v)
+                     if len(v_str) > 1200:
+                         processed[k] = (
+                             v_str[:1200]
+                             + f"... [truncated, original {len(v_str)} chars]"
+                         )
+                         truncation_occurred = True
+                     else:
+                         processed[k] = v
+                 rows.append(processed)
+             info["sample_rows"] = rows
+             info["samples_truncated"] = truncation_occurred
+             info["num_rows_total"] = samples.num_rows_total
+         except Exception as e:
+             info["samples_error"] = str(e)
+
+     # Get statistics if available
+     if info.get("validity", {}).get("statistics"):
+         try:
+             first_split = splits[0]
+             stats = client.get_statistics(
+                 dataset, first_split.config, first_split.split
+             )
+             info["statistics"] = stats.statistics  # Pass raw stats to model
+         except Exception as e:
+             info["statistics_error"] = str(e)
+     else:
+         info["statistics"] = "Not available for this dataset"
+
+     return info
+
+
+ def build_prompt(dataset_info: dict) -> str:
+     """Build the prompt with all gathered information."""
+     dataset_id = dataset_info["dataset"]
+
+     # Format the info nicely
+     info_text = json.dumps(dataset_info, indent=2, default=str)
+
+     return f"""Write a description for the HuggingFace dataset '{dataset_id}'.
+
+ Below is information from the Datasets Viewer API:
+ - Dataset metadata (splits, size, features)
+ - A random sample of rows (not the full dataset)
+ - Column statistics (if available)
+
+ DATASETS VIEWER INFO:
+ {info_text}
+
+ Requirements:
+ - 2-4 sentences, concise but complete, suitable for a dataset card
+ - Start with "This dataset..."
+ - Include: what the data contains, size, and structure
+ - For text data, mention the language(s) if evident from samples
+ - Mention the likely domain and ML task if reasonably confident
+ - Note any notable patterns in statistics (e.g., class imbalance)
+ - Use hedging ("appears suitable for", "likely") for inferred purposes
+
+ Important:
+ - Only state facts verifiable from the provided data
+ - Do not guess at licensing, collection methods, or details not shown
+ - The dataset ID may hint at the source or purpose
+
+ Respond with ONLY the description in <description> tags."""
+
+
+ def generate_description(
+     dataset_id: str,
+     hf_token: str,
+     model: str = DEFAULT_MODEL,
+ ) -> str:
+     """Generate a description for a dataset using an LLM.
+
+     Args:
+         dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
+         hf_token: HuggingFace token for API access
+         model: Model to use for generation
+
+     Returns:
+         Generated description string
+     """
+     # Gather dataset information
+     dataset_info = gather_dataset_info(dataset_id, hf_token)
+
+     # Build prompt
+     prompt = build_prompt(dataset_info)
+
+     # Call LLM using InferenceClient
+     client = InferenceClient(token=hf_token)
+
+     response = client.chat_completion(
+         model=model,
+         messages=[{"role": "user", "content": prompt}],
+         max_tokens=2000,
+     )
+
+     final_description = response.choices[0].message.content
+
+     # Extract description from tags if present
+     if final_description:
+         match = re.search(
+             r"<description>\s*(.*?)\s*</description>", final_description, re.DOTALL
+         )
+         if match:
+             final_description = match.group(1).strip()
+
+     return final_description or ""
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio>=4.0
+ huggingface_hub>=0.26
+ datasets-server-py @ git+https://github.com/davanstrien/datasets-server-py.git
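
A quick local smoke test of the generator (no PR is opened), assuming the dependencies above are installed and `HF_TOKEN` is set; the dataset ID is a placeholder:

```python
# Preview a generated description locally without touching the Hub repo.
import os

from description_generator import generate_description

description = generate_description(
    "davanstrien/test-dataset",  # placeholder dataset ID
    hf_token=os.environ["HF_TOKEN"],
)
print(description)
```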