davanstrien (HF Staff) and Claude Opus 4.5 committed on
Commit 72431fa · 1 Parent(s): 7622a26

Add MVP implementation for dataset card drafter


- app.py: WebhooksServer + Gradio UI for webhook handling
- description_generator.py: LLM-based description generation
- requirements.txt: Dependencies (gradio, huggingface_hub, datasets-server-py)

Features:
- Watches davanstrien/* datasets via webhooks
- Uses DatasetCard for YAML-aware README handling
- Generates descriptions with GLM-4.6V via InferenceClient
- Opens PRs with card.push_to_hub(create_pr=True)
- Manual test and trigger UI tabs
- JSON persistence in /data for Space

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
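
For reference, a minimal sketch of how the webhook feeding this Space could be registered via `huggingface_hub`. Assumptions: the Space URL is a placeholder, the `/webhooks/dataset_update` path mirrors the route defined in `app.py` below, and the webhook could equally be created from the Hub settings UI instead of programmatically.

```python
# Hypothetical registration of the Hub webhook that triggers this Space.
# The Space URL and secret are placeholders; adjust to the deployed Space.
from huggingface_hub import HfApi

api = HfApi()  # uses the locally configured HF token
api.create_webhook(
    url="https://davanstrien-dataset-card-drafter.hf.space/webhooks/dataset_update",
    watched=[{"type": "user", "name": "davanstrien"}],  # covers davanstrien/* repos
    domains=["repo"],  # fire on repo content changes (new/updated datasets)
    secret="<same value as the Space's WEBHOOK_SECRET>",
)
```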

Files changed (5)
  1. .beads/issues.jsonl +1 -0
  2. .gitignore +26 -0
  3. app.py +221 -0
  4. description_generator.py +173 -0
  5. requirements.txt +3 -0
.beads/issues.jsonl ADDED
@@ -0,0 +1 @@
+ {"id":"dataset-card-drafter-wbd","title":"MVP implementation: WebhooksServer + DatasetCard + InferenceClient","description":"","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-15T17:24:36.365733Z","updated_at":"2025-12-15T17:28:21.127763Z","closed_at":"2025-12-15T17:28:21.127763Z","close_reason":"MVP implemented with WebhooksServer, DatasetCard, and InferenceClient"}
.gitignore ADDED
@@ -0,0 +1,26 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ .venv/
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Local data (for development)
+ local_data/
+
+ # Environment
+ .env
+ .env.local
+
+ # OS
+ .DS_Store
+ Thumbs.db
app.py ADDED
@@ -0,0 +1,221 @@
+ """Dataset Card Drafter - MVP Space.
+
+ Watches davanstrien/* datasets and opens PRs with auto-generated descriptions.
+ """
+
+ import json
+ import os
+ from datetime import datetime
+ from pathlib import Path
+
+ import gradio as gr
+ from huggingface_hub import DatasetCard, WebhookPayload, WebhooksServer
+
+ from description_generator import generate_description
+
+ # Configuration
+ WATCHED_PREFIXES = ["davanstrien/"]  # Repos to watch
+ MIN_DESCRIPTION_LENGTH = 100  # Chars below which we generate
+
+ # Persistence directory
+ DATA_DIR = Path("/data") if Path("/data").exists() else Path("./local_data")
+ DATA_DIR.mkdir(exist_ok=True)
+ PROCESSED_FILE = DATA_DIR / "processed.json"
+
+
+ def load_processed() -> dict:
+     """Load processed datasets from persistence."""
+     if PROCESSED_FILE.exists():
+         return json.loads(PROCESSED_FILE.read_text())
+     return {}
+
+
+ def save_processed(data: dict) -> None:
+     """Save processed datasets to persistence."""
+     PROCESSED_FILE.write_text(json.dumps(data, indent=2))
+
+
+ def is_watched_repo(repo_name: str) -> bool:
+     """Check if a repo is in our watched list."""
+     return any(repo_name.startswith(prefix) for prefix in WATCHED_PREFIXES)
+
+
+ def should_generate(card: DatasetCard) -> bool:
+     """Check if a dataset card needs a description."""
+     if not card.text:
+         return True
+     return len(card.text.strip()) < MIN_DESCRIPTION_LENGTH
+
+
+ async def process_dataset(dataset_id: str, hf_token: str) -> dict:
+     """Process a single dataset: check, generate, and open PR.
+
+     Returns a status dict with results.
+     """
+     # Load current card
+     try:
+         card = DatasetCard.load(dataset_id)
+     except Exception as e:
+         return {"status": "error", "reason": f"card load failed: {e}"}
+
+     # Check if description needed
+     if not should_generate(card):
+         return {"status": "skipped", "reason": "description exists"}
+
+     # Generate description
+     try:
+         description = generate_description(dataset_id, hf_token)
+     except Exception as e:
+         return {"status": "error", "reason": f"generation failed: {e}"}
+
+     if not description:
+         return {"status": "error", "reason": "empty description generated"}
+
+     # Update card and push as PR
+     card.text = description
+
+     try:
+         commit_info = card.push_to_hub(
+             repo_id=dataset_id,
+             repo_type="dataset",
+             commit_message="Add dataset description",
+             create_pr=True,
+             token=hf_token,
+         )
+         pr_url = getattr(commit_info, "pr_url", str(commit_info))
+     except Exception as e:
+         return {"status": "error", "reason": f"PR creation failed: {e}"}
+
+     return {"status": "pr_created", "pr_url": pr_url, "description": description}
+
+
+ # Gradio UI
+ with gr.Blocks(title="Dataset Card Drafter") as demo:
+     gr.Markdown("# Dataset Card Drafter MVP")
+     gr.Markdown(
+         f"Watching datasets matching: `{'`, `'.join(WATCHED_PREFIXES)}`\n\n"
+         f"Triggers when description < {MIN_DESCRIPTION_LENGTH} characters."
+     )
+
+     with gr.Tab("Status"):
+         status_display = gr.JSON(label="Processed Datasets", value=load_processed)
+         refresh_btn = gr.Button("Refresh")
+         refresh_btn.click(fn=load_processed, outputs=status_display)
+
+     with gr.Tab("Manual Test"):
+         gr.Markdown(
+             "Test description generation without opening a PR.\n\n"
+             "**Note:** This requires `HF_TOKEN` to be set."
+         )
+         test_input = gr.Textbox(
+             label="Dataset ID",
+             placeholder="davanstrien/test-dataset",
+         )
+         test_btn = gr.Button("Generate Description (Preview)")
+         test_output = gr.Textbox(label="Generated Description", lines=5)
+         test_status = gr.JSON(label="Status")
+
+         def test_generate(dataset_id: str):
+             if not dataset_id:
+                 return "", {"status": "error", "reason": "no dataset ID provided"}
+
+             hf_token = os.getenv("HF_TOKEN")
+             if not hf_token:
+                 return "", {"status": "error", "reason": "HF_TOKEN not set"}
+
+             try:
+                 description = generate_description(dataset_id, hf_token)
+                 return description, {"status": "success", "length": len(description)}
+             except Exception as e:
+                 return "", {"status": "error", "reason": str(e)}
+
+         test_btn.click(
+             fn=test_generate,
+             inputs=test_input,
+             outputs=[test_output, test_status],
+         )
+
+     with gr.Tab("Trigger PR"):
+         gr.Markdown(
+             "Manually trigger description generation and PR creation.\n\n"
+             "**Warning:** This will open a real PR!"
+         )
+         trigger_input = gr.Textbox(
+             label="Dataset ID",
+             placeholder="davanstrien/test-dataset",
+         )
+         trigger_btn = gr.Button("Generate & Open PR", variant="primary")
+         trigger_output = gr.JSON(label="Result")
+
+         async def trigger_pr(dataset_id: str):
+             if not dataset_id:
+                 return {"status": "error", "reason": "no dataset ID provided"}
+
+             hf_token = os.getenv("HF_TOKEN")
+             if not hf_token:
+                 return {"status": "error", "reason": "HF_TOKEN not set"}
+
+             result = await process_dataset(dataset_id, hf_token)
+
+             # Save to processed log
+             if result.get("status") == "pr_created":
+                 processed = load_processed()
+                 processed[dataset_id] = {
+                     "pr_url": result.get("pr_url"),
+                     "timestamp": datetime.now().isoformat(),
+                     "status": "pr_created",
+                     "trigger": "manual",
+                 }
+                 save_processed(processed)
+
+             return result
+
+         trigger_btn.click(
+             fn=trigger_pr,
+             inputs=trigger_input,
+             outputs=trigger_output,
+         )
+
+
+ # WebhooksServer with automatic secret verification
+ app = WebhooksServer(ui=demo, webhook_secret=os.getenv("WEBHOOK_SECRET"))
+
+
+ @app.add_webhook("/dataset_update")
+ async def handle_dataset_webhook(payload: WebhookPayload) -> dict:
+     """Handle dataset creation/update webhooks."""
+     # Filter for datasets only
+     if payload.repo.type != "dataset":
+         return {"status": "skipped", "reason": "not a dataset"}
+
+     # Filter for watched repos
+     if not is_watched_repo(payload.repo.name):
+         return {"status": "skipped", "reason": "not in watched list"}
+
+     dataset_id = payload.repo.name
+
+     # Get token
+     hf_token = os.getenv("HF_TOKEN")
+     if not hf_token:
+         return {"status": "error", "reason": "HF_TOKEN not configured"}
+
+     # Process the dataset
+     result = await process_dataset(dataset_id, hf_token)
+
+     # Save to processed log
+     processed = load_processed()
+     processed[dataset_id] = {
+         "pr_url": result.get("pr_url"),
+         "timestamp": datetime.now().isoformat(),
+         "status": result.get("status"),
+         "reason": result.get("reason"),
+         "trigger": "webhook",
+         "event": payload.event.action if payload.event else None,
+     }
+     save_processed(processed)
+
+     return result
+
+
+ if __name__ == "__main__":
+     app.launch()
description_generator.py ADDED
@@ -0,0 +1,173 @@
+ """Generate dataset descriptions using an LLM with a single prompt."""
+
+ import json
+ import re
+
+ from datasets_server import DatasetsServerClient
+ from huggingface_hub import InferenceClient
+
+ DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"
+
+
+ def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
+     """Gather all dataset information upfront from the Datasets Viewer API."""
+     client = DatasetsServerClient(token=hf_token)
+
+     info = {"dataset": dataset}
+
+     # Get validity and splits
+     try:
+         validity = client.is_valid(dataset)
+         info["validity"] = {
+             "viewer": validity.viewer,
+             "preview": validity.preview,
+             "search": validity.search,
+             "filter": validity.filter,
+             "statistics": validity.statistics,
+         }
+     except Exception as e:
+         info["validity_error"] = str(e)
+         return info  # Can't continue without validity
+
+     # Get splits
+     splits = []  # Default so the checks below don't fail if list_splits errors
+     try:
+         splits = client.list_splits(dataset)
+         info["splits"] = [{"config": s.config, "split": s.split} for s in splits]
+
+         size = client.get_size(dataset)
+         info["size"] = size.size.get("dataset", {}) if size.size else {}
+     except Exception as e:
+         info["splits_error"] = str(e)
+
+     # Get features and sample rows
+     if splits:
+         first_split = splits[0]
+         try:
+             preview = client.preview(dataset, first_split.config, first_split.split)
+             info["features"] = preview.features[:10]  # Limit features
+         except Exception as e:
+             info["features_error"] = str(e)
+
+         try:
+             samples = client.sample_rows(
+                 dataset,
+                 first_split.config,
+                 first_split.split,
+                 n_samples=15,
+                 seed=42,
+                 max_requests=10,
+             )
+             # Truncate long values, tracking truncation
+             rows = []
+             truncation_occurred = False
+             for row_data in samples.rows:
+                 row = row_data.get("row", {})
+                 processed = {}
+                 for k, v in row.items():
+                     v_str = str(v)
+                     if len(v_str) > 1200:
+                         processed[k] = (
+                             v_str[:1200]
+                             + f"... [truncated, original {len(v_str)} chars]"
+                         )
+                         truncation_occurred = True
+                     else:
+                         processed[k] = v
+                 rows.append(processed)
+             info["sample_rows"] = rows
+             info["samples_truncated"] = truncation_occurred
+             info["num_rows_total"] = samples.num_rows_total
+         except Exception as e:
+             info["samples_error"] = str(e)
+
+     # Get statistics if available
+     if info.get("validity", {}).get("statistics"):
+         try:
+             first_split = splits[0]
+             stats = client.get_statistics(
+                 dataset, first_split.config, first_split.split
+             )
+             info["statistics"] = stats.statistics  # Pass raw stats to model
+         except Exception as e:
+             info["statistics_error"] = str(e)
+     else:
+         info["statistics"] = "Not available for this dataset"
+
+     return info
+
+
+ def build_prompt(dataset_info: dict) -> str:
+     """Build the prompt with all gathered information."""
+     dataset_id = dataset_info["dataset"]
+
+     # Format the info nicely
+     info_text = json.dumps(dataset_info, indent=2, default=str)
+
+     return f"""Write a description for the HuggingFace dataset '{dataset_id}'.
+
+ Below is information from the Datasets Viewer API:
+ - Dataset metadata (splits, size, features)
+ - A random sample of rows (not the full dataset)
+ - Column statistics (if available)
+
+ DATASETS VIEWER INFO:
+ {info_text}
+
+ Requirements:
+ - 2-4 sentences, concise but complete, suitable for a dataset card
+ - Start with "This dataset..."
+ - Include: what the data contains, size, and structure
+ - For text data, mention the language(s) if evident from samples
+ - Mention the likely domain and ML task if reasonably confident
+ - Note any notable patterns in statistics (e.g., class imbalance)
+ - Use hedging ("appears suitable for", "likely") for inferred purposes
+
+ Important:
+ - Only state facts verifiable from the provided data
+ - Do not guess at licensing, collection methods, or details not shown
+ - The dataset ID may hint at the source or purpose
+
+ Respond with ONLY the description in <description> tags."""
+
+
+ def generate_description(
+     dataset_id: str,
+     hf_token: str,
+     model: str = DEFAULT_MODEL,
+ ) -> str:
+     """Generate a description for a dataset using an LLM.
+
+     Args:
+         dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
+         hf_token: HuggingFace token for API access
+         model: Model to use for generation
+
+     Returns:
+         Generated description string
+     """
+     # Gather dataset information
+     dataset_info = gather_dataset_info(dataset_id, hf_token)
+
+     # Build prompt
+     prompt = build_prompt(dataset_info)
+
+     # Call LLM using InferenceClient
+     client = InferenceClient(token=hf_token)
+
+     response = client.chat_completion(
+         model=model,
+         messages=[{"role": "user", "content": prompt}],
+         max_tokens=2000,
+     )
+
+     final_description = response.choices[0].message.content
+
+     # Extract description from tags if present
+     if final_description:
+         match = re.search(
+             r"<description>\s*(.*?)\s*</description>", final_description, re.DOTALL
+         )
+         if match:
+             final_description = match.group(1).strip()
+
+     return final_description or ""
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio>=4.0
+ huggingface_hub>=0.26
+ datasets-server-py @ git+https://github.com/davanstrien/datasets-server-py.git
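
A quick local smoke test of the generator (no PR is opened), assuming the dependencies above are installed and `HF_TOKEN` is set; the dataset ID is a placeholder:

```python
# Preview a generated description locally without touching the Hub repo.
import os

from description_generator import generate_description

description = generate_description(
    "davanstrien/test-dataset",  # placeholder dataset ID
    hf_token=os.environ["HF_TOKEN"],
)
print(description)
```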