# dataset-card-drafter / description_generator.py
# Uploaded by davanstrien (HF Staff)
# Commit 763c57d: Add pending queue for datasets where viewer isn't ready
"""Generate dataset descriptions using an LLM with a single prompt."""
import json
import re
from datasets_server import DatasetsServerClient
from huggingface_hub import InferenceClient
# Default model id; presumably the "model:provider" routing form accepted by
# InferenceClient.chat_completion — TODO confirm the ":zai-org" provider suffix.
DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"
class ViewerNotReadyError(Exception):
    """Raised when the Datasets Viewer hasn't processed a dataset yet."""
def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
    """Gather all dataset information upfront from the Datasets Viewer API.

    Args:
        dataset: HuggingFace dataset ID (e.g. 'username/dataset').
        hf_token: Optional HuggingFace token (needed for gated/private data).

    Returns:
        A dict of everything gathered. Partial failures are recorded under
        ``*_error`` keys instead of raising, so the prompt builder can still
        work with whatever was retrieved.

    Raises:
        ViewerNotReadyError: If the dataset preview is not available yet.
    """
    client = DatasetsServerClient(token=hf_token)
    info = {"dataset": dataset}

    # Get validity and splits
    try:
        validity = client.is_valid(dataset)
        info["validity"] = {
            "viewer": validity.viewer,
            "preview": validity.preview,
            "search": validity.search,
            "filter": validity.filter,
            "statistics": validity.statistics,
        }
        # Check if preview is ready - we need it to get sample rows
        if not validity.preview:
            raise ViewerNotReadyError(
                f"Dataset viewer not ready for '{dataset}'. "
                "The dataset may be new or still processing."
            )
    except ViewerNotReadyError:
        raise  # Re-raise our custom exception
    except Exception as e:
        info["validity_error"] = str(e)
        return info  # Can't continue without validity

    # Get splits. Bind `splits` before the try so a failure in list_splits()
    # doesn't leave it unbound (the original code hit a NameError below).
    splits = []
    try:
        splits = client.list_splits(dataset)
        info["splits"] = [{"config": s.config, "split": s.split} for s in splits]
        size = client.get_size(dataset)
        info["size"] = size.size.get("dataset", {}) if size.size else {}
    except Exception as e:
        info["splits_error"] = str(e)

    # Get features, sample rows, and statistics from the first split only
    if splits:
        first_split = splits[0]
        try:
            preview = client.preview(dataset, first_split.config, first_split.split)
            info["features"] = preview.features[:10]  # Limit features
        except Exception as e:
            info["features_error"] = str(e)

        try:
            samples = client.sample_rows(
                dataset,
                first_split.config,
                first_split.split,
                n_samples=15,
                seed=42,
                max_requests=10,
            )
            # Truncate long values, tracking truncation so the LLM can be
            # told the samples are not complete.
            rows = []
            truncation_occurred = False
            for row_data in samples.rows:
                row = row_data.get("row", {})
                processed = {}
                for k, v in row.items():
                    v_str = str(v)
                    if len(v_str) > 1200:
                        processed[k] = (
                            v_str[:1200]
                            + f"... [truncated, original {len(v_str)} chars]"
                        )
                        truncation_occurred = True
                    else:
                        processed[k] = v
                rows.append(processed)
            info["sample_rows"] = rows
            info["samples_truncated"] = truncation_occurred
            info["num_rows_total"] = samples.num_rows_total
        except Exception as e:
            info["samples_error"] = str(e)

        # Get statistics if available (guarded by the splits check above so
        # splits[0] is never taken on an empty list)
        if info.get("validity", {}).get("statistics"):
            try:
                stats = client.get_statistics(
                    dataset, first_split.config, first_split.split
                )
                info["statistics"] = stats.statistics  # Pass raw stats to model
            except Exception as e:
                info["statistics_error"] = str(e)
        else:
            info["statistics"] = "Not available for this dataset"

    return info
def build_prompt(dataset_info: dict) -> str:
    """Build the prompt with all gathered information."""
    # Serialize everything we gathered; default=str keeps non-JSON values
    # (e.g. feature objects) from breaking the dump.
    info_json = json.dumps(dataset_info, indent=2, default=str)

    template = """Write a description for the HuggingFace dataset '{dataset_id}'.
Below is information from the Datasets Viewer API:
- Dataset metadata (splits, size, features)
- A random sample of rows (not the full dataset)
- Column statistics (if available)
DATASETS VIEWER INFO:
{info_text}
Requirements:
- 2-4 sentences, concise but complete, suitable for a dataset card
- Start with "This dataset..."
- Include: what the data contains, size, and structure
- For text data, mention the language(s) if evident from samples
- Mention the likely domain and ML task if reasonably confident
- Note any notable patterns in statistics (e.g., class imbalance)
- Use hedging ("appears suitable for", "likely") for inferred purposes
Important:
- Only state facts verifiable from the provided data
- Do not guess at licensing, collection methods, or details not shown
- The dataset ID may hint at the source or purpose
Respond with ONLY the description in <description> tags."""

    return template.format(
        dataset_id=dataset_info["dataset"],
        info_text=info_json,
    )
def generate_description(
    dataset_id: str,
    hf_token: str,
    model: str = DEFAULT_MODEL,
) -> str:
    """Generate a description for a dataset using an LLM.

    Args:
        dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
        hf_token: HuggingFace token for API access
        model: Model to use for generation

    Returns:
        Generated description string
    """
    # Collect viewer metadata and fold it into a single prompt.
    prompt = build_prompt(gather_dataset_info(dataset_id, hf_token))

    # Ask the model for the description via the chat-completion API.
    llm = InferenceClient(token=hf_token)
    reply = llm.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2000,
    )
    raw = reply.choices[0].message.content

    if not raw:
        return ""

    # Prefer the content inside <description>...</description> when the model
    # followed the instructions; otherwise fall back to the full reply.
    tagged = re.search(r"<description>\s*(.*?)\s*</description>", raw, re.DOTALL)
    return tagged.group(1).strip() if tagged else raw