|
|
"""Generate dataset descriptions using an LLM with a single prompt.""" |
|
|
|
|
|
import json |
|
|
import re |
|
|
|
|
|
from datasets_server import DatasetsServerClient |
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
# Default chat-completion model. NOTE(review): the ":zai-org" suffix appears
# to be the "model:provider" routing syntax for HF Inference Providers —
# confirm the provider tag is intended here.
DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"
|
|
|
|
|
|
|
|
class ViewerNotReadyError(Exception):
    """Raised when the Datasets Viewer hasn't processed a dataset yet.

    Signals to callers that the dataset may be new or still being
    processed, so the operation can be retried later.
    """
    # No body needed: the docstring alone is a valid class body, so the
    # redundant `pass` was removed.
|
|
|
|
|
|
|
|
def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
    """Gather all dataset information upfront from the Datasets Viewer API.

    Collects validity flags, split/size metadata, a truncated random row
    sample, and column statistics (when available) into one dict that is
    later serialized into the LLM prompt. Each section degrades gracefully:
    a failure is recorded under a ``<section>_error`` key instead of
    aborting the whole gather.

    Args:
        dataset: HuggingFace dataset ID (e.g. 'username/dataset').
        hf_token: Optional HuggingFace token for gated/private datasets.

    Returns:
        Dict of gathered sections keyed by name ('validity', 'splits',
        'size', 'features', 'sample_rows', 'statistics', ...).

    Raises:
        ViewerNotReadyError: If the dataset preview is not available yet.
    """
    client = DatasetsServerClient(token=hf_token)

    info = {"dataset": dataset}

    try:
        validity = client.is_valid(dataset)
        info["validity"] = {
            "viewer": validity.viewer,
            "preview": validity.preview,
            "search": validity.search,
            "filter": validity.filter,
            "statistics": validity.statistics,
        }

        # Without a preview there is nothing to sample; surface this as a
        # distinct, retryable condition rather than a generic error.
        if not validity.preview:
            raise ViewerNotReadyError(
                f"Dataset viewer not ready for '{dataset}'. "
                "The dataset may be new or still processing."
            )
    except ViewerNotReadyError:
        raise
    except Exception as e:
        # Validity is a prerequisite for everything else: record and stop.
        info["validity_error"] = str(e)
        return info

    # BUG FIX: initialize `splits` so a list_splits() failure below cannot
    # leave the name unbound — the original code raised NameError at
    # `if splits:` whenever the splits request failed.
    splits = []
    try:
        splits = client.list_splits(dataset)
        info["splits"] = [{"config": s.config, "split": s.split} for s in splits]

        size = client.get_size(dataset)
        info["size"] = size.size.get("dataset", {}) if size.size else {}
    except Exception as e:
        info["splits_error"] = str(e)

    if splits:
        first_split = splits[0]
        try:
            preview = client.preview(dataset, first_split.config, first_split.split)
            # Cap at 10 features to keep the prompt size bounded.
            info["features"] = preview.features[:10]
        except Exception as e:
            info["features_error"] = str(e)

        try:
            samples = client.sample_rows(
                dataset,
                first_split.config,
                first_split.split,
                n_samples=15,
                seed=42,  # fixed seed keeps the sample reproducible
                max_requests=10,
            )

            rows = []
            truncation_occurred = False
            for row_data in samples.rows:
                row = row_data.get("row", {})
                processed = {}
                for k, v in row.items():
                    v_str = str(v)
                    # Truncate long cell values so a single large field
                    # (e.g. a full document) cannot blow up the prompt.
                    if len(v_str) > 1200:
                        processed[k] = (
                            v_str[:1200]
                            + f"... [truncated, original {len(v_str)} chars]"
                        )
                        truncation_occurred = True
                    else:
                        processed[k] = v
                rows.append(processed)
            info["sample_rows"] = rows
            info["samples_truncated"] = truncation_occurred
            info["num_rows_total"] = samples.num_rows_total
        except Exception as e:
            info["samples_error"] = str(e)

    # Only query the statistics endpoint when the validity check reported
    # that this dataset supports it.
    if info.get("validity", {}).get("statistics"):
        try:
            first_split = splits[0]
            stats = client.get_statistics(
                dataset, first_split.config, first_split.split
            )
            info["statistics"] = stats.statistics
        except Exception as e:
            info["statistics_error"] = str(e)
    else:
        info["statistics"] = "Not available for this dataset"

    return info
|
|
|
|
|
|
|
|
def build_prompt(dataset_info: dict) -> str:
    """Render the single LLM prompt from the gathered dataset info.

    The entire info dict is serialized as pretty-printed JSON and embedded
    in the prompt, together with instructions and output-format rules.
    """
    # default=str keeps serialization from failing on non-JSON values
    # returned by the viewer client (dates, custom objects, ...).
    info_json = json.dumps(dataset_info, indent=2, default=str)

    return f"""Write a description for the HuggingFace dataset '{dataset_info["dataset"]}'.

Below is information from the Datasets Viewer API:
- Dataset metadata (splits, size, features)
- A random sample of rows (not the full dataset)
- Column statistics (if available)

DATASETS VIEWER INFO:
{info_json}

Requirements:
- 2-4 sentences, concise but complete, suitable for a dataset card
- Start with "This dataset..."
- Include: what the data contains, size, and structure
- For text data, mention the language(s) if evident from samples
- Mention the likely domain and ML task if reasonably confident
- Note any notable patterns in statistics (e.g., class imbalance)
- Use hedging ("appears suitable for", "likely") for inferred purposes

Important:
- Only state facts verifiable from the provided data
- Do not guess at licensing, collection methods, or details not shown
- The dataset ID may hint at the source or purpose

Respond with ONLY the description in <description> tags."""
|
|
|
|
|
|
|
|
def generate_description(
    dataset_id: str,
    hf_token: str,
    model: str = DEFAULT_MODEL,
) -> str:
    """Generate a description for a dataset using LLM.

    Gathers viewer info, builds a single prompt, sends it to the chat
    model, and extracts the text inside <description> tags when present.

    Args:
        dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
        hf_token: HuggingFace token for API access
        model: Model to use for generation

    Returns:
        Generated description string (empty if the model returned nothing)
    """
    info = gather_dataset_info(dataset_id, hf_token)

    llm = InferenceClient(token=hf_token)

    reply = llm.chat_completion(
        model=model,
        messages=[{"role": "user", "content": build_prompt(info)}],
        max_tokens=2000,
    )

    # The API may return None for content; normalize to "" up front.
    text = reply.choices[0].message.content or ""

    # Prefer the tagged span when the model followed the output format;
    # otherwise fall back to the raw completion text.
    tagged = re.search(r"<description>\s*(.*?)\s*</description>", text, re.DOTALL)
    return tagged.group(1).strip() if tagged else text
|
|
|