|
|
"""Generate dataset descriptions using an LLM with a single prompt.""" |
|
|
|
|
|
import json |
|
|
import re |
|
|
|
|
|
from datasets_server import DatasetsServerClient |
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
# Default chat-completion model. NOTE(review): the ":zai-org" suffix appears
# to be the "model:provider" routing syntax for HF Inference Providers —
# confirm the provider tag is intended here.
DEFAULT_MODEL = "zai-org/GLM-4.6V:zai-org"
|
|
|
|
|
|
|
|
class ViewerNotReadyError(Exception):
    """Raised when the Datasets Viewer hasn't processed a dataset yet.

    Signals to callers that the dataset may be new or still being
    processed, so the operation can be retried later.
    """
    # No body needed: the docstring alone is a valid class body, so the
    # redundant `pass` was removed.
|
|
|
|
|
|
|
|
def gather_dataset_info(dataset: str, hf_token: str | None = None) -> dict:
    """Gather all dataset information upfront from the Datasets Viewer API.

    Collects validity flags, split/size metadata, a truncated random row
    sample, and column statistics (when available) into one dict that is
    later serialized into the LLM prompt. Each section degrades gracefully:
    a failure is recorded under a ``<section>_error`` key instead of
    aborting the whole gather.

    Args:
        dataset: HuggingFace dataset ID (e.g. 'username/dataset').
        hf_token: Optional HuggingFace token for gated/private datasets.

    Returns:
        Dict of gathered sections keyed by name ('validity', 'splits',
        'size', 'features', 'sample_rows', 'statistics', ...).

    Raises:
        ViewerNotReadyError: If the dataset preview is not available yet.
    """
    client = DatasetsServerClient(token=hf_token)

    info = {"dataset": dataset}

    try:
        validity = client.is_valid(dataset)
        info["validity"] = {
            "viewer": validity.viewer,
            "preview": validity.preview,
            "search": validity.search,
            "filter": validity.filter,
            "statistics": validity.statistics,
        }

        # Without a preview there is nothing to sample; surface this as a
        # distinct, retryable condition rather than a generic error.
        if not validity.preview:
            raise ViewerNotReadyError(
                f"Dataset viewer not ready for '{dataset}'. "
                "The dataset may be new or still processing."
            )
    except ViewerNotReadyError:
        raise
    except Exception as e:
        # Validity is a prerequisite for everything else: record and stop.
        info["validity_error"] = str(e)
        return info

    # BUG FIX: initialize `splits` so a list_splits() failure below cannot
    # leave the name unbound — the original code raised NameError at
    # `if splits:` whenever the splits request failed.
    splits = []
    try:
        splits = client.list_splits(dataset)
        info["splits"] = [{"config": s.config, "split": s.split} for s in splits]

        size = client.get_size(dataset)
        info["size"] = size.size.get("dataset", {}) if size.size else {}
    except Exception as e:
        info["splits_error"] = str(e)

    if splits:
        first_split = splits[0]
        try:
            preview = client.preview(dataset, first_split.config, first_split.split)
            # Cap at 10 features to keep the prompt size bounded.
            info["features"] = preview.features[:10]
        except Exception as e:
            info["features_error"] = str(e)

        try:
            samples = client.sample_rows(
                dataset,
                first_split.config,
                first_split.split,
                n_samples=15,
                seed=42,  # fixed seed keeps the sample reproducible
                max_requests=10,
            )

            rows = []
            truncation_occurred = False
            for row_data in samples.rows:
                row = row_data.get("row", {})
                processed = {}
                for k, v in row.items():
                    v_str = str(v)
                    # Truncate long cell values so a single large field
                    # (e.g. a full document) cannot blow up the prompt.
                    if len(v_str) > 1200:
                        processed[k] = (
                            v_str[:1200]
                            + f"... [truncated, original {len(v_str)} chars]"
                        )
                        truncation_occurred = True
                    else:
                        processed[k] = v
                rows.append(processed)
            info["sample_rows"] = rows
            info["samples_truncated"] = truncation_occurred
            info["num_rows_total"] = samples.num_rows_total
        except Exception as e:
            info["samples_error"] = str(e)

    # Only query the statistics endpoint when the validity check reported
    # that this dataset supports it.
    if info.get("validity", {}).get("statistics"):
        try:
            first_split = splits[0]
            stats = client.get_statistics(
                dataset, first_split.config, first_split.split
            )
            info["statistics"] = stats.statistics
        except Exception as e:
            info["statistics_error"] = str(e)
    else:
        info["statistics"] = "Not available for this dataset"

    return info
|
|
|
|
|
|
|
|
def build_prompt(dataset_info: dict) -> str:
    """Render the single LLM prompt from the gathered dataset info.

    The entire info dict is serialized as pretty-printed JSON and embedded
    in the prompt, together with instructions and output-format rules.
    """
    # default=str keeps serialization from failing on non-JSON values
    # returned by the viewer client (dates, custom objects, ...).
    info_json = json.dumps(dataset_info, indent=2, default=str)

    return f"""Write a description for the HuggingFace dataset '{dataset_info["dataset"]}'.

Below is information from the Datasets Viewer API:
- Dataset metadata (splits, size, features)
- A random sample of rows (not the full dataset)
- Column statistics (if available)

DATASETS VIEWER INFO:
{info_json}

Requirements:
- 2-4 sentences, concise but complete, suitable for a dataset card
- Start with "This dataset..."
- Include: what the data contains, size, and structure
- For text data, mention the language(s) if evident from samples
- Mention the likely domain and ML task if reasonably confident
- Note any notable patterns in statistics (e.g., class imbalance)
- Use hedging ("appears suitable for", "likely") for inferred purposes

Important:
- Only state facts verifiable from the provided data
- Do not guess at licensing, collection methods, or details not shown
- The dataset ID may hint at the source or purpose

Respond with ONLY the description in <description> tags."""
|
|
|
|
|
|
|
|
def generate_description(
    dataset_id: str,
    hf_token: str,
    model: str = DEFAULT_MODEL,
) -> str:
    """Generate a description for a dataset using LLM.

    Gathers viewer info, builds a single prompt, sends it to the chat
    model, and extracts the text inside <description> tags when present.

    Args:
        dataset_id: HuggingFace dataset ID (e.g., 'username/dataset')
        hf_token: HuggingFace token for API access
        model: Model to use for generation

    Returns:
        Generated description string (empty if the model returned nothing)
    """
    info = gather_dataset_info(dataset_id, hf_token)

    llm = InferenceClient(token=hf_token)

    reply = llm.chat_completion(
        model=model,
        messages=[{"role": "user", "content": build_prompt(info)}],
        max_tokens=2000,
    )

    # The API may return None for content; normalize to "" up front.
    text = reply.choices[0].message.content or ""

    # Prefer the tagged span when the model followed the output format;
    # otherwise fall back to the raw completion text.
    tagged = re.search(r"<description>\s*(.*?)\s*</description>", text, re.DOTALL)
    return tagged.group(1).strip() if tagged else text
|
|
|