Upload semdupe.py with huggingface_hub
semdupe.py +285 -0
semdupe.py
ADDED
@@ -0,0 +1,285 @@
#!/usr/bin/env python3
"""
ShareGPT Semantic Deduplication Script using SemHash

This script deduplicates ShareGPT format JSONL files using semantic similarity.
It extracts conversation content and removes semantically similar conversations.

Usage:
    python semdupe.py input.jsonl output_dir [options]

Example ShareGPT format:
    {"conversations": [{"from": "human", "value": "Hello"}, {"from": "gpt", "value": "Hi there!"}]}
"""

import argparse
import json
import os
import sys
from pathlib import Path
from typing import List, Dict, Any
import logging

try:
    from semhash import SemHash
except ImportError:
    print("Error: SemHash not installed. Please run: pip install semhash")
    sys.exit(1)


def setup_logging(verbose: bool = False):
    """Set up logging configuration."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )


def extract_conversation_text(conversation: List[Dict[str, str]],
                              mode: str = "full") -> str:
    """
    Extract text from a conversation for deduplication.

    Args:
        conversation: List of conversation turns
        mode: How to extract text ("full", "human_only", "assistant_only", "first_turn")

    Returns:
        Extracted text string
    """
    texts = []

    for turn in conversation:
        from_role = turn.get("from", "")
        value = turn.get("value", "")

        if mode == "full":
            texts.append(f"{from_role}: {value}")
        elif mode == "human_only" and from_role in ["human", "user"]:
            texts.append(value)
        elif mode == "assistant_only" and from_role in ["gpt", "assistant"]:
            texts.append(value)
        elif mode == "first_turn" and len(texts) == 0:
            texts.append(value)

    return " ".join(texts).strip()


def load_sharegpt_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """Load a ShareGPT JSONL file."""
    conversations = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
                conversations.append(data)
            except json.JSONDecodeError as e:
                logging.warning(f"Skipping invalid JSON on line {line_num}: {e}")
                continue

    logging.info(f"Loaded {len(conversations)} conversations from {file_path}")
    return conversations


def save_conversations(conversations: List[Dict[str, Any]],
                       output_path: str):
    """Save conversations to a JSONL file."""
    with open(output_path, 'w', encoding='utf-8') as f:
        for conv in conversations:
            f.write(json.dumps(conv, ensure_ascii=False) + '\n')


def main():
    parser = argparse.ArgumentParser(
        description="Semantically deduplicate ShareGPT JSONL files using SemHash",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic deduplication
  python semdupe.py input.jsonl output_dir

  # Only consider human messages for similarity
  python semdupe.py input.jsonl output_dir --mode human_only

  # Use a custom similarity threshold
  python semdupe.py input.jsonl output_dir --threshold 0.8

  # Enable verbose logging
  python semdupe.py input.jsonl output_dir --verbose
"""
    )

    parser.add_argument("input_file", help="Input ShareGPT JSONL file")
    parser.add_argument("output_dir", help="Output directory for deduplicated files")

    parser.add_argument("--mode", choices=["full", "human_only", "assistant_only", "first_turn"],
                        default="full", help="Text extraction mode (default: full)")

    parser.add_argument("--threshold", type=float, default=0.85,
                        help="Similarity threshold for deduplication (default: 0.85)")

    parser.add_argument("--min-length", type=int, default=10,
                        help="Minimum text length to consider (default: 10)")

    parser.add_argument("--max-conversations", type=int,
                        help="Maximum number of conversations to process")

    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Enable verbose logging")

    parser.add_argument("--save-explanations", action="store_true",
                        help="Save explanation file showing why items were removed")

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    # Validate input file
    if not os.path.exists(args.input_file):
        logging.error(f"Input file not found: {args.input_file}")
        sys.exit(1)

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load conversations
    logging.info("Loading conversations...")
    conversations = load_sharegpt_jsonl(args.input_file)

    if args.max_conversations:
        conversations = conversations[:args.max_conversations]
        logging.info(f"Limited to {len(conversations)} conversations")

    # Extract text for deduplication
    logging.info(f"Extracting text using mode: {args.mode}")
    texts = []
    valid_indices = []

    for i, conv in enumerate(conversations):
        if "conversations" not in conv:
            logging.warning(f"Skipping conversation {i}: missing 'conversations' field")
            continue

        text = extract_conversation_text(conv["conversations"], args.mode)

        if len(text) < args.min_length:
            logging.debug(f"Skipping conversation {i}: text too short ({len(text)} chars)")
            continue

        texts.append(text)
        valid_indices.append(i)

    logging.info(f"Prepared {len(texts)} conversations for deduplication")

    if not texts:
        logging.error("No valid conversations found for deduplication")
        sys.exit(1)

    # Perform deduplication
    logging.info("Initializing SemHash...")
    try:
        semhash = SemHash.from_records(records=texts)
        logging.info("Performing self-deduplication...")
        result = semhash.self_deduplicate(threshold=args.threshold)

        # Get deduplicated texts
        deduplicated_texts = result.selected

        # Find which original conversations correspond to the deduplicated texts
        deduplicated_conversations = []
        deduplicated_indices = []

        for dedup_text in deduplicated_texts:
            for i, original_text in enumerate(texts):
                if original_text == dedup_text:
                    deduplicated_conversations.append(conversations[valid_indices[i]])
                    deduplicated_indices.append(i)
                    break

        logging.info("Deduplication complete:")
        logging.info(f"  Original: {len(texts)} conversations")
        logging.info(f"  Deduplicated: {len(deduplicated_conversations)} conversations")
        logging.info(f"  Removed: {len(texts) - len(deduplicated_conversations)} conversations")
        logging.info(f"  Reduction: {((len(texts) - len(deduplicated_conversations)) / len(texts) * 100):.1f}%")

    except Exception as e:
        logging.error(f"Deduplication failed: {e}")
        sys.exit(1)

    # Save results
    output_file = output_dir / "deduplicated.jsonl"
    logging.info(f"Saving deduplicated conversations to {output_file}")
    save_conversations(deduplicated_conversations, str(output_file))

    # Save statistics
    stats_file = output_dir / "deduplication_stats.json"
    stats = {
        "input_file": args.input_file,
        "mode": args.mode,
        "threshold": args.threshold,
        "min_length": args.min_length,
        "original_count": len(conversations),
        "valid_count": len(texts),
        "deduplicated_count": len(deduplicated_conversations),
        "removed_count": len(texts) - len(deduplicated_conversations),
        "reduction_percentage": ((len(texts) - len(deduplicated_conversations)) / len(texts) * 100) if texts else 0,
        "duplicate_ratio": result.duplicate_ratio if hasattr(result, 'duplicate_ratio') else None,
        "exact_duplicate_ratio": result.exact_duplicate_ratio if hasattr(result, 'exact_duplicate_ratio') else None
    }

    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    logging.info(f"Statistics saved to {stats_file}")

    # Save explanations if requested
    if args.save_explanations:
        explanations_file = output_dir / "explanations.txt"
        logging.info(f"Saving explanations to {explanations_file}")

        try:
            # Get removed/duplicate texts from the result object
            removed_texts = result.filtered if hasattr(result, 'filtered') else []

            with open(explanations_file, 'w', encoding='utf-8') as f:
                f.write("Deduplication Explanations\n")
                f.write("=" * 50 + "\n\n")
                f.write(f"Threshold: {args.threshold}\n")
                f.write(f"Mode: {args.mode}\n")
                f.write(f"Duplicate ratio: {result.duplicate_ratio if hasattr(result, 'duplicate_ratio') else 'N/A'}\n")
                f.write(f"Exact duplicate ratio: {result.exact_duplicate_ratio if hasattr(result, 'exact_duplicate_ratio') else 'N/A'}\n\n")

                if removed_texts:
                    f.write(f"Removed {len(removed_texts)} conversations:\n\n")
                    for i, removed_text in enumerate(removed_texts):
                        # Find the original conversation index for this removed text
                        original_idx = None
                        for j, original_text in enumerate(texts):
                            if original_text == removed_text:
                                original_idx = valid_indices[j]
                                break

                        f.write(f"Removed conversation {original_idx if original_idx is not None else i}:\n")
                        f.write(f"Text: {removed_text[:200]}{'...' if len(removed_text) > 200 else ''}\n")
                        f.write("Reason: Semantically similar to retained conversation\n\n")
                else:
                    f.write("No specific removal information available from SemHash result.\n")
                    f.write("Use result.get_least_similar_from_duplicates() for threshold tuning.\n")

        except Exception as e:
            logging.warning(f"Could not save explanations: {e}")

    logging.info("Deduplication completed successfully!")


if __name__ == "__main__":
    main()
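
The core of the script is the three SemHash calls it makes: SemHash.from_records on the extracted conversation texts, self_deduplicate with the chosen threshold, and reading the surviving records back from result.selected. The minimal sketch below (not part of the committed file) mirrors that flow in isolation; the record strings are illustrative only, and the 0.85 threshold simply echoes the script's default.

# Minimal sketch of the SemHash flow used by semdupe.py (illustrative records).
from semhash import SemHash

records = [
    "human: What is the capital of France? gpt: Paris.",
    "human: Tell me the capital city of France. gpt: It is Paris.",
    "human: How do I reverse a list in Python? gpt: Use list[::-1].",
]

# Build the index over the records, then remove near-duplicates within the same set.
semhash = SemHash.from_records(records=records)
result = semhash.self_deduplicate(threshold=0.85)

# result.selected holds the records kept after semantic deduplication;
# semdupe.py maps these back to the original conversations by text equality.
print(result.selected)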