""" Test Qdrant search with real queries. This is your first end-to-end retrieval test. """ from src.utils.logger import get_logger, setup_logger from src.vectorstore.qdrant_store import QdrantStore from src.embeddings.embedding_model import EmbeddingModel setup_logger() logger = get_logger(__name__) def search_and_display(store: QdrantStore, model: EmbeddingModel, query: str, top_k: int = 3): """Run a search query and display results clearly.""" print(f"\n{'=' * 60}") print(f"QUERY: {query}") print(f"{'=' * 60}") # Embed the query (with BGE prefix) query_vector = model.embed_query(query) # Search Qdrant results = store.search(query_vector, top_k = top_k) if not results: print(f"No results found.") return for i, r in enumerate(results): print(f"\n[{i+1}] Score: {r['score']:.4f}") print(f" Paper: {r.get('paper_id', 'N/A')}") print(f" Title: {r.get('title', 'N/A')[:65]}...") print(f" Date: {r.get('published_date', 'N/A')}") print(f" Category: {r.get('primary_category', 'N/A')}") print(f" Chunk {r.get('chunk_index','?')}/{r.get('total_chunks','?')}") print(f" Text preview: {r.get('text','')[:150].replace(chr(10),' ')}...") def main(): logger.info("Loading model and connecting to Qdrant...") store = QdrantStore() model = EmbeddingModel() # Verify collection exists info = store.get_collection_info() logger.info(f"Collection info: {info}") if info.get("points_count", 0) == 0: logger.error("Collection is empty. Run run_indexing.py first.") return # --- Test queries covering different retrieval scenarios --- # Test 1: Conceptual Query search_and_display(store, model, "how does self-attention mechanism work in transformers", top_k=3 ) # Test 2: Task-specific query search_and_display(store, model, "reinforcement learning for multi-agent systems", top_k=3 ) # Test 3: Method comparison query search_and_display(store, model, "comparison of fine-tuning methods for large language models", top_k=3 ) # Test 4: with metadata filter - only cs.LG papers print(f"\n{'='*60}") print("FILTERED QUERY: 'neural network optimization' (cs.LG only)") print(f"{'='*60}") query_vector = model.embed_query("neural network optimization methods") results = store.search( query_vector, top_k = 3, filter_category = "cs.LG" ) for i, r in enumerate(results): print(f"[{i+1}] {r['score']:.4f} | {r.get('primary_category')} | {r.get('title','')[:55]}...") logger.info("\n✅ Search test complete.") if __name__ == "__main__": main()