Spaces:

aquibmoin
/

NASA-SMD-SIMILARITY-CHECKER

Sleeping

App Files Community

aquibmoin committed on Feb 25

Commit

faf73ae

verified ·

1 Parent(s): 7d94a09

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -19

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import fitz  # PyMuPDF for extracting text from PDFs
 from transformers import AutoTokenizer, AutoModel
 import torch
 from sklearn.metrics.pairwise import cosine_similarity
 # Load the NASA-specific bi-encoder model and tokenizer
 bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
@@ -18,8 +18,8 @@ def extract_text_from_pdf(pdf_file):
             text += page.get_text()  # Extract text from each page
     return text
-# Function to generate embeddings from the text using the NASA Bi-Encoder
-def generate_embedding(text):
     # Tokenize the text and create input tensors
     inputs = bi_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
@@ -31,14 +31,10 @@ def generate_embedding(text):
     # Mean pooling to get the final embedding for the text
     embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
-    return embedding
-# Function to compute the cosine similarity between two embeddings
-def compute_cosine_similarity(embedding1, embedding2):
-    # Reshape the embeddings and calculate cosine similarity
-    embedding1 = embedding1.reshape(1, -1)
-    embedding2 = embedding2.reshape(1, -1)
-    return cosine_similarity(embedding1, embedding2)[0][0]
 # Function to handle the full workflow: extract text, generate embeddings, and compute similarity
 def compare_pdfs(pdf1, pdf2):
@@ -46,20 +42,31 @@ def compare_pdfs(pdf1, pdf2):
     text1 = extract_text_from_pdf(pdf1)
     text2 = extract_text_from_pdf(pdf2)
-    # Generate embeddings for both texts using the NASA Bi-Encoder
-    embedding1 = generate_embedding(text1)
-    embedding2 = generate_embedding(text2)
     # Compute cosine similarity between the two embeddings
     similarity_score = compute_cosine_similarity(embedding1, embedding2)
-    # Return the similarity score
-    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"
-# Gradio interface: accept two PDF files and output cosine similarity score
 inputs = [gr.File(label="Upload Human SCDD"), gr.File(label="Upload AI SCDD")]
-outputs = gr.Textbox(label="Cosine Similarity Score")
 # Set up the Gradio interface
 gr.Interface(fn=compare_pdfs, inputs=inputs, outputs=outputs, title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder").launch()

 from transformers import AutoTokenizer, AutoModel
 import torch
 from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 # Load the NASA-specific bi-encoder model and tokenizer
 bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
             text += page.get_text()  # Extract text from each page
     return text
+# Function to generate embeddings and return dimensions
+def generate_embedding_with_dim(text):
     # Tokenize the text and create input tensors
     inputs = bi_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
     # Mean pooling to get the final embedding for the text
     embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+    # Get the number of dimensions
+    embedding_dim = embedding.shape[0]
+    return embedding, f"Embedding Dimensions: {embedding_dim}"
 # Function to handle the full workflow: extract text, generate embeddings, and compute similarity
 def compare_pdfs(pdf1, pdf2):
     text1 = extract_text_from_pdf(pdf1)
     text2 = extract_text_from_pdf(pdf2)
+    # Generate embeddings and get their dimensions
+    embedding1, dim1 = generate_embedding_with_dim(text1)
+    embedding2, dim2 = generate_embedding_with_dim(text2)
     # Compute cosine similarity between the two embeddings
     similarity_score = compute_cosine_similarity(embedding1, embedding2)
+    # Return similarity score + embedding dimensions
+    return f"The cosine similarity between the two PDFs is: {similarity_score:.4f}", dim1, dim2
+# Function to compute the cosine similarity between two embeddings
+def compute_cosine_similarity(embedding1, embedding2):
+    embedding1 = embedding1.reshape(1, -1)
+    embedding2 = embedding2.reshape(1, -1)
+    return cosine_similarity(embedding1, embedding2)[0][0]
+# Gradio interface: accept two PDFs, show similarity + embedding dimensions
 inputs = [gr.File(label="Upload Human SCDD"), gr.File(label="Upload AI SCDD")]
+outputs = [
+    gr.Textbox(label="Cosine Similarity Score"),
+    gr.Textbox(label="Embedding Dimensions (PDF 1)"),
+    gr.Textbox(label="Embedding Dimensions (PDF 2)")
+]
 # Set up the Gradio interface
 gr.Interface(fn=compare_pdfs, inputs=inputs, outputs=outputs, title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder").launch()