Final_Assignment_Template

Sleeping

App Files Files Community

TommasoBB committed on Mar 4

Commit

ad49360

verified ·

1 Parent(s): c2a3074

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -21

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import os
 import gradio as gr
 from gradio_client import file
 import requests
@@ -16,10 +18,59 @@ from langchain_core.messages import HumanMessage
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Models ---
-# Vision model for image analysis / OCR
-vision_model = ApiModel(model_id="FireRedTeam/FireRed-OCR", max_new_tokens=2048, temperature=0.3)
-math_model = ApiModel(model_id="Qwen/Qwen2.5-Math-1.5B", max_new_tokens=2048, temperature=0.3)
 # Define the agent state
 class AgentState(TypedDict):
     question: str
@@ -129,17 +180,54 @@ Return a JSON object with the following fields:
     "transcribed_text": "All text visible in the image transcribed here."
 }}"""
-    # Multimodal message: the vision model receives both text and image
-    messages = [
-        HumanMessage(content=[
-            {"type": "text", "text": prompt_text},
-            {"type": "image_url", "image_url": {"url": image_data_uri}}
-        ])
-    ]
-    # Use the dedicated vision model (FireRed-OCR) for image analysis
-    response = vision_model.invoke(messages)
-    image_description = response.get("image_description", "")
-    transcribed_text = response.get("transcribed_text", "")
     print(f"Image description: {image_description[:100]}...")
     print(f"Transcribed text: {transcribed_text[:100]}...")
     new_messages = state.get("messages", []) + [
@@ -184,7 +272,7 @@ Return a JSON object with the following field:
 }}"""
     messages = [HumanMessage(content=prompt)]
     response = model.invoke(messages)
-    extracted_info = response.get("extracted_info", "")
     print(f"Extracted file info: {extracted_info[:100]}...")
     new_messages = state.get("messages", []) + [
         {"role": "system", "content": "Read and extract information from the attached file."},
@@ -202,7 +290,7 @@ def handle_math(state: AgentState) -> str:
     print(f"Agent is handling a math problem: {question[:50]}...")
     messages = [HumanMessage(content=f"Solve the following math problem step by step:\n\n{question}")]
     response = math_model.invoke(messages)
-    solution = response.get("solution", "")
     print(f"Math solution: {solution[:100]}...")
     new_messages = state.get("messages", []) + [
         {"role": "system", "content": "Handle the question if classified as a math problem."},
@@ -236,10 +324,9 @@ Context gathered:
 """
     messages = [HumanMessage(content=prompt)]
     # Use the general model for final answer synthesis
-    general_model = ApiModel(model_id="Qwen3.5-35B-A3B", max_new_tokens=2048, temperature=0.3)
-    response = general_model.invoke(messages)
-    raw_response = response.content if hasattr(response, 'content') else str(response)
     # Extract the final answer after "FINAL ANSWER:" if present
     if "FINAL ANSWER:" in raw_response:
         final_answer = raw_response.split("FINAL ANSWER:")[-1].strip()
@@ -299,7 +386,6 @@ class BasicAgent:
         self.image_reader = tools.ImageReaderTool()
         self.web_search = tools.WebSearchTool()
         self.tools = [self.file_reader, self.image_reader, self.web_search]
-        self.vision_model = vision_model  # FireRedTeam/FireRed-OCR for image tasks
         print("Agent initialized.")
     def __call__(self, question: str, task_id: str = "", file_name: str = "") -> str:

 import os
+import base64
+from io import BytesIO
 import gradio as gr
 from gradio_client import file
 import requests
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Models ---
def _build_hf_model(model_name: str) -> HfApiModel:
    """Build an ``HfApiModel`` for *model_name* with the shared generation settings.

    The documented ``model_id`` keyword is tried first. ``repo_id`` is kept
    only as a fallback for constructors that reject ``model_id`` with a
    ``TypeError``.

    Args:
        model_name: Hub identifier of the model to query.

    Returns:
        An ``HfApiModel`` configured with ``max_new_tokens=2048`` and
        ``temperature=0.3``.
    """
    try:
        # NOTE(review): smolagents model constructors forward unknown keyword
        # arguments, so trying ``repo_id=`` first is unlikely to raise and
        # would leave the model id at its default. Hence ``model_id`` first.
        return HfApiModel(model_id=model_name, max_new_tokens=2048, temperature=0.3)
    except TypeError:
        return HfApiModel(repo_id=model_name, max_new_tokens=2048, temperature=0.3)


# Text/math models via smolagents
# NOTE(review): "Qwen3.5-35B-A3B" has no "<org>/" namespace, unlike the math
# model id below -- confirm this is the intended Hub repository id.
model = _build_hf_model("Qwen3.5-35B-A3B")
math_model = _build_hf_model("Qwen/Qwen2.5-Math-1.5B")
# FireRed OCR (Transformers) loaded lazily to avoid startup crashes
_fire_red_model = None
_fire_red_processor = None


def _load_fire_red_ocr():
    """Return the cached FireRed OCR ``(model, processor)`` pair, loading on first use.

    ``torch`` and ``transformers`` are imported inside the function so that
    importing this module does not require them.

    The processor is loaded before the model, and each is cached as soon as
    it loads. If one load raises, a later retry only repeats the piece that
    is still missing instead of reloading both.

    Returns:
        Tuple ``(model, processor)`` loaded from ``"FireRedTeam/FireRed-OCR"``.

    Raises:
        Whatever ``from_pretrained`` raises; nothing is swallowed here.
    """
    global _fire_red_model, _fire_red_processor
    if _fire_red_model is not None and _fire_red_processor is not None:
        return _fire_red_model, _fire_red_processor

    import torch
    from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

    repo_id = "FireRedTeam/FireRed-OCR"

    # Processor first: a failure here surfaces before any model weights load.
    if _fire_red_processor is None:
        _fire_red_processor = AutoProcessor.from_pretrained(repo_id)

    if _fire_red_model is None:
        _fire_red_model = Qwen3VLForConditionalGeneration.from_pretrained(
            repo_id,
            # bfloat16 when CUDA is available, float32 otherwise.
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
        )

    return _fire_red_model, _fire_red_processor
+def _extract_text_from_response(response: Any) -> str:
+    """Normalize model responses into plain text."""
+    if response is None:
+        return ""
+    if isinstance(response, str):
+        return response
+    if isinstance(response, dict):
+        for key in ("content", "answer", "output", "text", "solution", "extracted_info"):
+            if key in response and response[key] is not None:
+                return str(response[key])
+        return str(response)
+    content = getattr(response, "content", None)
+    if content is not None:
+        return str(content)
+    return str(response)
 # Define the agent state
 class AgentState(TypedDict):
     question: str
     "transcribed_text": "All text visible in the image transcribed here."
 }}"""
+    try:
+        # Decode base64 data URI into bytes/PIL image
+        _, b64_data = image_data_uri.split(",", 1)
+        image_bytes = base64.b64decode(b64_data)
+        from PIL import Image
+        image = Image.open(BytesIO(image_bytes)).convert("RGB")
+        ocr_model, ocr_processor = _load_fire_red_ocr()
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": prompt_text},
+                ],
+            }
+        ]
+        text = ocr_processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        inputs = ocr_processor(
+            text=[text],
+            images=[image],
+            return_tensors="pt",
+            padding=True,
+        )
+        inputs = {k: v.to(ocr_model.device) for k, v in inputs.items()}
+        generated_ids = ocr_model.generate(**inputs, max_new_tokens=2048)
+        prompt_len = inputs["input_ids"].shape[1]
+        generated_trimmed = generated_ids[:, prompt_len:]
+        output_text = ocr_processor.batch_decode(
+            generated_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        ocr_text = output_text[0].strip() if output_text else ""
+    except Exception as e:
+        ocr_text = f"OCR error: {e}"
+    image_description = ocr_text
+    transcribed_text = ocr_text
     print(f"Image description: {image_description[:100]}...")
     print(f"Transcribed text: {transcribed_text[:100]}...")
     new_messages = state.get("messages", []) + [
 }}"""
     messages = [HumanMessage(content=prompt)]
     response = model.invoke(messages)
+    extracted_info = _extract_text_from_response(response)
     print(f"Extracted file info: {extracted_info[:100]}...")
     new_messages = state.get("messages", []) + [
         {"role": "system", "content": "Read and extract information from the attached file."},
     print(f"Agent is handling a math problem: {question[:50]}...")
     messages = [HumanMessage(content=f"Solve the following math problem step by step:\n\n{question}")]
     response = math_model.invoke(messages)
+    solution = _extract_text_from_response(response)
     print(f"Math solution: {solution[:100]}...")
     new_messages = state.get("messages", []) + [
         {"role": "system", "content": "Handle the question if classified as a math problem."},
 """
     messages = [HumanMessage(content=prompt)]
     # Use the general model for final answer synthesis
+    response = model.invoke(messages)
+    raw_response = _extract_text_from_response(response)
     # Extract the final answer after "FINAL ANSWER:" if present
     if "FINAL ANSWER:" in raw_response:
         final_answer = raw_response.split("FINAL ANSWER:")[-1].strip()
         self.image_reader = tools.ImageReaderTool()
         self.web_search = tools.WebSearchTool()
         self.tools = [self.file_reader, self.image_reader, self.web_search]
         print("Agent initialized.")
     def __call__(self, question: str, task_id: str = "", file_name: str = "") -> str: