import gradio as gr
from PIL import Image
from sglang import Engine
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

# --- Configuration ---
CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"

# --- Model and Processor Loading ---
# Note: this is a heavy operation and runs once when the Space starts.
processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)

# SGLang engine setup. A GPU is strongly recommended: on a CPU Space this
# will be extremely slow, and the "fa3" (FlashAttention-3) attention backend
# requires a recent NVIDIA GPU.
llm_engine = Engine(
    model_path=CHECKPOINT_PATH,
    enable_multimodal=True,
    mem_fraction_static=0.8,  # fraction of GPU memory reserved for weights and KV cache
    tp_size=1,                # tensor-parallel degree; 1 for a single GPU
    attention_backend="fa3",
)

# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """Process the image and text prompt and generate a response from the model."""
    if image_input is None or not (text_prompt or "").strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's image input (a numpy array) to a PIL Image.
    pil_image = Image.fromarray(image_input)

    # Build the chat messages payload expected by the Qwen3-VL chat template.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    # Render the prompt text and extract the image inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, _ = process_vision_info(
        messages, image_patch_size=processor.image_processor.patch_size
    )

    sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}

    try:
        response = llm_engine.generate(
            prompt=text,
            image_data=image_inputs,
            sampling_params=sampling_params,
        )
        return response["text"]
    except Exception as e:
        return f"An error occurred during generation: {e}"

# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct-FP8 Demo
        This Space demonstrates the Qwen3-VL-2B-Instruct-FP8 model.
        Upload an image, type a question or a command, and see the model's response.

        **Note:** This demo is intended for a GPU Space; on CPU hardware it will be very slow.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(
                label="Prompt", placeholder="e.g., Describe this image in detail."
            )
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )

    gr.Examples(
        examples=[
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read all the text in the image."],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is in the red box?"],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
        ],
        inputs=[image_input, text_prompt],
    )

if __name__ == "__main__":
    demo.launch()
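
# --- Programmatic usage (illustrative) ---
# A minimal sketch of calling this app from Python with gradio_client, assuming
# the demo is reachable at http://127.0.0.1:7860 (a deployed Space name would
# work the same way). The api_name matches the click handler's function name,
# which is Gradio's default route naming.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(
#       handle_file("https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png"),
#       "Read all the text in the image.",
#       api_name="/process_and_generate",
#   )
#   print(result)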