broadfield-dev's picture
Create app.py
c2ef06d verified
raw
history blame
3.67 kB
import multiprocessing
from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from sglang import Engine
from transformers import AutoProcessor
# --- Configuration ---
# Checkpoint identifier shared by the processor and the inference engine.
CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"

# --- Model and Processor Loading ---
# Note: This is a heavy operation and will be done once when the Space starts.
processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)

# SGLang Engine setup for GPU
# For a CPU space, this will be extremely slow. A GPU is strongly recommended.
#
# The engine runs its workers in separate processes created with the "spawn"
# start method, and every spawned process re-executes this script from the
# top. Without a guard each worker would reach this point and try to build an
# engine of its own, which fails while multiprocessing is still bootstrapping.
# Only a process that has no multiprocessing parent creates the engine; worker
# processes get a placeholder they never use.
if multiprocessing.parent_process() is None:
    llm_engine = Engine(
        model_path=CHECKPOINT_PATH,
        enable_multimodal=True,
        mem_fraction_static=0.8,
        tp_size=1,  # Set to 1 for a single GPU
        # NOTE(review): "fa3" presumably needs a GPU generation that supports
        # FlashAttention 3 -- confirm against the hardware this Space runs on.
        attention_backend="fa3",
    )
else:
    llm_engine = None
# --- Inference Function ---
def process_and_generate(image_input, text_prompt) -> str:
    """Answer a text prompt about an image using the loaded model.

    Args:
        image_input: The image to analyse, in a form ``Image.fromarray``
            accepts (the UI delivers a numpy array), or ``None`` when no
            image was supplied.
        text_prompt: The question or instruction for the model. ``None``,
            empty and whitespace-only values all count as a missing prompt.

    Returns:
        The generated text on success, a reminder message when either input
        is missing, or an error description if generation raised.
    """
    # `text_prompt or ""` guards against None, which would otherwise raise
    # AttributeError on .strip(). The image is compared with `is None` only,
    # because truth-testing an array is ambiguous.
    if image_input is None or not (text_prompt or "").strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's image input (numpy array) to a PIL Image
    pil_image = Image.fromarray(image_input)

    # Prepare the messages payload for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    # Render the conversation to a prompt string; tokenization is left to the
    # engine, which receives the raw text.
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Only the image half of the result is needed; the second value is unused.
    image_inputs, _ = process_vision_info(
        messages,
        image_patch_size=processor.image_processor.patch_size,
    )

    # Define sampling parameters
    sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}

    # Generate the response. Failures are reported as text so the UI shows a
    # readable message instead of an unhandled exception.
    try:
        response = llm_engine.generate(
            prompt=text,
            image_data=image_inputs,
            sampling_params=sampling_params,
        )
        return response['text']
    except Exception as e:
        return f"An error occurred during generation: {str(e)}"
# --- Gradio Interface ---
# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """
# Qwen3-VL-2B-Instruct-FP8 Demo
This Space demonstrates the capabilities of the Qwen3-VL-2B-Instruct-FP8 model.
Upload an image, type a question or a command, and see the model's response.
**Note:** This demo is running on a CPU and may be slow. For better performance, consider upgrading to a GPU Space.
"""

# Sample (image URL, prompt) rows offered below the form; all images share
# one URL prefix.
_EXAMPLE_URL_PREFIX = "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/"
_EXAMPLE_ROWS = [
    [_EXAMPLE_URL_PREFIX + file_name, prompt]
    for file_name, prompt in (
        ("receipt.png", "Read all the text in the image."),
        ("what_is_in_the_box.jpg", "What is in the red box?"),
        ("chart.png", "What is the value for 'Training & Other'?"),
    )
]

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Two columns side by side: inputs on the left, model output on the right.
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(
                label="Prompt",
                placeholder="e.g., Describe this image in detail.",
            )
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(
                label="Model Output",
                lines=10,
                interactive=False,
            )

    # Clicking the button sends the image and prompt to the inference
    # function and writes its return value into the output box.
    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )

    # Clickable samples that fill in both inputs.
    gr.Examples(
        examples=_EXAMPLE_ROWS,
        inputs=[image_input, text_prompt],
    )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()