import gradio as gr
from PIL import Image
from sglang import Engine
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

# FP8-quantized Qwen3-VL 2B Instruct checkpoint served by this demo.
CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"

# Tokenizer and image processor used to build the chat prompt and vision inputs.
processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)

# Offline (in-process) SGLang engine serving the FP8 checkpoint.
llm_engine = Engine(
    model_path=CHECKPOINT_PATH,
    enable_multimodal=True,    # accept image inputs alongside text
    mem_fraction_static=0.8,   # fraction of GPU memory reserved for weights and KV cache
    tp_size=1,                 # single GPU, no tensor parallelism
    attention_backend="fa3",   # FlashAttention-3 attention backend
)


def process_and_generate(image_input, text_prompt):
    """Processes the image and text prompt, and generates a response from the model."""
    if image_input is None or text_prompt.strip() == "":
        return "Please provide both an image and a text prompt."

    # Gradio delivers the uploaded image as a NumPy array; convert it to a PIL image.
    pil_image = Image.fromarray(image_input)

    # Build a chat-style message with interleaved image and text content.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    # Render the conversation into a prompt string ending with the assistant turn.
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Extract the image inputs from the messages in the format the engine expects.
    image_inputs, _ = process_vision_info(
        messages,
        image_patch_size=processor.image_processor.patch_size,
    )

    sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}

    # Run generation; for a single prompt the engine returns a dict with the text.
    try:
        response = llm_engine.generate(
            prompt=text,
            image_data=image_inputs,
            sampling_params=sampling_params,
        )
        return response["text"]
    except Exception as e:
        return f"An error occurred during generation: {str(e)}"


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct-FP8 Demo
        This Space demonstrates the capabilities of the Qwen3-VL-2B-Instruct-FP8 model.
        Upload an image, type a question or a command, and see the model's response.

        **Note:** This demo is configured for GPU hardware (an FP8 checkpoint served through SGLang with the FlashAttention-3 backend) and will not run on a CPU-only Space.
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )

    gr.Examples(
        examples=[
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read all the text in the image."],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is in the red box?"],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
        ],
        inputs=[image_input, text_prompt],
    )


if __name__ == "__main__":
    demo.launch()
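
# Assumed setup, as a sketch (package names only, versions not pinned):
#   pip install gradio transformers qwen-vl-utils "sglang[all]"
# Launch with e.g. `python app.py` (filename assumed); pass share=True to
# demo.launch() above if a public Gradio link is needed.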