import gradio as gr
from PIL import Image
from sglang import Engine
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

# --- Configuration ---
CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"

# --- Model and Processor Loading ---
# Note: this is a heavy operation and runs once when the Space starts.
processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)

# SGLang engine setup. A GPU is strongly recommended: on a CPU Space this
# will be extremely slow, and the "fa3" (FlashAttention-3) attention backend
# requires a recent NVIDIA GPU.
llm_engine = Engine(
    model_path=CHECKPOINT_PATH,
    enable_multimodal=True,
    mem_fraction_static=0.8,  # fraction of GPU memory reserved for weights and KV cache
    tp_size=1,                # tensor-parallel degree; 1 for a single GPU
    attention_backend="fa3",
)

# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """Process the image and text prompt and generate a response from the model."""
    if image_input is None or not (text_prompt or "").strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's image input (a numpy array) to a PIL Image.
    pil_image = Image.fromarray(image_input)

    # Build the chat messages payload expected by the Qwen3-VL chat template.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    # Render the prompt text and extract the image inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, _ = process_vision_info(
        messages, image_patch_size=processor.image_processor.patch_size
    )

    sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}

    try:
        response = llm_engine.generate(
            prompt=text,
            image_data=image_inputs,
            sampling_params=sampling_params,
        )
        return response["text"]
    except Exception as e:
        return f"An error occurred during generation: {e}"

# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct-FP8 Demo
        This Space demonstrates the Qwen3-VL-2B-Instruct-FP8 model.
        Upload an image, type a question or a command, and see the model's response.

        **Note:** This demo is intended for a GPU Space; on CPU hardware it will be very slow.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(
                label="Prompt", placeholder="e.g., Describe this image in detail."
            )
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )

    gr.Examples(
        examples=[
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read all the text in the image."],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is in the red box?"],
            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
        ],
        inputs=[image_input, text_prompt],
    )

if __name__ == "__main__":
    demo.launch()
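
# --- Programmatic usage (illustrative) ---
# A minimal sketch of calling this app from Python with gradio_client, assuming
# the demo is reachable at http://127.0.0.1:7860 (a deployed Space name would
# work the same way). The api_name matches the click handler's function name,
# which is Gradio's default route naming.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(
#       handle_file("https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png"),
#       "Read all the text in the image.",
#       api_name="/process_and_generate",
#   )
#   print(result)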