Spaces:

broadfield-dev
/

qwen3-vl-2b-instruct

Paused

App Files Files Community

qwen3-vl-2b-instruct / app.py

broadfield-dev

Create app.py

c2ef06d verified 2 months ago

raw

history blame

3.67 kB

	import gradio as gr
	import torch
	from PIL import Image
	import requests
	from io import BytesIO
	from sglang import Engine
	from qwen_vl_utils import process_vision_info
	from transformers import AutoProcessor

	# --- Configuration ---
	# Hugging Face checkpoint used for both the processor and the SGLang engine.
	CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"

	# --- Model and Processor Loading ---
	# Note: This is a heavy operation and will be done once when the Space starts.
	# The processor supplies the chat template and the image patch size used
	# when preparing requests.
	processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)

	# SGLang Engine setup for GPU
	# For a CPU space, this will be extremely slow. A GPU is strongly recommended.
	#
	# SGLang launches its worker processes with the "spawn" start method. A
	# spawned child re-imports the launching script under the module name
	# "__mp_main__"; building the engine again during that re-import would make
	# every worker try to start workers of its own. The engine is therefore
	# skipped only in that re-import; running this file as a script or importing
	# it normally still creates the engine right here, exactly as before.
	if __name__ != "__mp_main__":
	    llm_engine = Engine(
	        model_path=CHECKPOINT_PATH,
	        enable_multimodal=True,
	        mem_fraction_static=0.8,
	        tp_size=1,  # Set to 1 for a single GPU
	        # NOTE(review): "fa3" presumably selects the FlashAttention-3 backend,
	        # which needs a supporting GPU - confirm against the target hardware.
	        attention_backend="fa3",
	    )
	else:
	    # Worker re-import: keep the name defined, but never serve from here.
	    llm_engine = None

	# --- Inference Function ---
	def process_and_generate(image_input, text_prompt):
	    """Generate the model's answer to a text prompt about an image.

	    Args:
	        image_input: The uploaded image as a numpy array (the UI's image
	            component is configured with ``type="numpy"``), or ``None``
	            when no image was provided.
	        text_prompt: The user's question or instruction. ``None``, an empty
	            string, or whitespace only all count as "no prompt".

	    Returns:
	        The generated text on success, a fixed reminder string when either
	        input is missing, or an error message string if generation raised.
	    """
	    # Guard against a missing prompt as well as a blank one: calling
	    # ``.strip()`` on ``None`` would raise AttributeError before the
	    # friendly message could be returned.
	    if image_input is None or not (text_prompt or "").strip():
	        return "Please provide both an image and a text prompt."

	    # Convert Gradio's image input (numpy array) to a PIL Image
	    pil_image = Image.fromarray(image_input)

	    # Prepare the messages payload for the model
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image", "image": pil_image},
	                {"type": "text", "text": text_prompt},
	            ],
	        }
	    ]

	    # Render the conversation to a prompt string (not token ids) that ends
	    # with the generation prompt.
	    text = processor.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True,
	    )

	    # Extract the image inputs from the messages; the second returned value
	    # is not used by this demo.
	    image_inputs, _ = process_vision_info(
	        messages,
	        image_patch_size=processor.image_processor.patch_size,
	    )

	    # Define sampling parameters
	    sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}

	    # Generate the response. Failures are reported back to the UI as text
	    # instead of propagating, so the user sees what went wrong.
	    try:
	        response = llm_engine.generate(
	            prompt=text,
	            image_data=image_inputs,
	            sampling_params=sampling_params,
	        )
	        return response["text"]
	    except Exception as e:
	        return f"An error occurred during generation: {e}"

	# --- Gradio Interface ---
	# Introductory Markdown shown at the top of the page.
	intro_markdown = """
	# Qwen3-VL-2B-Instruct-FP8 Demo
	This Space demonstrates the capabilities of the Qwen3-VL-2B-Instruct-FP8 model.
	Upload an image, type a question or a command, and see the model's response.
	Note: This demo is running on a CPU and may be slow. For better performance, consider upgrading to a GPU Space.
	"""

	# Clickable (image URL, prompt) pairs offered beneath the form.
	example_rows = [
	    ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read all the text in the image."],
	    ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is in the red box?"],
	    ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
	]

	with gr.Blocks() as demo:
	    gr.Markdown(intro_markdown)

	    # Two-column layout: inputs and the submit button on the left,
	    # the read-only model output on the right.
	    with gr.Row():
	        with gr.Column():
	            image_input = gr.Image(type="numpy", label="Upload Image")
	            text_prompt = gr.Textbox(
	                label="Prompt",
	                placeholder="e.g., Describe this image in detail.",
	            )
	            submit_button = gr.Button("Generate Response")
	        with gr.Column():
	            output_text = gr.Textbox(
	                label="Model Output",
	                lines=10,
	                interactive=False,
	            )

	    # Components that feed the inference function, in argument order.
	    form_inputs = [image_input, text_prompt]

	    # Clicking the button runs inference and writes the result to the output box.
	    submit_button.click(
	        fn=process_and_generate,
	        inputs=form_inputs,
	        outputs=output_text,
	    )

	    # Selecting an example fills in the image and the prompt.
	    gr.Examples(examples=example_rows, inputs=form_inputs)

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()