broadfield-dev committed
Commit 0c9364d · verified · 1 Parent(s): c2ef06d

Update app.py

Files changed (1)
  1. app.py +57 -57
app.py CHANGED
@@ -1,85 +1,86 @@
  import gradio as gr
  import torch
  from PIL import Image
  import requests
  from io import BytesIO
- from sglang import Engine
- from qwen_vl_utils import process_vision_info
- from transformers import AutoProcessor

  # --- Configuration ---
- CHECKPOINT_PATH = "Qwen/Qwen3-VL-2B-Instruct-FP8"

- # --- Model and Processor Loading ---
- # Note: This is a heavy operation and will be done once when the Space starts.
- processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH, trust_remote_code=True)

- # SGLang Engine setup for GPU
- # For a CPU space, this will be extremely slow. A GPU is strongly recommended.
- llm_engine = Engine(
-     model_path=CHECKPOINT_PATH,
-     enable_multimodal=True,
-     mem_fraction_static=0.8,
-     tp_size=1,  # Set to 1 for a single GPU
-     attention_backend="fa3"
- )

  # --- Inference Function ---
  def process_and_generate(image_input, text_prompt):
      """
-     Processes the image and text prompt, and generates a response from the model.
      """
      if image_input is None or text_prompt.strip() == "":
          return "Please provide both an image and a text prompt."

-     # Convert Gradio's image input (numpy array) to a PIL Image
      pil_image = Image.fromarray(image_input)

-     # Prepare the messages payload for the model
-     messages = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "image": pil_image},
-                 {"type": "text", "text": text_prompt},
-             ],
-         }
-     ]
-
-     # Apply the chat template and process vision info
-     text = processor.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True
-     )
-
-     image_inputs, _ = process_vision_info(
-         messages,
-         image_patch_size=processor.image_processor.patch_size
-     )
-
-     # Define sampling parameters
-     sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}

-     # Generate the response
      try:
-         response = llm_engine.generate(
-             prompt=text,
-             image_data=image_inputs,
-             sampling_params=sampling_params
-         )
-         return response['text']
      except Exception as e:
          return f"An error occurred during generation: {str(e)}"

  # --- Gradio Interface ---
  with gr.Blocks() as demo:
      gr.Markdown(
          """
-         # Qwen3-VL-2B-Instruct-FP8 Demo
-         This Space demonstrates the capabilities of the Qwen3-VL-2B-Instruct-FP8 model.
-         Upload an image, type a question or a command, and see the model's response.
-         **Note:** This demo is running on a CPU and may be slow. For better performance, consider upgrading to a GPU Space.
          """
      )
@@ -99,9 +100,8 @@ with gr.Blocks() as demo:

      gr.Examples(
          examples=[
-             ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png", "Read all the text in the image."],
-             ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/what_is_in_the_box.jpg", "What is in the red box?"],
-             ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
          ],
          inputs=[image_input, text_prompt]
      )
  import gradio as gr
  import torch
  from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.generation import GenerationConfig
  import requests
  from io import BytesIO
+ import os

  # --- Configuration ---
+ # Using a CPU-compatible model from the Qwen family
+ MODEL_PATH = "Qwen/Qwen-VL-Chat"
+ CPU_DEVICE = "cpu"  # Explicitly use CPU

+ # --- Model and Tokenizer Loading ---
+ # This will be done once when the Space starts. It will be slow on a CPU.
+ print("Loading model and tokenizer... This may take a while on a CPU.")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

+ # For CPU, we load the model in bfloat16 if supported, otherwise float32.
+ # Note: This will consume a significant amount of RAM.
+ try:
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_PATH,
+         device_map=CPU_DEVICE,
+         trust_remote_code=True,
+         bf16=torch.cuda.is_bf16_supported(),  # bf16 on CPU can be slow, but uses less memory
+     ).eval()
+ except RuntimeError:
+     # Fall back to float32 if bf16 is not supported or causes issues
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_PATH,
+         device_map=CPU_DEVICE,
+         trust_remote_code=True
+     ).eval()
+
+ # Specify generation configuration
+ model.generation_config = GenerationConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
+ print("Model and tokenizer loaded successfully.")

  # --- Inference Function ---
  def process_and_generate(image_input, text_prompt):
      """
+     Processes the image and text prompt, and generates a response from the model on the CPU.
      """
      if image_input is None or text_prompt.strip() == "":
          return "Please provide both an image and a text prompt."

+     # Convert Gradio's numpy array to a PIL Image
      pil_image = Image.fromarray(image_input)
+     # Create a temporary path to save the image
+     temp_image_path = "temp_image.png"
+     pil_image.save(temp_image_path)

+     # The model's tokenizer can directly handle an image path.
+     # We construct the query according to the model's required format.
+     query = tokenizer.from_list_format([
+         {'image': temp_image_path},
+         {'text': text_prompt},
+     ])

+     print("Generating response... This will be slow.")
      try:
+         # Generate the response
+         response, history = model.chat(tokenizer, query=query, history=None)
+
+         # Clean up the temporary image file
+         os.remove(temp_image_path)
+
+         return response
      except Exception as e:
+         # Clean up even if there's an error
+         if os.path.exists(temp_image_path):
+             os.remove(temp_image_path)
          return f"An error occurred during generation: {str(e)}"

  # --- Gradio Interface ---
  with gr.Blocks() as demo:
      gr.Markdown(
          """
+         # Qwen-VL-Chat CPU Demo
+         This Space demonstrates the `Qwen/Qwen-VL-Chat` model, a CPU-compatible alternative to Qwen3-VL.
+         **Warning:** Running this vision-language model on a CPU is very slow. Please be patient after clicking generate.
          """
      )
@@ -99,9 +100,8 @@ with gr.Blocks() as demo:

      gr.Examples(
          examples=[
+             ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "这是什么?"],  # "What is this?"
+             ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "框出图中礼服和帽子"],  # "Draw boxes around the dress and the hat in the image"
          ],
          inputs=[image_input, text_prompt]
      )
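
For context on the API the updated app.py relies on: below is a minimal, standalone sketch of the `tokenizer.from_list_format` / `model.chat` pattern as documented on the Qwen/Qwen-VL-Chat model card, using the demo image URL from the new examples. It mirrors the calls in the diff but is not part of the commit.

```python
# Minimal sketch of the Qwen-VL-Chat inference pattern used by the new app.py.
# Assumes the trust_remote_code interface documented on the Qwen/Qwen-VL-Chat model card.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "Qwen/Qwen-VL-Chat"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, device_map="cpu", trust_remote_code=True
).eval()

# Build a multimodal query: the tokenizer accepts image paths or URLs alongside text.
query = tokenizer.from_list_format([
    {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg"},
    {"text": "What is this?"},
])

# model.chat returns the response text plus the updated conversation history.
response, history = model.chat(tokenizer, query=query, history=None)
print(response)
```

Accepting an image path in the list format is what lets the Space save the uploaded image to a temporary file instead of wiring up a separate image processor, which is the approach the new `process_and_generate` takes.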
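And a hypothetical smoke test for the updated `process_and_generate` (not part of the commit; the function name and URL are taken from the diff, and the numpy conversion matches how Gradio's image component delivers uploads):

```python
# Hypothetical smoke test: call process_and_generate outside the Gradio UI,
# in the same process where app.py has been loaded.
from io import BytesIO

import numpy as np
import requests
from PIL import Image

url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg"
img = Image.open(BytesIO(requests.get(url, timeout=30).content)).convert("RGB")

# Gradio passes images as numpy arrays, so mimic that here.
print(process_and_generate(np.array(img), "What is this?"))
```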