Update app.py

app.py CHANGED

```diff
@@ -1,85 +1,86 @@
 import gradio as gr
 import torch
 from PIL import Image
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
 import requests
 from io import BytesIO
-
-from qwen_vl_utils import process_vision_info
-from transformers import AutoProcessor
+import os

 # --- Configuration ---
-
+# Using a CPU-compatible model from the Qwen family
+MODEL_PATH = "Qwen/Qwen-VL-Chat"
+CPU_DEVICE = "cpu"  # Explicitly use CPU

-# --- Model and
-#
-
+# --- Model and Tokenizer Loading ---
+# This will be done once when the Space starts. It will be slow on a CPU.
+print("Loading model and tokenizer... This may take a while on a CPU.")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

-#
-#
-
-
-
-
-
-
-)
+# For CPU, we load the model in bfloat16 if supported, otherwise float32.
+# Note: This will consume a significant amount of RAM.
+try:
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        device_map=CPU_DEVICE,
+        trust_remote_code=True,
+        bf16=torch.cuda.is_bf16_supported(),  # bf16 on CPU can be slow, but uses less memory
+    ).eval()
+except RuntimeError:
+    # Fallback to float32 if bf16 is not supported or causes issues
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        device_map=CPU_DEVICE,
+        trust_remote_code=True
+    ).eval()
+
+# Specify generation configuration
+model.generation_config = GenerationConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
+print("Model and tokenizer loaded successfully.")

 # --- Inference Function ---
 def process_and_generate(image_input, text_prompt):
     """
-    Processes the image and text prompt, and generates a response from the model.
+    Processes the image and text prompt, and generates a response from the model on the CPU.
     """
     if image_input is None or text_prompt.strip() == "":
         return "Please provide both an image and a text prompt."

-    # Convert Gradio's
+    # Convert Gradio's numpy array to a PIL Image
     pil_image = Image.fromarray(image_input)
+    # Create a temporary path to save the image
+    temp_image_path = "temp_image.png"
+    pil_image.save(temp_image_path)

-    #
-
-
-
-
-
-                {"type": "text", "text": text_prompt},
-            ],
-        }
-    ]
-
-    # Apply the chat template and process vision info
-    text = processor.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    image_inputs, _ = process_vision_info(
-        messages,
-        image_patch_size=processor.image_processor.patch_size
-    )
-
-    # Define sampling parameters
-    sampling_params = {"max_new_tokens": 1024, "temperature": 0.7}
+    # The model's tokenizer can directly handle an image path.
+    # We construct the query according to the model's required format.
+    query = tokenizer.from_list_format([
+        {'image': temp_image_path},
+        {'text': text_prompt},
+    ])

-
+    print("Generating response... This will be slow.")
     try:
-
-
-
-
-        )
-
+        # Generate the response
+        response, history = model.chat(tokenizer, query=query, history=None)
+
+        # Clean up the temporary image file
+        os.remove(temp_image_path)
+
+        return response
     except Exception as e:
+        # Clean up even if there's an error
+        if os.path.exists(temp_image_path):
+            os.remove(temp_image_path)
         return f"An error occurred during generation: {str(e)}"

 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        #
-        This Space demonstrates the
-
-        **Note:** This demo is running on a CPU and may be slow. For better performance, consider upgrading to a GPU Space.
+        # Qwen-VL-Chat CPU Demo
+        This Space demonstrates the `Qwen/Qwen-VL-Chat` model, a CPU-compatible alternative to Qwen3-VL.
+        **Warning:** Running this vision-language model on a CPU is very slow. Please be patient after clicking generate.
         """
     )

@@ -99,9 +100,8 @@ with gr.Blocks() as demo:

     gr.Examples(
         examples=[
-            ["https://qianwen-res.oss-
-            ["https://qianwen-res.oss-
-            ["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/chart.png", "What is the value for 'Training & Other'?"],
+            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "这是什么?"],
+            ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg", "框出图中礼服和帽子"],
         ],
         inputs=[image_input, text_prompt]
     )
```
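One detail of the new loading block worth flagging: `bf16=torch.cuda.is_bf16_supported()` asks the CUDA backend whether bfloat16 is available, which says nothing about the CPU the Space actually runs on, and on some CPU-only PyTorch builds the call raises (as an `AssertionError`) rather than returning False, which the `except RuntimeError` fallback would not catch. Below is a minimal sketch of a CPU-oriented alternative; it reuses the `MODEL_PATH` and `CPU_DEVICE` names from app.py, `USE_BF16` is a hypothetical opt-in environment variable, and `torch_dtype` is the generic transformers argument rather than the Qwen-specific `bf16` flag the commit uses.

```python
# A CPU-oriented sketch, not the committed code. MODEL_PATH and CPU_DEVICE mirror
# app.py; USE_BF16 is a hypothetical opt-in switch for machines with fast bf16.
import os

import torch
from transformers import AutoModelForCausalLM

MODEL_PATH = "Qwen/Qwen-VL-Chat"
CPU_DEVICE = "cpu"

# float32 is the safe CPU default; bfloat16 roughly halves weight memory but is
# only faster on CPUs with native bf16 support, so it is opt-in here.
dtype = torch.bfloat16 if os.environ.get("USE_BF16") == "1" else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map=CPU_DEVICE,
    trust_remote_code=True,
    torch_dtype=dtype,
).eval()
```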
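Separately, the handler saves every upload to the fixed path `temp_image.png`, so two users hitting the Space at the same time can overwrite each other's image before `model.chat` reads it, and the cleanup logic is duplicated across the success and error paths. A variant using a unique per-request temporary file and a single `finally` cleanup is sketched below; it assumes the module-level `tokenizer` and `model` from app.py and keeps the same `from_list_format` query the commit introduces.

```python
# Sketch only: same flow as app.py's process_and_generate, but with a unique
# per-request temp file and one cleanup path. Assumes app.py's global
# `tokenizer` and `model` are already loaded.
import os
import tempfile

from PIL import Image


def process_and_generate(image_input, text_prompt):
    if image_input is None or not text_prompt.strip():
        return "Please provide both an image and a text prompt."

    # A unique file per request avoids concurrent users overwriting temp_image.png.
    fd, temp_image_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        Image.fromarray(image_input).save(temp_image_path)
        query = tokenizer.from_list_format([
            {"image": temp_image_path},
            {"text": text_prompt},
        ])
        response, _history = model.chat(tokenizer, query=query, history=None)
        return response
    except Exception as e:
        return f"An error occurred during generation: {str(e)}"
    finally:
        # Runs on success and failure alike, replacing the duplicated os.remove calls.
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
```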
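The rewrite also drops the old explicit sampling parameters (`max_new_tokens: 1024`, `temperature: 0.7`) and relies on whatever `GenerationConfig` ships with the checkpoint, which `model.chat` picks up automatically. If CPU latency needs a bound, the loaded config can be tightened right after loading; the values below are illustrative, not something the commit sets.

```python
# Illustrative values only; the commit itself relies on the checkpoint's defaults.
model.generation_config.max_new_tokens = 512  # cap response length for CPU latency
model.generation_config.temperature = 0.7     # the value the old sampling_params used
```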
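Finally, the updated examples point at the Qwen-VL demo image with Chinese prompts: "这是什么?" ("What is this?") and "框出图中礼服和帽子" ("Draw boxes around the dress and the hat in the picture"), which exercise the model's Chinese captioning and grounding abilities. A quick way to smoke-test the handler outside the Gradio UI, assuming app.py's globals are already loaded in the session, is:

```python
# Sketch: call the handler directly with the first example image from the diff.
# Assumes app.py's globals (model, tokenizer, process_and_generate) are already
# loaded in the current session.
from io import BytesIO

import numpy as np
import requests
from PIL import Image

url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QWEN-VL/assets/demo.jpeg"
image = Image.open(BytesIO(requests.get(url, timeout=30).content)).convert("RGB")

# gr.Image hands the upload to the handler as a numpy array by default.
print(process_and_generate(np.array(image), "这是什么?"))  # "What is this?"
```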