"""
Qwen3-VL model with ZeroGPU support for Hugging Face Spaces.

Uses transformers with the @spaces.GPU decorator.
"""
|
from typing import List, Dict

import spaces
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
|
|
class QwenZeroGPUAnalyzer:
    """
    Qwen3-VL model analyzer with ZeroGPU support.

    Uses Qwen3-VL-4B-Instruct for diagram generation.
    """
|
    def __init__(
        self,
        model_name: str = "Qwen/Qwen3-VL-4B-Instruct"
    ):
        """
        Initialize the Qwen ZeroGPU analyzer.

        Args:
            model_name: Hugging Face model ID
        """
        self.model_name = model_name
        self.model = None
        self.processor = None

        print("✓ Qwen ZeroGPU analyzer initialized (model will load on first inference)")
        print(f"  Model: {self.model_name}")
|
    def _load_model(self):
        """Load the model and processor (called on first inference)."""
        if self.model is not None:
            return

        print(f"Loading model: {self.model_name}...")

        self.processor = AutoProcessor.from_pretrained(self.model_name)

        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
            self.model_name,
            torch_dtype="auto",  # use the checkpoint's native dtype
            device_map="auto"    # place weights on the GPU ZeroGPU provides
        )

        print(f"✓ Model loaded: {self.model_name}")
|
    @spaces.GPU(duration=60)  # request a ZeroGPU slot for up to 60 seconds
    def generate_response(self, conversation: List[Dict[str, str]], max_tokens: int = 4000) -> str:
        """
        Generate a response from conversation history using ZeroGPU.

        Args:
            conversation: List of conversation messages with 'role' and 'content'
            max_tokens: Maximum number of new tokens to generate

        Returns:
            Generated response text
        """
        if self.model is None:
            self._load_model()

        # Wrap each plain {'role', 'content'} message in the structured
        # content format expected by the Qwen3-VL chat template.
        messages = []
        for msg in conversation:
            messages.append({
                "role": msg["role"],
                "content": [{"type": "text", "text": msg["content"]}]
            })
|
        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        )
        inputs = inputs.to(self.model.device)
|
        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=max_tokens
        )
|
        # Drop the prompt tokens so only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
|
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        return output_text[0].strip()
|
    def cleanup_model(self):
        """Cleanup hook (GPU lifetime is managed by ZeroGPU)."""
        pass
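

# A minimal usage sketch, assuming this module runs inside a Hugging Face
# Space where the `spaces` package provides real ZeroGPU allocation. The
# conversation below is illustrative only; generate_response() accepts any
# list of {'role', 'content'} messages.
if __name__ == "__main__":
    analyzer = QwenZeroGPUAnalyzer()
    conversation = [
        {"role": "system", "content": "You are a helpful diagram-generation assistant."},
        {"role": "user", "content": "Describe a simple flowchart for a login process."},
    ]
    print(analyzer.generate_response(conversation, max_tokens=512))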
|
|
|