import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import torch
import spaces
import json
# Load the processor and model
processor = AutoProcessor.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

model = AutoModelForCausalLM.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)
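# Note: torch_dtype='auto' loads the weights in the dtype stored in the checkpoint,
# and device_map='auto' places them on the available GPU (falling back to CPU otherwise).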
def wrap_json_in_markdown(text):
    """Find valid JSON objects/arrays in `text` and wrap them in Markdown ```json code blocks."""
    result = []
    stack = []
    json_start = None
    in_json = False
    i = 0
    while i < len(text):
        char = text[i]
        if char in ['{', '[']:
            if not in_json:
                json_start = i
                in_json = True
            stack.append(char)
        elif char in ['}', ']'] and in_json:
            if not stack:
                # Unbalanced closing bracket, reset
                in_json = False
                json_start = None
            else:
                last = stack.pop()
                if (last == '{' and char != '}') or (last == '[' and char != ']'):
                    # Mismatched brackets, reset
                    in_json = False
                    json_start = None
                    stack.clear()
                if in_json and not stack:
                    # Potential end of a JSON value
                    json_str = text[json_start:i + 1]
                    try:
                        # Try to parse the JSON to ensure it's valid
                        parsed = json.loads(json_str)
                        # Wrap it in a Markdown code block
                        wrapped = f"\n```json\n{json.dumps(parsed, indent=4)}\n```\n"
                        result.append(text[:json_start])  # Append text before the JSON
                        result.append(wrapped)            # Append the wrapped JSON
                        text = text[i + 1:]               # Continue with the remaining text
                        i = -1                            # Reset index (incremented back to 0 below)
                    except json.JSONDecodeError:
                        # Not valid JSON, continue searching
                        pass
                    in_json = False
                    json_start = None
        i += 1
    result.append(text)  # Append any remaining text
    return ''.join(result)
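# Illustrative (hypothetical) example: a model reply such as
#   'Here are the points: [{"x": 312, "y": 148}]'
# is returned with the JSON portion re-emitted inside a ```json fenced block,
# which gr.Chatbot renders as formatted code.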
@spaces.GPU()
def process_image_and_text(image, text):
    # Process the image and text
    inputs = processor.process(
        images=[Image.fromarray(image)],
        text=text
    )

    # Move inputs to the correct device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate output
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer
    )

    # Only get generated tokens; decode them to text
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
    return generated_text_w_json_wrapper
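# Example call (illustrative, assuming numpy is imported as np and a local image exists):
#   process_image_and_text(np.asarray(Image.open("example.jpg")), "Describe this image.")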
def chatbot(image, text, history):
    if image is None:
        # The Chatbot component uses the "messages" format, so entries must be role/content dicts
        return history + [{"role": "assistant", "content": "Please upload an image first."}]

    response = process_image_and_text(image, text)
    history.append({"role": "user", "content": text})
    history.append({"role": "assistant", "content": response})
    return history
# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Image Chatbot with Molmo-7B-D-0924")

    with gr.Row():
        image_input = gr.Image(type="numpy")
        chatbot_output = gr.Chatbot(type="messages")

    text_input = gr.Textbox(placeholder="Ask a question about the image...")
    submit_button = gr.Button("Submit")
    state = gr.State([])

    submit_button.click(
        chatbot,
        inputs=[image_input, text_input, state],
        outputs=[chatbot_output]
    )
    text_input.submit(
        chatbot,
        inputs=[image_input, text_input, state],
        outputs=[chatbot_output]
    )

demo.launch()