import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import torch
import spaces
import json
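# `spaces` is the Hugging Face ZeroGPU helper package; its @spaces.GPU decorator
# (applied to the inference function below) requests a GPU only while the
# decorated call is running.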
# Load the processor and model
processor = AutoProcessor.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

model = AutoModelForCausalLM.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)
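# With torch_dtype='auto' the weights are loaded in the dtype recorded in the
# checkpoint config, and device_map='auto' places them on the available GPU(s).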
def wrap_json_in_markdown(text):
    # Scan the text for balanced JSON objects/arrays and wrap each valid one
    # in a ```json Markdown code block, leaving the surrounding text untouched.
    result = []
    stack = []
    json_start = None
    in_json = False
    i = 0
    while i < len(text):
        char = text[i]
        if char in ['{', '[']:
            if not in_json:
                json_start = i
                in_json = True
                stack.append(char)
            else:
                stack.append(char)
        elif char in ['}', ']'] and in_json:
            if not stack:
                # Unbalanced closing bracket, reset
                in_json = False
                json_start = None
            else:
                last = stack.pop()
                if (last == '{' and char != '}') or (last == '[' and char != ']'):
                    # Mismatched brackets, reset (and clear any stale openers)
                    stack.clear()
                    in_json = False
                    json_start = None
                if in_json and not stack:
                    # Potential end of a JSON value
                    json_str = text[json_start:i + 1]
                    try:
                        # Try to parse the candidate to ensure it is valid JSON
                        parsed = json.loads(json_str)
                        # Wrap it in a Markdown code block
                        wrapped = f"\n```json\n{json.dumps(parsed, indent=4)}\n```\n"
                        result.append(text[:json_start])  # Text before the JSON
                        result.append(wrapped)            # Wrapped JSON
                        text = text[i + 1:]               # Keep scanning the remainder
                        i = -1                            # Reset index (bumped to 0 below)
                    except json.JSONDecodeError:
                        # Not valid JSON, continue searching
                        pass
                    in_json = False
                    json_start = None
        i += 1
    result.append(text)  # Append any remaining text
    return ''.join(result)
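# Illustrative example (not executed by the app): given model output such as
#     'The dog is at {"x": 212, "y": 180} in the image.'
# wrap_json_in_markdown returns the same sentence with the coordinate object
# re-emitted inside a fenced ```json block so the chatbot renders it as code.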
# ZeroGPU: request a GPU only for the duration of this call
@spaces.GPU
def process_image_and_text(image, text):
    # Process the image and text into model inputs
    inputs = processor.process(
        images=[Image.fromarray(image)],
        text=text
    )
    # Move inputs to the correct device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
    # Generate output
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer
    )
    # Only get generated tokens; decode them to text
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # Wrap any JSON in the answer in a Markdown code block
    generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
    return generated_text_w_json_wrapper
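# Illustrative call (not executed at import time): with a numpy image `img`,
#     process_image_and_text(img, "Describe this image.")
# returns the decoded Molmo answer as a string.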
def chatbot(image, text, history):
    if image is None:
        # Match the "messages" format expected by gr.Chatbot(type="messages")
        return history + [{"role": "assistant", "content": "Please upload an image first."}]
    response = process_image_and_text(image, text)
    history.append({"role": "user", "content": text})
    history.append({"role": "assistant", "content": response})
    return history
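# `history` comes from the gr.State component below and is returned to the
# Chatbot; both use the "messages" format, a list of
# {"role": ..., "content": ...} dicts.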
# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Image Chatbot with Molmo-7B-D-0924")
    with gr.Row():
        image_input = gr.Image(type="numpy")
        chatbot_output = gr.Chatbot(type="messages")
    text_input = gr.Textbox(placeholder="Ask a question about the image...")
    submit_button = gr.Button("Submit")
    state = gr.State([])

    submit_button.click(
        chatbot,
        inputs=[image_input, text_input, state],
        outputs=[chatbot_output]
    )
    text_input.submit(
        chatbot,
        inputs=[image_input, text_input, state],
        outputs=[chatbot_output]
    )

demo.launch()
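# To run this outside the Space (assuming the file is saved as app.py and the
# packages imported above are installed on a CUDA machine): `python app.py`,
# then open the printed local URL, upload an image, and ask a question.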