# Spaces: sflindrs / Molmo-7B-D-0924-extended-tokens
# Runtime error
# File size: 4,064 Bytes

import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import torch
import spaces
import json

# Load the processor and the model from the same checkpoint, using the same
# loading options for both.
_MODEL_ID = 'allenai/Molmo-7B-D-0924'
_LOAD_OPTIONS = {
    'trust_remote_code': True,
    'torch_dtype': 'auto',
    'device_map': 'auto',
}

processor = AutoProcessor.from_pretrained(_MODEL_ID, **_LOAD_OPTIONS)

model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, **_LOAD_OPTIONS)

import json

def wrap_json_in_markdown(text):
    """Wrap every JSON object or array embedded in *text* in a Markdown fence.

    The text is scanned left to right. At each ``{`` or ``[`` the standard
    JSON decoder is asked to parse a value starting at that position. When it
    succeeds, the parsed value is re-serialised with a 4-space indent and
    emitted inside a fenced ``json`` code block; scanning then resumes right
    after the parsed value. When it fails, the bracket is treated as ordinary
    text and scanning continues with the next character, so valid JSON nested
    inside an invalid outer span is still found.

    Using the real decoder (instead of counting brackets by hand) means that
    brackets appearing inside JSON string literals do not confuse the scan,
    and a stray or mismatched bracket cannot prevent later JSON from being
    recognised.

    Args:
        text: Arbitrary text that may contain JSON snippets.

    Returns:
        The text with each recognised JSON object/array replaced by
        a pretty-printed fenced code block (surrounded by newlines). Text
        without any valid JSON is returned unchanged.
    """
    decoder = json.JSONDecoder()
    pieces = []
    plain_start = 0  # start of the not-yet-emitted plain text
    pos = 0
    length = len(text)

    while pos < length:
        if text[pos] not in '{[':
            pos += 1
            continue
        try:
            parsed, end = decoder.raw_decode(text, pos)
        except (json.JSONDecodeError, RecursionError):
            # Not valid JSON from this bracket (or absurdly deep nesting):
            # keep the character as plain text and keep looking.
            pos += 1
            continue
        pieces.append(text[plain_start:pos])  # text before the JSON
        pieces.append(f"\n```json\n{json.dumps(parsed, indent=4)}\n```\n")
        plain_start = pos = end

    pieces.append(text[plain_start:])  # any remaining text
    return ''.join(pieces)

@spaces.GPU()
def process_image_and_text(image, text):
    """Generate a reply for one image/prompt pair.

    Args:
        image: Array accepted by ``Image.fromarray`` (the UI supplies the
            uploaded picture as a numpy array).
        text: The prompt to send alongside the image.

    Returns:
        The decoded newly generated text, with any embedded JSON wrapped in
        Markdown code fences by ``wrap_json_in_markdown``.
    """
    # Build the processor inputs from the single image and the prompt.
    pil_image = Image.fromarray(image)
    features = processor.process(images=[pil_image], text=text)

    # Put every tensor on the model's device and add a leading batch axis.
    batch = {}
    for name, tensor in features.items():
        batch[name] = tensor.to(model.device).unsqueeze(0)

    # Run generation with the tokenizer supplied for the stop string.
    generation_config = GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>")
    output = model.generate_from_batch(
        batch,
        generation_config,
        tokenizer=processor.tokenizer,
    )

    # Drop the prompt tokens so that only the generated tail is decoded.
    prompt_length = batch['input_ids'].size(1)
    new_tokens = output[0, prompt_length:]
    decoded = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)

    return wrap_json_in_markdown(decoded)

def chatbot(image, text, history):
    """Handle one chat turn and return the message list to display.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        text: The user's question about the image.
        history: List of earlier messages, each a dict with ``"role"`` and
            ``"content"`` keys. ``None`` is treated as an empty history.

    Returns:
        A list of role/content message dicts. When no image is present, a
        new list is returned with a reminder appended and *history* is left
        untouched. Otherwise the user message and the model's reply are
        appended to *history* in place and that same list is returned.
    """
    if history is None:
        history = []

    if image is None:
        # The Chatbot component is declared with type="messages", so every
        # entry must be a role/content dict (a tuple entry is not valid in
        # that format and is inconsistent with the entries appended below).
        reminder = {"role": "assistant", "content": "Please upload an image first."}
        return history + [reminder]

    response = process_image_and_text(image, text)

    history.append({"role": "user", "content": text})
    history.append({"role": "assistant", "content": response})
    return history

# Build the Gradio UI: image upload and chat display side by side, with the
# question box and submit button underneath.
with gr.Blocks() as demo:
    gr.Markdown("# Image Chatbot with Molmo-7B-D-0924")

    with gr.Row():
        image_input = gr.Image(type="numpy")
        chatbot_output = gr.Chatbot(type="messages")

    text_input = gr.Textbox(placeholder="Ask a question about the image...")
    submit_button = gr.Button("Submit")

    # Conversation history; it is passed to `chatbot` as an input only.
    state = gr.State([])

    # Clicking the button and pressing Enter in the textbox trigger the same
    # handler with the same inputs and output.
    for register in (submit_button.click, text_input.submit):
        register(
            chatbot,
            inputs=[image_input, text_input, state],
            outputs=[chatbot_output],
        )

demo.launch()