Spaces:

hsienchen
/

gemini-mm-cot

Sleeping

App Files Files Community

gemini-mm-cot / app_m.py

hsienchen

Rename app.py to app_m.py

cb3b8f7 verified almost 2 years ago

raw

history blame contribute delete

4.11 kB

	import base64
	import mimetypes
	import os
	import pathlib
	import time

	import PIL.Image
	import google.generativeai as genai
	import gradio as gr

	# The API key is read from the environment so it is never stored in the file.
	GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

	# Configure the SDK before creating any model handles so the setup reads in
	# the order it takes effect.
	genai.configure(api_key=GOOGLE_API_KEY)

	# Text-only prompts are sent to txt_model; prompts that include an image are
	# sent to vis_model.
	txt_model = genai.GenerativeModel('gemini-pro')
	vis_model = genai.GenerativeModel('gemini-pro-vision')

	def image_to_base64(image_path):
	    """Return the contents of the file at ``image_path`` as base64 text.

	    The file is opened in binary mode, its bytes are base64-encoded, and the
	    encoded bytes are decoded to ``str`` so the result can be embedded in
	    other strings.
	    """
	    with open(image_path, 'rb') as image_file:
	        raw_bytes = image_file.read()
	    return base64.b64encode(raw_bytes).decode('utf-8')

	def query_message(history, txt, img):
	    """Append the user's turn to the chat history and return the history.

	    Args:
	        history: List of ``(user, bot)`` tuples shown in the chat UI. ``None``
	            is treated as an empty conversation.
	        txt: The prompt text chosen by the user.
	        img: Path of the uploaded image, or a falsy value when there is none.

	    Returns:
	        The ``history`` list with one ``(user_message, None)`` entry appended.
	        When an image is given, the message embeds it as a Markdown image
	        backed by a base64 data URL.
	    """
	    if history is None:
	        history = []
	    if not img:
	        history += [(txt, None)]
	        return history

	    # Label the data URL with the file's actual image type; keep the JPEG
	    # label as the fallback when the type cannot be guessed from the name.
	    mime_type, _ = mimetypes.guess_type(img)
	    if not mime_type or not mime_type.startswith("image/"):
	        mime_type = "image/jpeg"

	    # Named so it does not shadow the imported ``base64`` module.
	    encoded_image = image_to_base64(img)
	    data_url = f"data:{mime_type};base64,{encoded_image}"
	    history += [(f"{txt} ![]({data_url})", None)]
	    return history

	def llm_response(history, text, img=None):
	    """Ask the model for a reply and append it to the chat history.

	    Args:
	        history: List of ``(user, bot)`` tuples shown in the chat UI. ``None``
	            is treated as an empty conversation.
	        text: The prompt to send to the model.
	        img: Optional path of an image to send with the prompt. When it is
	            missing or falsy the text-only model is used; otherwise the
	            vision model receives both the prompt and the image.

	    Returns:
	        The ``history`` list with one ``(None, reply_text)`` entry appended.
	    """
	    if history is None:
	        history = []
	    if not img:
	        response = txt_model.generate_content(text)
	    else:
	        # Release the image file handle once the model call has returned.
	        with PIL.Image.open(img) as image:
	            response = vis_model.generate_content([text, image])
	    history += [(None, response.text)]
	    return history

	# Fixed prompt used by the single-button flow (output_query_message and
	# output_llm_response).
	text_box_01 = "what is in the image"


	def output_query_message(img):
	    """Return the prompt text to display while the model is being queried.

	    The result is written to a textbox, so it is always a plain string.
	    Returning a chat-style list with an embedded base64 data URL would put
	    the stringified list, including the whole encoded image, into the
	    textbox.

	    Args:
	        img: Path of the uploaded image, or a falsy value. It is accepted so
	            the function can be wired to the image input, but the displayed
	            text does not depend on it.

	    Returns:
	        The fixed prompt ``text_box_01``.
	    """
	    return text_box_01

	def output_llm_response(img):
	    """Send the fixed prompt to the model and return the reply text.

	    Args:
	        img: Path of the uploaded image, or a falsy value. Without an image
	            the text-only model is asked the prompt alone; with one, the
	            vision model receives the prompt together with the image.

	    Returns:
	        The ``text`` of the model's response.
	    """
	    if not img:
	        return txt_model.generate_content(text_box_01).text

	    # Release the image file handle once the model call has returned.
	    with PIL.Image.open(img) as image:
	        response = vis_model.generate_content([text_box_01, image])
	    return response.text


	# Interface Code- Selector method

	def sentence_builder(animal, place):
	    """Build a counting question about ``animal`` (pluralised with "s") from ``place``."""
	    question = f"how many {animal}s from the {place} are shown in the picture?"
	    return question

	# gradio block

	# First tab of the app ("Check #1"): the user uploads an image and presses
	# one button; the answer to the fixed prompt is written to the textbox.
	with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
	with gr.Column():
	outputbox = gr.Textbox(label="line clearance...")
	# type="filepath" hands the callbacks a path on disk rather than image data.
	image_box = gr.Image(type="filepath")

	btn = gr.Button("Check This")
	# Two chained steps write to the same textbox: output_query_message runs
	# first, then output_llm_response replaces its output with the model reply.
	clicked = btn.click(output_query_message,
	[image_box],
	outputbox
	).then(output_llm_response,
	[image_box],
	outputbox
	)
	# Static explanatory text rendered in this tab.
	gr.Markdown("""
	## SOP-302: Line Clearance ##

	<h5 align="center"><i>"XXXX here here."</i></h5>

	Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
	""")

	# Second tab of the app ("Check #2"): a chat view where the user picks a
	# preset question about an uploaded image.
	with gr.Blocks(theme='snehilsanyal/scikit-learn') as app2:
	    gr.Markdown("## MM 2BB ##")
	    # NOTE(review): the image and chatbot are placed together in one row
	    # (the chatbot's scale=2 suggests a shared row) - confirm the layout.
	    with gr.Row():
	        # type="filepath" hands the callbacks a path on disk.
	        image_box = gr.Image(type="filepath")

	        chatbot = gr.Chatbot(
	            scale=2,
	            height=750,
	        )
	    text_box = gr.Dropdown(
	        [
	            "what is in the image",
	            "provide alternative title for the image",
	            "how many birds can be seen in the picture?",
	        ],
	        label="Select--",
	        info="Will add more animals later!",
	    )

	    btn = gr.Button("Submit")
	    # Step 1 echoes the user's turn into the chat; step 2 appends the reply.
	    # llm_response needs the image as well as the text: without image_box in
	    # its inputs the uploaded picture never reaches the vision model.
	    clicked = btn.click(
	        query_message,
	        [chatbot, text_box, image_box],
	        chatbot,
	    ).then(
	        llm_response,
	        [chatbot, text_box, image_box],
	        chatbot,
	    )
	# Top-level app: a heading followed by the two checks presented as tabs.
	with gr.Blocks(theme='snehilsanyal/scikit-learn') as demo:
	    gr.Markdown("## SOP Camera ##")
	    gr.TabbedInterface([app1, app2], ["Check #1", "Check #2"])

	# queue() returns the Blocks instance, so enabling the queue and starting
	# the server can be written as one chained call.
	demo.queue().launch()