Hugging Face Space (status: Runtime error) — commit "Update app.py"; file app.py CHANGED.
|
@@ -1,143 +1,41 @@
|
|
| 1 |
-
import base64
|
| 2 |
-
from io import BytesIO
|
| 3 |
-
import json
|
| 4 |
import os
|
| 5 |
-
from openai import OpenAI
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
-
from
|
| 8 |
import gradio as gr
|
| 9 |
from PIL import Image
|
|
|
|
| 10 |
|
| 11 |
load_dotenv()
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
primary_hue=gr.themes.Color(
|
| 17 |
-
c50="#f7f7fd",
|
| 18 |
-
c100="#dfdef8",
|
| 19 |
-
c200="#c4c1f2",
|
| 20 |
-
c300="#a29eea",
|
| 21 |
-
c400="#8f8ae6",
|
| 22 |
-
c500="#756fe0",
|
| 23 |
-
c600="#635cc1",
|
| 24 |
-
c700="#4f4a9b",
|
| 25 |
-
c800="#433f83",
|
| 26 |
-
c900="#302d5e",
|
| 27 |
-
c950="#302d5e",
|
| 28 |
-
),
|
| 29 |
-
secondary_hue="rose",
|
| 30 |
-
neutral_hue="stone",
|
| 31 |
-
)
|
| 32 |
|
| 33 |
-
def
|
| 34 |
-
if
|
| 35 |
-
return
|
| 36 |
|
| 37 |
-
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
target_image_dim=1800,
|
| 45 |
-
target_text_length=8000,
|
| 46 |
-
page_num=page_number if page_number else 1
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
-
# Extract the image from the message content for display
|
| 50 |
-
image_url = messages[0]["content"][1]["image_url"]["url"]
|
| 51 |
-
image_base64 = image_url.replace("data:image/png;base64,", "")
|
| 52 |
-
image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
|
| 53 |
-
|
| 54 |
-
# Send messages to OpenAI compatible API
|
| 55 |
-
response = openai.chat.completions.create(
|
| 56 |
-
model=os.getenv("TYPHOON_OCR_MODEL"),
|
| 57 |
-
messages=messages,
|
| 58 |
-
max_tokens=16384,
|
| 59 |
-
extra_body={
|
| 60 |
-
"repetition_penalty": 1.2,
|
| 61 |
-
"temperature": 0.1,
|
| 62 |
-
"top_p": 0.6,
|
| 63 |
-
},
|
| 64 |
-
)
|
| 65 |
-
text_output = response.choices[0].message.content
|
| 66 |
-
|
| 67 |
-
# Try to parse the output assuming it is a Python dictionary containing 'natural_text'
|
| 68 |
-
try:
|
| 69 |
-
json_data = json.loads(text_output)
|
| 70 |
-
markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
|
| 71 |
-
except Exception as e:
|
| 72 |
-
markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
|
| 73 |
-
|
| 74 |
-
return image_pil, markdown_out
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
# Build the Gradio UI.
|
| 81 |
-
with gr.Blocks(theme=theme) as demo:
|
| 82 |
-
title = gr.HTML("""
|
| 83 |
-
<h1>Typhoon OCR</h1>
|
| 84 |
-
<ul>
|
| 85 |
-
<li>🤗 <b>Model weights</b>: <a href="https://huggingface.co/scb10x/typhoon-ocr-7b" target="_blank">https://huggingface.co/scb10x/typhoon-ocr-7b</a></li>
|
| 86 |
-
</ul>
|
| 87 |
-
<br />
|
| 88 |
-
<details>
|
| 89 |
-
<summary><strong>Disclaimer</strong></summary>
|
| 90 |
-
The responses generated by this Artificial Intelligence (AI) system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
|
| 91 |
-
</details>
|
| 92 |
-
<br />
|
| 93 |
-
<details>
|
| 94 |
-
<summary><strong>Terms of use</strong></summary>
|
| 95 |
-
By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Vision language models are prone to hallucinations to a greater extent compared to text-only LLMs.
|
| 96 |
-
</details>
|
| 97 |
-
<br />
|
| 98 |
-
<details>
|
| 99 |
-
<summary><strong>License</strong></summary>
|
| 100 |
-
This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
|
| 101 |
-
</details>
|
| 102 |
-
""")
|
| 103 |
-
with gr.Row():
|
| 104 |
-
with gr.Column(scale=1):
|
| 105 |
-
# Update file_types to accept PDF as well as common image formats.
|
| 106 |
-
pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
|
| 107 |
-
|
| 108 |
-
with gr.Group(elem_classes=["task-background"]):
|
| 109 |
-
task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
|
| 110 |
-
gr.HTML("""
|
| 111 |
-
<p><b>default</b>: This mode works for most cases and is recommended for files without a clear template such as infographics.</p>
|
| 112 |
-
<p><b>structure</b>: This mode offers improved performance for complex layout documents such as those containing images, tables and forms.</p>
|
| 113 |
-
<p>We recommend trying both and see which one works better for your use case.</p>
|
| 114 |
-
""", elem_classes=["task-dropdown-info"])
|
| 115 |
-
demo.css = """
|
| 116 |
-
.task-background {
|
| 117 |
-
background: var(--block-background-fill) !important;
|
| 118 |
-
|
| 119 |
-
}
|
| 120 |
-
.task-background > * {
|
| 121 |
-
background: var(--block-background-fill) !important;
|
| 122 |
-
}
|
| 123 |
-
.task-dropdown-info {
|
| 124 |
-
padding: 0 16px;
|
| 125 |
-
font-size: 12px;
|
| 126 |
-
}
|
| 127 |
-
"""
|
| 128 |
-
page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
|
| 129 |
-
run_button = gr.Button("🚀 Run")
|
| 130 |
-
image_output = gr.Image(label="📸 Preview Image", type="pil")
|
| 131 |
-
with gr.Column(scale=2):
|
| 132 |
-
markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
|
| 133 |
-
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
|
| 143 |
-
demo.launch(share=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Standard library
import os
from io import BytesIO

# Third-party
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
import gradio as gr
from PIL import Image

# Pull HF_API_TOKEN (and any other settings) from a local .env file.
load_dotenv()

# Initialize the Hugging Face Inference client.
# NOTE(review): if HF_API_TOKEN is unset this silently builds an
# unauthenticated client — confirm that is intended.
hf_token = os.getenv("HF_API_TOKEN")
client = InferenceClient(token=hf_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
def run_ocr(file):
    """Run Typhoon OCR on an uploaded image/PDF and return the recognized text.

    Parameters
    ----------
    file:
        Value produced by the ``gr.File`` input. Gradio supplies a filepath
        string (default) or a tempfile-like wrapper exposing ``.name`` — it
        is NOT an open file object, so the original ``file.read()`` raised
        at runtime.

    Returns
    -------
    str
        The OCR text, or a human-readable error message on failure.
    """
    if file is None:
        return "No file uploaded"

    # Resolve the upload to a filesystem path and read the raw bytes.
    path = file if isinstance(file, str) else getattr(file, "name", None)
    if path is None:
        return "Unsupported upload type"
    with open(path, "rb") as fh:
        data = fh.read()

    # Call the Typhoon OCR model. InferenceClient.text_generation() has no
    # `inputs` keyword and only accepts a text prompt; for an image-bytes
    # payload the matching task is image_to_text.
    # NOTE(review): PDFs are passed through as raw bytes — confirm the
    # serving endpoint accepts them, or rasterize the page first.
    try:
        result = client.image_to_text(data, model="scb10x/typhoon-ocr1.5-2b")
    except Exception as e:  # surface API failures in the UI, not as a crash
        return f"OCR request failed: {e}"

    # image_to_text() returns an ImageToTextOutput carrying `.generated_text`;
    # tolerate plain-string and list-of-dicts shapes from other endpoints.
    text = getattr(result, "generated_text", None)
    if text is not None:
        return text
    if isinstance(result, str):
        return result
    try:
        return result[0]["generated_text"]
    except (KeyError, IndexError, TypeError):
        return str(result)
|
| 31 |
+
|
| 32 |
+
# Assemble the Gradio front-end: an upload control, a result box, and a
# button wired to run_ocr.
with gr.Blocks() as demo:
    gr.Markdown("## Typhoon OCR Web App")
    file_input = gr.File(
        label="Upload PDF or Image",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
    )
    output_text = gr.Textbox(label="OCR Result", lines=20)
    run_btn = gr.Button("Run OCR")

    # Clicking the button feeds the uploaded file to run_ocr and renders
    # whatever text it returns.
    run_btn.click(fn=run_ocr, inputs=file_input, outputs=output_text)

demo.launch()
|
|
|