Upload 17 files

- .env.template +3 -0
- .gitattributes +1 -0
- .gitignore +3 -0
- README.md +79 -12
- app.py +67 -47
- examples/simple_ocr.py +10 -0
- examples/test.png +3 -0
- packages/typhoon_ocr/.gitignore +52 -0
- packages/typhoon_ocr/LICENSE +176 -0
- packages/typhoon_ocr/MANIFEST.in +3 -0
- packages/typhoon_ocr/README.md +134 -0
- packages/typhoon_ocr/build_and_upload.sh +1 -0
- packages/typhoon_ocr/pyproject.toml +12 -0
- packages/typhoon_ocr/setup.py +39 -0
- packages/typhoon_ocr/typhoon_ocr/__init__.py +39 -0
- packages/typhoon_ocr/typhoon_ocr/ocr_utils.py +623 -0
- packages/typhoon_ocr/typhoon_ocr/pdf_utils.py +46 -0
- requirements.txt +4 -2
.env.template
ADDED
@@ -0,0 +1,3 @@
+TYPHOON_BASE_URL=
+TYPHOON_API_KEY=
+TYPHOON_OCR_MODEL=typhoon-ocr-preview
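For context, `app.py` later in this commit consumes these three variables via python-dotenv; a minimal sketch of that wiring (mirroring the app code below) looks like:

```python
# Minimal sketch of how app.py reads the .env values above.
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # loads .env from the working directory

client = OpenAI(
    base_url=os.getenv("TYPHOON_BASE_URL"),  # OpenAI-compatible endpoint
    api_key=os.getenv("TYPHOON_API_KEY"),
)
model = os.getenv("TYPHOON_OCR_MODEL")  # e.g. "typhoon-ocr-preview"
```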
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/test.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.env
+*.pyc
+.specstory
README.md
CHANGED
@@ -1,12 +1,79 @@
-
-
-
-
-
-
-
-
-
-
-
-
+## Typhoon OCR
+
+Typhoon OCR is a model for extracting structured markdown from images or PDFs. It supports document layout analysis and table extraction, returning results in markdown or HTML. This repository provides a simple Gradio app to demonstrate the performance of Typhoon OCR.
+
+### Features
+
+- Upload a PDF or image (single page)
+- Extracts and reconstructs document content as markdown
+- Supports different prompt modes for layout or structure
+- Languages: English, Thai
+- Uses a local or remote OpenAI-compatible API (e.g., vllm, opentyphoon.ai)
+- See the blog post for more detail: https://opentyphoon.ai/blog/en/typhoon-ocr-release
+
+### Requirements
+
+- Linux / Mac with Python (Windows is not supported at the moment)
+
+### Install
+
+```bash
+pip install typhoon-ocr
+```
+
+or, to run the Gradio app:
+
+```bash
+pip install -r requirements.txt
+# edit .env
+# pip install vllm  # optional, for hosting a local server
+```
+
+### Mac specific
+
+```bash
+brew install poppler
+# The following binaries are required and provided by poppler:
+# - pdfinfo
+# - pdftoppm
+```
+
+### Linux specific
+
+```bash
+sudo apt-get update
+sudo apt-get install poppler-utils
+# The following binaries are required and provided by poppler-utils:
+# - pdfinfo
+# - pdftoppm
+```
+
+### Start vllm
+
+```bash
+vllm serve scb10x/typhoon-ocr-7b --served-model-name typhoon-ocr --dtype bfloat16 --port 8101
+```
+
+### Run Gradio demo
+
+```bash
+python app.py
+```
+
+### Dependencies
+
+- openai
+- python-dotenv
+- ftfy
+- pypdf
+- gradio
+- vllm (for hosting an inference server)
+- pillow
+
+### Debug
+
+- If `Error processing document` occurs, make sure you have installed Poppler (`brew install poppler` or `apt-get install poppler-utils`).
+
+### License
+
+This project is licensed under the Apache 2.0 License. See individual datasets and checkpoints for their respective licenses.
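For the self-hosted path, the `vllm serve` command and the `.env` template connect as follows; this is a sketch under the assumption that vllm exposes its OpenAI-compatible API under `/v1` on the chosen port:

```python
# Assumed wiring for the local vllm server started above:
#   TYPHOON_BASE_URL=http://localhost:8101/v1   (port from --port 8101; the /v1 path is an assumption)
#   TYPHOON_OCR_MODEL=typhoon-ocr               (from --served-model-name)
from openai import OpenAI

# vllm ignores the API key unless one was configured at startup; this value is a placeholder.
client = OpenAI(base_url="http://localhost:8101/v1", api_key="not-needed")
models = client.models.list()
print([m.id for m in models.data])  # should list "typhoon-ocr" if the server is up
```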
app.py
CHANGED
@@ -10,11 +10,7 @@ from PIL import Image
 
 load_dotenv()
 
-
-openai = OpenAI(
-    base_url=os.getenv("TYPHOON_BASE_URL"),
-    api_key=os.getenv("TYPHOON_API_KEY")
-)
+openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
 
 theme = gr.themes.Soft(
     primary_hue=gr.themes.Color(
@@ -37,11 +33,11 @@ theme = gr.themes.Soft(
 def process_pdf(pdf_or_image_file, task_type, page_number):
     if pdf_or_image_file is None:
         return None, "No file uploaded"
-
+
     orig_filename = pdf_or_image_file.name
-
+
     try:
-        #
+        # Use the new simplified function to prepare OCR messages with page number
         messages = prepare_ocr_messages(
             pdf_or_image_path=orig_filename,
             task_type=task_type,
@@ -49,13 +45,13 @@ def process_pdf(pdf_or_image_file, task_type, page_number):
             target_text_length=8000,
             page_num=page_number if page_number else 1
         )
-
-        # Extract image for
+
+        # Extract the image from the message content for display
         image_url = messages[0]["content"][1]["image_url"]["url"]
         image_base64 = image_url.replace("data:image/png;base64,", "")
         image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
-
-        # Send
+
+        # Send messages to OpenAI compatible API
         response = openai.chat.completions.create(
             model=os.getenv("TYPHOON_OCR_MODEL"),
             messages=messages,
@@ -66,58 +62,82 @@ def process_pdf(pdf_or_image_file, task_type, page_number):
                 "top_p": 0.6,
             },
         )
-
         text_output = response.choices[0].message.content
-
-        # Try to
+
+        # Try to parse the output assuming it is a Python dictionary containing 'natural_text'
         try:
             json_data = json.loads(text_output)
-            markdown_out = json_data.get(
-        except:
-            markdown_out =
-
+            markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
+        except Exception as e:
+            markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
+
         return image_pil, markdown_out
-
+
     except Exception as e:
-        return None, f"Error: {str(e)}"
+        return None, f"Error processing file: {str(e)}"
 
 
+# Build the Gradio UI.
 with gr.Blocks(theme=theme) as demo:
     title = gr.HTML("""
-    <h1>Typhoon OCR
-    <
-
-
+    <h1>Typhoon OCR</h1>
+    <ul>
+        <li>🤗 <b>Model weights</b>: <a href="https://huggingface.co/scb10x/typhoon-ocr-7b" target="_blank">https://huggingface.co/scb10x/typhoon-ocr-7b</a></li>
+    </ul>
+    <br />
+    <details>
+    <summary><strong>Disclaimer</strong></summary>
+    The responses generated by this Artificial Intelligence (AI) system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
+    </details>
+    <br />
+    <details>
+    <summary><strong>Terms of use</strong></summary>
+    By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Vision language models are prone to hallucinations to a greater extent compared to text-only LLMs.
+    </details>
+    <br />
+    <details>
+    <summary><strong>License</strong></summary>
+    This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
+    </details>
+    """)
    with gr.Row():
         with gr.Column(scale=1):
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Update file_types to accept PDF as well as common image formats.
+            pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
+
+            with gr.Group(elem_classes=["task-background"]):
+                task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
+                gr.HTML("""
+                <p><b>default</b>: This mode works for most cases and is recommended for files without a clear template such as infographics.</p>
+                <p><b>structure</b>: This mode offers improved performance for complex layout documents such as those containing images, tables and forms.</p>
+                <p>We recommend trying both and see which one works better for your use case.</p>
+                """, elem_classes=["task-dropdown-info"])
+            demo.css = """
+            .task-background {
+                background: var(--block-background-fill) !important;
+
+            }
+            .task-background > * {
+                background: var(--block-background-fill) !important;
+            }
+            .task-dropdown-info {
+                padding: 0 16px;
+                font-size: 12px;
+            }
+            """
+            page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
+            run_button = gr.Button("🚀 Run")
             image_output = gr.Image(label="📸 Preview Image", type="pil")
-
         with gr.Column(scale=2):
-            markdown_output = gr.Markdown(label=
+            markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
 
+
+    # Connect the UI inputs to the processing function.
     run_button.click(
         fn=process_pdf,
         inputs=[pdf_input, task_dropdown, page_number],
         outputs=[image_output, markdown_output]
     )
 
-demo
+
+# Launch the Gradio demo (temporary public share for 72 hours)
+demo.launch(share=False)
examples/simple_ocr.py
ADDED
@@ -0,0 +1,10 @@
+from typhoon_ocr import ocr_document
+import os
+
+# please set env TYPHOON_API_KEY or OPENAI_API_KEY to use this function
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
+image_path = os.path.join(script_dir, "test.png")
+
+markdown = ocr_document(image_path)
+print(markdown)
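If the key isn't exported in your shell, one way to experiment is to set it in-process before the import runs; a sketch, where the key string is purely a placeholder:

```python
# Sketch: supply the API key in-process instead of exporting it in the shell.
import os
os.environ.setdefault("TYPHOON_API_KEY", "<your-api-key>")  # placeholder, not a real key

from typhoon_ocr import ocr_document
print(ocr_document("test.png"))
```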
examples/test.png
ADDED
Git LFS Details
packages/typhoon_ocr/.gitignore
ADDED
@@ -0,0 +1,52 @@
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+*.egg
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+
+# Virtual environments
+venv/
+env/
+ENV/
+.env
+.venv
+env.bak/
+venv.bak/
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Sphinx documentation
+docs/_build/
+docs/api/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IDE specific files
+.idea/
+.vscode/
+*.swp
+*.swo
+.DS_Store
+
+# Project specific
+*.bak
+*.tmp
+temp/
packages/typhoon_ocr/LICENSE
ADDED
@@ -0,0 +1,176 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
packages/typhoon_ocr/MANIFEST.in
ADDED
@@ -0,0 +1,3 @@
+include LICENSE
+include README.md
+include pyproject.toml
packages/typhoon_ocr/README.md
ADDED
@@ -0,0 +1,134 @@
+# Typhoon OCR
+
+Typhoon OCR is a model for extracting structured markdown from images or PDFs. It supports document layout analysis and table extraction, returning results in markdown or HTML. This package provides utilities to convert images and PDFs to the format supported by the Typhoon OCR model.
+
+## Languages Supported
+
+The Typhoon OCR model supports:
+- English
+- Thai
+
+## Features
+
+- Convert images to PDFs for unified processing
+- Extract text and layout information from PDFs and images
+- Generate OCR-ready messages for API processing with the Typhoon OCR model
+- Built-in prompt templates for different document processing tasks
+- Process specific pages from multi-page PDF documents
+
+## Installation
+
+```bash
+pip install typhoon-ocr
+```
+
+### System Requirements
+
+The package requires the Poppler utilities to be installed on your system:
+
+#### For macOS:
+```bash
+brew install poppler
+```
+
+#### For Linux:
+```bash
+sudo apt-get update
+sudo apt-get install poppler-utils
+```
+
+The following binaries are required:
+- `pdfinfo`
+- `pdftoppm`
+
+## Usage
+
+### Core functionality
+
+The package provides 2 main functions:
+
+```python
+from typhoon_ocr import ocr_document, prepare_ocr_messages
+```
+* `ocr_document`: Full OCR pipeline for the Typhoon OCR model via opentyphoon.ai or an OpenAI-compatible API (such as vllm)
+* `prepare_ocr_messages`: Generate complete OCR-ready messages for the Typhoon OCR model
+
+
+### Complete OCR workflow
+
+Use the simplified API to OCR the document, or prepare messages for the OpenAI-compatible API at opentyphoon.ai:
+
+```python
+from typhoon_ocr import ocr_document
+
+markdown = ocr_document(
+    pdf_or_image_path="document.pdf",  # Works with PDFs or images
+    task_type="default",               # Choose between "default" or "structure"
+    page_num=2                         # Process page 2 of a PDF (default is 1, always 1 for images)
+)
+
+# Or with an image
+markdown = ocr_document(
+    pdf_or_image_path="scan.jpg",  # Works with PDFs or images
+    task_type="default",           # Choose between "default" or "structure"
+)
+```
+
+Or prepare the messages manually:
+
+```python
+import json
+from typhoon_ocr import prepare_ocr_messages
+from openai import OpenAI
+
+# Prepare messages for OCR processing
+messages = prepare_ocr_messages(
+    pdf_or_image_path="document.pdf",  # Works with PDFs or images
+    task_type="default",               # Choose between "default" or "structure"
+    page_num=2                         # Process page 2 of a PDF (default is 1, always 1 for images)
+)
+
+# Use with the https://opentyphoon.ai/ API or a self-hosted model via vllm
+# See the model list at https://huggingface.co/collections/scb10x/typhoon-ocr-682713483cb934ab0cf069bd
+client = OpenAI(base_url='https://api.opentyphoon.ai/v1')
+response = client.chat.completions.create(
+    model="typhoon-ocr-preview",
+    messages=messages,
+    max_tokens=16000,
+    extra_body={
+        "repetition_penalty": 1.2,
+        "temperature": 0.1,
+        "top_p": 0.6,
+    },
+)
+
+# Parse the JSON response
+text_output = response.choices[0].message.content
+markdown = json.loads(text_output)['natural_text']
+print(markdown)
+```
+
+### Available task types
+
+The package comes with built-in prompt templates for different OCR tasks:
+
+- `default`: Extracts a markdown representation of the document with tables in markdown format
+- `structure`: Provides more structured output with HTML tables and image analysis placeholders
+
+## Document Extraction Capabilities
+
+The Typhoon OCR model, when used with this package, can extract:
+
+- Structured text with proper layout preservation
+- Tables (in markdown or HTML format)
+- Document hierarchy (headings, paragraphs, lists)
+- Text with positional information
+- Basic image analysis and placement
+
+## License
+
+This project code is licensed under the Apache 2.0 License.
+
+## Acknowledgments
+
+The code is based on work from [OlmoCR](https://github.com/allenai/olmocr) under the Apache 2.0 license.
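One practical note on the `structure` task: its output can embed `<figure>...</figure>` analysis tags inside `natural_text`. The Gradio app in this commit simply strips those tags before rendering; a small helper in that spirit:

```python
import json

def extract_markdown(text_output: str) -> str:
    """Parse the model's JSON output and strip <figure> wrappers, as app.py does."""
    natural_text = json.loads(text_output).get("natural_text", "")
    return natural_text.replace("<figure>", "").replace("</figure>", "")
```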
packages/typhoon_ocr/build_and_upload.sh
ADDED
@@ -0,0 +1 @@
+python -m build && python -m twine upload dist/* --verbose
packages/typhoon_ocr/pyproject.toml
ADDED
@@ -0,0 +1,12 @@
+[build-system]
+requires = ["setuptools>=42", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 88
+target-version = ["py37"]
+include = '\.pyi?$'
+
+[tool.isort]
+profile = "black"
+line_length = 88
packages/typhoon_ocr/setup.py
ADDED
@@ -0,0 +1,39 @@
+from setuptools import setup, find_packages
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setup(
+    name="typhoon-ocr",
+    version="0.3.8",
+    author="Typhoon OCR Contributors",
+    author_email="contact@opentyphoon.ai",
+    description="A package for extracting structured content from PDFs and images using Typhoon OCR models",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/scb-10x/typhoon-ocr",
+    packages=find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.7",
+    install_requires=[
+        "ftfy",
+        "pypdf",
+        "pillow",
+        "openai",
+    ],
+    extras_require={
+        "dev": [
+            "pytest",
+            "black",
+            "flake8",
+        ],
+        "app": [
+            "gradio",
+            "python-dotenv",
+        ],
+    },
+)
packages/typhoon_ocr/typhoon_ocr/__init__.py
ADDED
@@ -0,0 +1,39 @@
+"""
+Typhoon OCR is a model for extracting structured markdown from images or PDFs.
+
+This package provides utilities for document analysis, layout extraction, and OCR processing.
+It focuses on structured text extraction with proper formatting and layout preservation.
+
+Main Functions:
+- prepare_ocr_messages: Generate OCR-ready messages from PDFs or images
+- get_prompt: Access built-in prompt templates for different OCR tasks
+- image_to_pdf: Convert image files to PDF format
+
+Requirements:
+- Poppler utilities (pdfinfo, pdftoppm) must be installed on the system
+- Appropriate dependencies (ftfy, pypdf, pillow) for text processing
+
+Example Usage:
+    >>> from typhoon_ocr import prepare_ocr_messages
+    >>> messages = prepare_ocr_messages("document.pdf", task_type="default", page_num=1)
+    >>> # Use messages with LLM API for OCR processing
+"""
+from .pdf_utils import pdf_utils_available
+from .ocr_utils import (
+    prepare_ocr_messages,
+    get_prompt,
+    get_anchor_text,
+    image_to_pdf,
+    ocr_document,
+)
+
+__version__ = "0.3.8"
+
+__all__ = [
+    "pdf_utils_available",
+    "prepare_ocr_messages",
+    "get_prompt",
+    "get_anchor_text",
+    "image_to_pdf",
+    "ocr_document",
+]
packages/typhoon_ocr/typhoon_ocr/ocr_utils.py
ADDED
|
@@ -0,0 +1,623 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for Typhoon OCR.
|
| 3 |
+
|
| 4 |
+
This code is adapted from https://github.com/allenai/olmocr
|
| 5 |
+
Under the Apache 2.0 license.
|
| 6 |
+
Edited by Typhoon OCR Contributors.
|
| 7 |
+
"""
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
import json
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
import os
|
| 12 |
+
import re
|
| 13 |
+
import io
|
| 14 |
+
import tempfile
|
| 15 |
+
from PIL import Image
|
| 16 |
+
import subprocess
|
| 17 |
+
import base64
|
| 18 |
+
from typing import Any, Callable, Dict, List, Literal
|
| 19 |
+
import random
|
| 20 |
+
import ftfy
|
| 21 |
+
from pypdf.generic import RectangleObject
|
| 22 |
+
from pypdf import PdfReader
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass(frozen=True)
|
| 26 |
+
class Element:
|
| 27 |
+
pass
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
|
| 31 |
+
class BoundingBox:
|
| 32 |
+
x0: float
|
| 33 |
+
y0: float
|
| 34 |
+
x1: float
|
| 35 |
+
y1: float
|
| 36 |
+
|
| 37 |
+
@staticmethod
|
| 38 |
+
def from_rectangle(rect: RectangleObject) -> "BoundingBox":
|
| 39 |
+
return BoundingBox(rect[0], rect[1], rect[2], rect[3])
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass(frozen=True)
|
| 43 |
+
class TextElement(Element):
|
| 44 |
+
text: str
|
| 45 |
+
x: float
|
| 46 |
+
y: float
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass(frozen=True)
|
| 50 |
+
class ImageElement(Element):
|
| 51 |
+
name: str
|
| 52 |
+
bbox: BoundingBox
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass(frozen=True)
|
| 56 |
+
class PageReport:
|
| 57 |
+
mediabox: BoundingBox
|
| 58 |
+
text_elements: List[TextElement]
|
| 59 |
+
image_elements: List[ImageElement]
|
| 60 |
+
|
| 61 |
+
def image_to_pdf(image_path):
|
| 62 |
+
try:
|
| 63 |
+
# Open the image file.
|
| 64 |
+
img = Image.open(image_path)
|
| 65 |
+
# Create a temporary file to store the PDF.
|
| 66 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
| 67 |
+
filename = tmp.name
|
| 68 |
+
temp_pdf_created = True
|
| 69 |
+
# Convert image to RGB if necessary and save as PDF.
|
| 70 |
+
if img.mode != "RGB":
|
| 71 |
+
img = img.convert("RGB")
|
| 72 |
+
img.save(filename, "PDF")
|
| 73 |
+
return filename
|
| 74 |
+
except Exception as conv_err:
|
| 75 |
+
return None
|
| 76 |
+
|
| 77 |
+
def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
|
| 78 |
+
"""
|
| 79 |
+
Get the MediaBox dimensions for a specific page in a PDF file using the pdfinfo command.
|
| 80 |
+
|
| 81 |
+
:param pdf_file: Path to the PDF file
|
| 82 |
+
:param page_num: The page number for which to extract MediaBox dimensions
|
| 83 |
+
:return: A dictionary containing MediaBox dimensions or None if not found
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
# Construct the pdfinfo command to extract info for the specific page
|
| 87 |
+
command = ["pdfinfo", "-f", str(page_num), "-l", str(page_num), "-box", "-enc", "UTF-8", local_pdf_path]
|
| 88 |
+
# Run the command using subprocess
|
| 89 |
+
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
| 90 |
+
|
| 91 |
+
# Check if there is any error in executing the command
|
| 92 |
+
if result.returncode != 0:
|
| 93 |
+
raise ValueError(f"Error running pdfinfo: {result.stderr}")
|
| 94 |
+
|
| 95 |
+
# Parse the output to find MediaBox
|
| 96 |
+
output = result.stdout
|
| 97 |
+
|
| 98 |
+
for line in output.splitlines():
|
| 99 |
+
if "MediaBox" in line:
|
| 100 |
+
media_box_str: List[str] = line.split(":")[1].strip().split()
|
| 101 |
+
media_box: List[float] = [float(x) for x in media_box_str]
|
| 102 |
+
return abs(media_box[0] - media_box[2]), abs(media_box[3] - media_box[1])
|
| 103 |
+
|
| 104 |
+
raise ValueError("MediaBox not found in the PDF info.")
|
| 105 |
+
|
| 106 |
+
def render_pdf_to_base64png(local_pdf_path: str, page_num: int, target_longest_image_dim: int = 2048) -> str:
|
| 107 |
+
|
| 108 |
+
longest_dim = max(get_pdf_media_box_width_height(local_pdf_path, page_num))
|
| 109 |
+
|
| 110 |
+
# Convert PDF page to PNG using pdftoppm
|
| 111 |
+
pdftoppm_result = subprocess.run(
|
| 112 |
+
[
|
| 113 |
+
"pdftoppm",
|
| 114 |
+
"-png",
|
| 115 |
+
"-f",
|
| 116 |
+
str(page_num),
|
| 117 |
+
"-l",
|
| 118 |
+
str(page_num),
|
| 119 |
+
"-r",
|
| 120 |
+
str(target_longest_image_dim * 72 / longest_dim), # 72 pixels per point is the conversion factor
|
| 121 |
+
local_pdf_path,
|
| 122 |
+
],
|
| 123 |
+
timeout=120,
|
| 124 |
+
stdout=subprocess.PIPE,
|
| 125 |
+
stderr=subprocess.PIPE,
|
| 126 |
+
)
|
| 127 |
+
assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
|
| 128 |
+
return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
|
| 132 |
+
result = ""
|
| 133 |
+
result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"
|
| 134 |
+
|
| 135 |
+
if max_length < 20:
|
| 136 |
+
return result
|
| 137 |
+
|
| 138 |
+
images = _merge_image_elements(report.image_elements)
|
| 139 |
+
|
| 140 |
+
# Process image elements
|
| 141 |
+
image_strings = []
|
| 142 |
+
for element in images:
|
| 143 |
+
image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]\n"
|
| 144 |
+
# Use element's unique identifier (e.g., id or position) for comparison
|
| 145 |
+
image_strings.append((element, image_str))
|
| 146 |
+
|
| 147 |
+
# Process text elements
|
| 148 |
+
text_strings = []
|
| 149 |
+
for element in report.text_elements: # type: ignore
|
| 150 |
+
if len(element.text.strip()) == 0: # type: ignore
|
| 151 |
+
continue
|
| 152 |
+
|
| 153 |
+
element_text = _cleanup_element_text(element.text) # type: ignore
|
| 154 |
+
text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n" # type: ignore
|
| 155 |
+
text_strings.append((element, text_str))
|
| 156 |
+
|
| 157 |
+
# Combine all elements with their positions for sorting
|
| 158 |
+
all_elements: list[tuple[str, ImageElement, str, tuple[float, float]]] = []
|
| 159 |
+
for elem, s in image_strings:
|
| 160 |
+
position = (elem.bbox.x0, elem.bbox.y0)
|
| 161 |
+
all_elements.append(("image", elem, s, position))
|
| 162 |
+
for elem, s in text_strings:
|
| 163 |
+
position = (elem.x, elem.y) # type: ignore
|
| 164 |
+
all_elements.append(("text", elem, s, position))
|
| 165 |
+
|
| 166 |
+
# Calculate total length
|
| 167 |
+
total_length = len(result) + sum(len(s) for _, _, s, _ in all_elements)
|
| 168 |
+
|
| 169 |
+
if total_length <= max_length:
|
| 170 |
+
# Include all elements
|
| 171 |
+
for _, _, s, _ in all_elements:
|
| 172 |
+
result += s
|
| 173 |
+
return result
|
| 174 |
+
|
| 175 |
+
# Identify elements with min/max coordinates
|
| 176 |
+
edge_elements = set()
|
| 177 |
+
|
| 178 |
+
if images:
|
| 179 |
+
min_x0_image = min(images, key=lambda e: e.bbox.x0)
|
| 180 |
+
max_x1_image = max(images, key=lambda e: e.bbox.x1)
|
| 181 |
+
min_y0_image = min(images, key=lambda e: e.bbox.y0)
|
| 182 |
+
max_y1_image = max(images, key=lambda e: e.bbox.y1)
|
| 183 |
+
edge_elements.update([min_x0_image, max_x1_image, min_y0_image, max_y1_image])
|
| 184 |
+
|
| 185 |
+
if report.text_elements:
|
| 186 |
+
text_elements = [e for e in report.text_elements if len(e.text.strip()) > 0]
|
| 187 |
+
if text_elements:
|
| 188 |
+
min_x_text = min(text_elements, key=lambda e: e.x)
|
| 189 |
+
max_x_text = max(text_elements, key=lambda e: e.x)
|
| 190 |
+
min_y_text = min(text_elements, key=lambda e: e.y)
|
| 191 |
+
max_y_text = max(text_elements, key=lambda e: e.y)
|
| 192 |
+
edge_elements.update([min_x_text, max_x_text, min_y_text, max_y_text]) # type: ignore
|
| 193 |
+
|
| 194 |
+
# Keep track of element IDs to prevent duplication
|
| 195 |
+
selected_element_ids = set()
|
| 196 |
+
selected_elements = []
|
| 197 |
+
|
| 198 |
+
# Include edge elements first
|
| 199 |
+
for elem_type, elem, s, position in all_elements:
|
| 200 |
+
if elem in edge_elements and id(elem) not in selected_element_ids:
|
| 201 |
+
selected_elements.append((elem_type, elem, s, position))
|
| 202 |
+
selected_element_ids.add(id(elem))
|
| 203 |
+
|
| 204 |
+
# Calculate remaining length
|
| 205 |
+
current_length = len(result) + sum(len(s) for _, _, s, _ in selected_elements)
|
| 206 |
+
_remaining_length = max_length - current_length
|
| 207 |
+
|
| 208 |
+
# Exclude edge elements from the pool
|
| 209 |
+
remaining_elements = [(elem_type, elem, s, position) for elem_type, elem, s, position in all_elements if id(elem) not in selected_element_ids]
|
| 210 |
+
|
| 211 |
+
# Sort remaining elements by their positions (e.g., x-coordinate and then y-coordinate)
|
| 212 |
+
# remaining_elements.sort(key=lambda x: (x[3][0], x[3][1]))
|
| 213 |
+
|
| 214 |
+
# Shuffle remaining elements randomly
|
| 215 |
+
random.shuffle(remaining_elements)
|
| 216 |
+
|
| 217 |
+
# Add elements until reaching max_length
|
| 218 |
+
for elem_type, elem, s, position in remaining_elements:
|
| 219 |
+
if current_length + len(s) > max_length:
|
| 220 |
+
break
|
| 221 |
+
selected_elements.append((elem_type, elem, s, position))
|
| 222 |
+
selected_element_ids.add(id(elem))
|
| 223 |
+
current_length += len(s)
|
| 224 |
+
|
| 225 |
+
# Sort selected elements by their positions to maintain logical order
|
| 226 |
+
selected_elements.sort(key=lambda x: (x[3][0], x[3][1]))
|
| 227 |
+
|
| 228 |
+
# Build the final result
|
| 229 |
+
for _, _, s, _ in selected_elements:
|
| 230 |
+
result += s
|
| 231 |
+
|
| 232 |
+
return result
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def _cap_split_string(text: str, max_length: int) -> str:
|
| 236 |
+
if len(text) <= max_length:
|
| 237 |
+
return text
|
| 238 |
+
|
| 239 |
+
head_length = max_length // 2 - 3
|
| 240 |
+
tail_length = head_length
|
| 241 |
+
|
| 242 |
+
head = text[:head_length].rsplit(" ", 1)[0] or text[:head_length]
|
| 243 |
+
tail = text[-tail_length:].split(" ", 1)[-1] or text[-tail_length:]
|
| 244 |
+
|
| 245 |
+
return f"{head} ... {tail}"
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _cleanup_element_text(element_text: str) -> str:
|
| 249 |
+
MAX_TEXT_ELEMENT_LENGTH = 250
|
| 250 |
+
TEXT_REPLACEMENTS = {"[": "\\[", "]": "\\]", "\n": "\\n", "\r": "\\r", "\t": "\\t"}
|
| 251 |
+
text_replacement_pattern = re.compile("|".join(re.escape(key) for key in TEXT_REPLACEMENTS.keys()))
|
| 252 |
+
|
| 253 |
+
element_text = ftfy.fix_text(element_text).strip()
|
| 254 |
+
|
| 255 |
+
# Replace square brackets with escaped brackets and other escaped chars
|
| 256 |
+
element_text = text_replacement_pattern.sub(lambda match: TEXT_REPLACEMENTS[match.group(0)], element_text)
|
| 257 |
+
|
| 258 |
+
return _cap_split_string(element_text, MAX_TEXT_ELEMENT_LENGTH)
|
| 259 |
+
|
| 260 |
+
def _merge_image_elements(images: List[ImageElement], tolerance: float = 0.5) -> List[ImageElement]:
|
| 261 |
+
n = len(images)
|
| 262 |
+
parent = list(range(n)) # Initialize Union-Find parent pointers
|
| 263 |
+
|
| 264 |
+
def find(i):
|
| 265 |
+
# Find with path compression
|
| 266 |
+
root = i
|
| 267 |
+
while parent[root] != root:
|
| 268 |
+
root = parent[root]
|
| 269 |
+
while parent[i] != i:
|
| 270 |
+
parent_i = parent[i]
|
| 271 |
+
parent[i] = root
|
| 272 |
+
i = parent_i
|
| 273 |
+
return root
|
| 274 |
+
|
| 275 |
+
def union(i, j):
|
| 276 |
+
# Union by attaching root of one tree to another
|
| 277 |
+
root_i = find(i)
|
| 278 |
+
root_j = find(j)
|
| 279 |
+
if root_i != root_j:
|
| 280 |
+
parent[root_i] = root_j
|
| 281 |
+
|
| 282 |
+
def bboxes_overlap(b1: BoundingBox, b2: BoundingBox, tolerance: float) -> bool:
|
| 283 |
+
# Compute horizontal and vertical distances between boxes
|
| 284 |
+
h_dist = max(0, max(b1.x0, b2.x0) - min(b1.x1, b2.x1))
|
| 285 |
+
v_dist = max(0, max(b1.y0, b2.y0) - min(b1.y1, b2.y1))
|
| 286 |
+
# Check if distances are within tolerance
|
| 287 |
+
return h_dist <= tolerance and v_dist <= tolerance
|
| 288 |
+
|
| 289 |
+
# Union overlapping images
|
| 290 |
+
for i in range(n):
|
| 291 |
+
for j in range(i + 1, n):
|
| 292 |
+
if bboxes_overlap(images[i].bbox, images[j].bbox, tolerance):
|
| 293 |
+
union(i, j)
|
| 294 |
+
|
| 295 |
+
# Group images by their root parent
|
| 296 |
+
groups: dict[int, list[int]] = {}
|
| 297 |
+
for i in range(n):
|
| 298 |
+
root = find(i)
|
| 299 |
+
groups.setdefault(root, []).append(i)
|
| 300 |
+
|
| 301 |
+
# Merge images in the same group
|
| 302 |
+
merged_images = []
|
| 303 |
+
for indices in groups.values():
|
| 304 |
+
# Initialize merged bounding box
|
| 305 |
+
merged_bbox = images[indices[0]].bbox
|
| 306 |
+
merged_name = images[indices[0]].name
|
| 307 |
+
|
| 308 |
+
for idx in indices[1:]:
|
| 309 |
+
bbox = images[idx].bbox
|
| 310 |
+
# Expand merged_bbox to include the current bbox
|
| 311 |
+
merged_bbox = BoundingBox(
|
| 312 |
+
x0=min(merged_bbox.x0, bbox.x0),
|
| 313 |
+
y0=min(merged_bbox.y0, bbox.y0),
|
| 314 |
+
x1=max(merged_bbox.x1, bbox.x1),
|
| 315 |
+
y1=max(merged_bbox.y1, bbox.y1),
|
| 316 |
+
)
|
| 317 |
+
# Optionally, update the name
|
| 318 |
+
merged_name += f"+{images[idx].name}"
|
| 319 |
+
|
| 320 |
+
merged_images.append(ImageElement(name=merged_name, bbox=merged_bbox))
|
| 321 |
+
|
| 322 |
+
# Return the merged images along with other elements
|
| 323 |
+
return merged_images
|
| 324 |
+
|
| 325 |
+
def _transform_point(x, y, m):
|
| 326 |
+
x_new = m[0] * x + m[2] * y + m[4]
|
| 327 |
+
y_new = m[1] * x + m[3] * y + m[5]
|
| 328 |
+
return x_new, y_new
|
| 329 |
+
|
| 330 |
+
def _mult(m: List[float], n: List[float]) -> List[float]:
|
| 331 |
+
return [
|
| 332 |
+
m[0] * n[0] + m[1] * n[2],
|
| 333 |
+
m[0] * n[1] + m[1] * n[3],
|
| 334 |
+
m[2] * n[0] + m[3] * n[2],
|
| 335 |
+
m[2] * n[1] + m[3] * n[3],
|
| 336 |
+
m[4] * n[0] + m[5] * n[2] + n[4],
|
| 337 |
+
m[4] * n[1] + m[5] * n[3] + n[5],
|
| 338 |
+
]
|
| 339 |
+
def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
    reader = PdfReader(local_pdf_path)
    page = reader.pages[page_num - 1]
    resources = page.get("/Resources", {})
    xobjects = resources.get("/XObject", {})
    text_elements, image_elements = [], []

    def visitor_body(text, cm, tm, font_dict, font_size):
        # Map the text matrix through the CTM; indices 4 and 5 hold the translation (x, y)
        txt2user = _mult(tm, cm)
        text_elements.append(TextElement(text, txt2user[4], txt2user[5]))

    def visitor_op(op, args, cm, tm):
        if op == b"Do":
            xobject_name = args[0]
            xobject = xobjects.get(xobject_name)
            if xobject and xobject["/Subtype"] == "/Image":
                # Compute the image bbox: the unit square is placed according to the CTM
                _width = xobject.get("/Width")
                _height = xobject.get("/Height")
                x0, y0 = _transform_point(0, 0, cm)
                x1, y1 = _transform_point(1, 1, cm)
                image_elements.append(
                    ImageElement(xobject_name, BoundingBox(min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)))
                )

    page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)

    return PageReport(
        mediabox=BoundingBox.from_rectangle(page.mediabox),
        text_elements=text_elements,
        image_elements=image_elements,
    )

def get_anchor_text(
    local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
) -> str:
    assert page > 0, "Pages are 1-indexed in pdf-land"

    if pdf_engine == "pdfreport":
        return _linearize_pdf_report(_pdf_report(local_pdf_path, page), max_length=target_length)
    else:
        raise NotImplementedError(f"pdf_engine '{pdf_engine}' is not implemented; only 'pdfreport' is supported")

PROMPTS_SYS = {
    "default": lambda base_text: (
        f"Below is an image of a document page along with its dimensions. "
        f"Simply return the markdown representation of this document, presenting tables in markdown format as they naturally appear.\n"
        f"If the document contains images, use a placeholder like dummy.png for each image.\n"
        f"Your final output must be in JSON format with a single key `natural_text` containing the response.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    ),
    "structure": lambda base_text: (
        f"Below is an image of a document page, along with its dimensions and possibly some raw textual content previously extracted from it. "
        f"Note that the text extraction may be incomplete or partially missing. Carefully consider both the layout and any available text to reconstruct the document accurately.\n"
        f"Your task is to return the markdown representation of this document, presenting tables in HTML format as they naturally appear.\n"
        f"If the document contains images or figures, analyze them and include the tag <figure>IMAGE_ANALYSIS</figure> in the appropriate location.\n"
        f"Your final output must be in JSON format with a single key `natural_text` containing the response.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    ),
}

def get_prompt(prompt_name: str) -> Callable[[str], str]:
    """
    Get a prompt template function for the specified prompt type.

    This function returns a callable that generates a prompt template based on the provided prompt name.
    The returned function takes extracted text as input and returns a formatted prompt string
    that can be used with OCR/vision models.

    Available prompt types:
    - "default": Creates a prompt for extracting text with tables in markdown format.
    - "structure": Creates a prompt for extracting text with tables in HTML format and image analysis.

    Args:
        prompt_name (str): The identifier for the desired prompt template ("default" or "structure").

    Returns:
        Callable[[str], str]: A function that takes extracted text and returns a formatted prompt.

    Examples:
        >>> prompt_fn = get_prompt("default")
        >>> formatted_prompt = prompt_fn("Sample extracted text")
        >>> print(formatted_prompt[:51])  # Print first 51 chars
        Below is an image of a document page along with its
    """
    return PROMPTS_SYS.get(prompt_name, lambda x: "Invalid PROMPT_NAME provided.")

def image_to_base64png(img: Image.Image):
    buffered = io.BytesIO()
    img = img.convert("RGB")
    # NOTE: despite the function name, this encodes JPEG; the data URL built in
    # prepare_ocr_messages declares image/png, and servers generally decode by content.
    img.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

def get_anchor_text_from_image(img: Image.Image):
    width = float(img.width)
    height = float(img.height)
    text = f"""Page dimensions: {width:.1f}x{height:.1f}\n[Image 0x0 to {width:.0f}x{height:.0f}]\n"""
    return text

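For reference, `get_anchor_text_from_image` produces the same anchor header the PDF path emits, just without per-element positions. A quick check, with the helper above in scope:

```python
from PIL import Image

img = Image.new("RGB", (1654, 2339))  # stand-in for a real scanned page
print(get_anchor_text_from_image(img))
# Page dimensions: 1654.0x2339.0
# [Image 0x0 to 1654x2339]
```
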
def prepare_ocr_messages(
    pdf_or_image_path: str,
    task_type: str = "default",
    target_image_dim: int = 1800,
    target_text_length: int = 8000,
    page_num: int = 1,
) -> List[Dict[str, Any]]:
    """
    Prepare messages for OCR processing from a PDF or image file.

    This function provides an end-to-end workflow that combines multiple processing steps
    into a single call, creating messages ready for OCR processing with language models.
    It handles both image and PDF inputs, with appropriate page selection for PDFs.

    Processing Steps:
    1. Handle images directly (images are always treated as single pages)
    2. Render the selected PDF page to base64 PNG
    3. Extract anchor text from the page with position information
    4. Apply appropriate prompt template based on task type
    5. Create a messages structure ready for LLM API submission

    Args:
        pdf_or_image_path (str): Path to a PDF or image file to process
        task_type (str): Type of OCR task - "default" for standard markdown extraction,
                         "structure" for enhanced layout analysis with HTML tables
        target_image_dim (int): Target longest dimension for the rendered image in pixels
        target_text_length (int): Maximum length of extracted text to include
        page_num (int): Page number to process (default=1, for images always 1)

    Returns:
        List[Dict[str, Any]]: Messages structure ready for OCR processing with an LLM API,
                              containing both text prompt and image data

    Raises:
        ValueError: If image conversion fails, page number is out of range, or other processing errors occur

    Examples:
        >>> # Process the first page of a PDF
        >>> messages = prepare_ocr_messages("document.pdf")
        >>>
        >>> # Process page 5 of a PDF with structure analysis
        >>> messages = prepare_ocr_messages(
        ...     pdf_or_image_path="multipage.pdf",
        ...     task_type="structure",
        ...     page_num=5
        ... )
        >>>
        >>> # Process an image file (always page 1)
        >>> messages = prepare_ocr_messages("scan.jpg")
    """
    # Determine if the file is a PDF or an image by extension
    ext = os.path.splitext(pdf_or_image_path)[1].lower()
    is_image = ext not in [".pdf"]
    filename = pdf_or_image_path

    try:
        if is_image:
            page_num = 1
            img = Image.open(pdf_or_image_path)
            # Render the image to base64 PNG
            image_base64 = image_to_base64png(img)
            # Get anchor text from the image
            anchor_text = get_anchor_text_from_image(img)
        else:
            if page_num < 1:
                page_num = 1
            else:
                page_num = int(page_num)  # cast to int
            # Render the selected page to base64 PNG
            image_base64 = render_pdf_to_base64png(
                filename, page_num, target_longest_image_dim=target_image_dim
            )
            # Extract anchor text from the selected PDF page
            anchor_text = get_anchor_text(
                filename,
                page_num,
                pdf_engine="pdfreport",
                target_length=target_text_length,
            )

        # Get the prompt template function for the specified task type
        prompt_fn = get_prompt(task_type)

        # Apply the prompt template to the extracted anchor text
        prompt_text = prompt_fn(anchor_text)

        # Create messages structure
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]

        return messages
    except IndexError:
        raise ValueError(f"Page number {page_num} is out of range for the document {pdf_or_image_path}")
    except Exception as e:
        raise ValueError(f"Error processing document: {str(e)}") from e

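Since `prepare_ocr_messages` returns a standard chat-completions payload, it can be sent to any OpenAI-compatible server rather than only the hosted API. A minimal sketch against a locally hosted model (the localhost URL and file path are placeholders):

```python
from openai import OpenAI
from typhoon_ocr import prepare_ocr_messages  # assuming the package exports it

messages = prepare_ocr_messages("document.pdf", task_type="structure", page_num=2)

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")  # e.g. a local vllm server
response = client.chat.completions.create(
    model="typhoon-ocr-preview",
    messages=messages,
    max_tokens=16384,
)
print(response.choices[0].message.content)  # JSON string with a `natural_text` key
```
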
def is_base64_string(input_string: str) -> bool:
    try:
        # Try to decode and re-encode to check validity
        return base64.b64encode(base64.b64decode(input_string))[:10] == input_string.encode()[:10]
    except Exception:
        return False

def ensure_image_in_path(input_string: str) -> str:
    """
    Detect whether the input is a base64-encoded image or a file path.

    - If it's base64, decode and save it as a temporary image file.
    - If it's a valid image format (e.g. JPEG, PNG), preserve the format.
    - If it's not base64, return the input as-is (assumed to be a path).

    Returns:
        str: A file path (either the original or a temp file path if base64).
    """
    if input_string.endswith((".png", ".jpg", ".jpeg", ".pdf")):
        return input_string
    elif is_base64_string(input_string):
        try:
            image_data = base64.b64decode(input_string)
            image = Image.open(io.BytesIO(image_data))
            image_format = image.format.lower()  # e.g. 'jpeg', 'png'
            # Save image to a temporary file with the correct extension
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{image_format}")
            image.save(temp_file.name, format=image_format)
            return temp_file.name
        except Exception:
            return input_string
    return input_string

def ocr_document(
    pdf_or_image_path: str,
    task_type: str = "default",
    target_image_dim: int = 1800,
    target_text_length: int = 8000,
    page_num: int = 1,
    base_url: str = os.getenv("TYPHOON_BASE_URL", "https://api.opentyphoon.ai/v1"),
    api_key: str = None,
    model: str = "typhoon-ocr-preview",
) -> str:
    """
    OCR a PDF or image file.

    This function provides an end-to-end workflow that combines multiple processing steps
    into a single call: it prepares the request messages, sends them to an OpenAI-compatible
    endpoint, and parses the model's JSON response. It handles both image and PDF inputs,
    with appropriate page selection for PDFs.

    Args:
        pdf_or_image_path (str): Path to a PDF or image file to process
        task_type (str): Type of OCR task - "default" for standard markdown extraction,
                         "structure" for enhanced layout analysis with HTML tables
        target_image_dim (int): Target longest dimension for the rendered image in pixels
        target_text_length (int): Maximum length of extracted text to include
        page_num (int): Page number to process (default=1, for images always 1)
        base_url (str): API base URL
        api_key (str): API key for authentication (will also check environment variables if None)
        model (str): Model identifier to use for OCR

    Returns:
        str: Extracted text content in the specified format

    Raises:
        ValueError: If image conversion fails, page number is out of range, or other processing errors occur
    """
    pdf_or_image_path = ensure_image_in_path(pdf_or_image_path)

    openai = OpenAI(base_url=base_url, api_key=api_key or os.getenv("TYPHOON_OCR_API_KEY") or os.getenv("TYPHOON_API_KEY") or os.getenv("OPENAI_API_KEY"))
    messages = prepare_ocr_messages(
        pdf_or_image_path=pdf_or_image_path,
        task_type=task_type,
        target_image_dim=target_image_dim,
        target_text_length=target_text_length,
        page_num=page_num if page_num else 1,
    )
    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=16384,
        extra_body={
            "repetition_penalty": 1.2,
            "temperature": 0.1,
            "top_p": 0.6,
        },
    )
    text_output = response.choices[0].message.content
    text = json.loads(text_output)["natural_text"]
    return text

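Taken together, these helpers reduce end-to-end usage to a single call. A minimal sketch, assuming `TYPHOON_API_KEY` is set in the environment and `ocr_document` is exported at the package top level (the PDF path is a placeholder; `examples/test.png` ships with this Space):

```python
from typhoon_ocr import ocr_document

# Default task: markdown output with markdown tables
markdown = ocr_document("examples/test.png")
print(markdown)

# Structure task: HTML tables plus <figure> image analysis, reading page 1 of a PDF
layout = ocr_document("document.pdf", task_type="structure", page_num=1)
print(layout)
```
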
packages/typhoon_ocr/typhoon_ocr/pdf_utils.py
ADDED
@@ -0,0 +1,46 @@

import shutil
import warnings

def check_pdf_utilities():
    """
    Check if the required Poppler utilities (pdfinfo and pdftoppm) are installed.

    This function verifies if the necessary PDF utilities are available on the system
    and provides helpful instructions if they are missing.

    Returns:
        bool: True if all required utilities are available, False otherwise.
    """
    missing_utils = []

    # Check for pdfinfo
    if shutil.which("pdfinfo") is None:
        missing_utils.append("pdfinfo")

    # Check for pdftoppm
    if shutil.which("pdftoppm") is None:
        missing_utils.append("pdftoppm")

    if missing_utils:
        warning_message = (
            f"WARNING: The following required Poppler utilities are missing: {', '.join(missing_utils)}.\n"
            "These utilities are required for PDF processing in Typhoon OCR.\n\n"
            "Installation instructions:\n"
            "- macOS: Run 'brew install poppler'\n"
            "- Ubuntu/Debian: Run 'apt-get install poppler-utils'\n"
            "- Windows: Install from https://github.com/oschwartz10612/poppler-windows/releases/ and add to PATH\n"
        )
        warnings.warn(warning_message, ImportWarning)
        return False

    return True

pdf_utils_available = check_pdf_utilities()
if not pdf_utils_available:
    message = (
        "PDF utilities are not available.\n"
        "Installation instructions for Poppler utilities:\n"
        "- macOS: Run 'brew install poppler'\n"
        "- Ubuntu/Debian: Run 'apt-get install poppler-utils'\n"
        "- Windows: Install from https://github.com/oschwartz10612/poppler-windows/releases/ and add to PATH"
    )
    raise ImportError(message)

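Note that this module runs its check and raises `ImportError` at import time when Poppler is missing, so (assuming the package imports this module when it loads) callers that want to degrade gracefully can guard the import; a sketch:

```python
try:
    from typhoon_ocr import ocr_document
    OCR_AVAILABLE = True
except ImportError as err:
    # Poppler's pdfinfo/pdftoppm were not found on PATH; see the instructions above.
    print(f"Typhoon OCR disabled: {err}")
    OCR_AVAILABLE = False
```
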
requirements.txt
CHANGED
@@ -1,5 +1,7 @@
 openai
 python-dotenv
+ftfy
+pypdf
 gradio
-
-typhoon-ocr
+pillow
+typhoon-ocr