trapezius60 committed on
Commit
47d96e9
·
verified ·
1 Parent(s): 2e0735a

Upload 17 files

.env.template ADDED
@@ -0,0 +1,3 @@
1
+ TYPHOON_BASE_URL=
2
+ TYPHOON_API_KEY=
3
+ TYPHOON_OCR_MODEL=typhoon-ocr-preview
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/test.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .env
2
+ *.pyc
3
+ .specstory
README.md CHANGED
@@ -1,12 +1,79 @@
1
- ---
2
- title: Ocr Typhoon
3
- emoji: 💻
4
- colorFrom: pink
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ ## Typhoon OCR
2
+
3
+ Typhoon OCR is a model for extracting structured markdown from images or PDFs. It supports document layout analysis and table extraction, returning results in markdown or HTML. This repository is a simple Gradio app that demonstrates the performance of Typhoon OCR.
4
+
5
+ ### Features
6
+
7
+ - Upload a PDF or image (single page)
8
+ - Extracts and reconstructs document content as markdown
9
+ - Supports different prompt modes for layout or structure
10
+ - Languages: English, Thai
11
+ - Uses a local or remote OpenAI-compatible API (e.g., vllm, opentyphoon.ai)
12
+ - See the blog post for more details: https://opentyphoon.ai/blog/en/typhoon-ocr-release
13
+
14
+ ### Requirements
15
+
16
+ - Linux / macOS with Python (Windows is not supported at the moment)
17
+
18
+ ### Install
19
+
20
+ ```bash
21
+ pip install typhoon-ocr
22
+ ```
23
+
24
+ Or, to run the Gradio app:
25
+
26
+ ```bash
27
+ pip install -r requirements.txt
28
+ # edit .env
29
+ # pip install vllm # optional for hosting a local server
30
+ ```
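
For reference, a filled-in `.env` mirrors the keys in `.env.template`; the values below are placeholders (substitute your own opentyphoon.ai key, or a local endpoint as shown in the vLLM section):

```bash
TYPHOON_BASE_URL=https://api.opentyphoon.ai/v1
TYPHOON_API_KEY=sk-your-key-here
TYPHOON_OCR_MODEL=typhoon-ocr-preview
```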
31
+
32
+ ### Mac specific
33
+
34
+ ```bash
35
+ brew install poppler
36
+ # The following binaries are required and provided by poppler:
37
+ # - pdfinfo
38
+ # - pdftoppm
39
+ ```
40
+
41
+ ### Linux specific
42
+
43
+ ```bash
44
+ sudo apt-get update
45
+ sudo apt-get install poppler-utils
46
+ # The following binaries are required and provided by poppler-utils:
47
+ # - pdfinfo
48
+ # - pdftoppm
49
+ ```
50
+
51
+ ### Start vllm
52
+
53
+ ```bash
54
+ vllm serve scb10x/typhoon-ocr-7b --served-model-name typhoon-ocr --dtype bfloat16 --port 8101
55
+ ```
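
If you serve the model locally as above, point the app at that endpoint in `.env`. This sketch assumes vLLM's OpenAI-compatible server, which exposes `/v1` on the chosen port, and reuses the `--served-model-name` from the command:

```bash
TYPHOON_BASE_URL=http://localhost:8101/v1
TYPHOON_API_KEY=dummy  # vLLM ignores the key unless started with --api-key
TYPHOON_OCR_MODEL=typhoon-ocr
```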
56
+
57
+ ### Run Gradio demo
58
+
59
+ ```bash
60
+ python app.py
61
+ ```
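
The app loads these settings via `python-dotenv`, so instead of editing `.env` you can also pass the variables inline for a one-off run (the key shown is a placeholder):

```bash
TYPHOON_BASE_URL=https://api.opentyphoon.ai/v1 TYPHOON_API_KEY=sk-your-key-here python app.py
```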
62
+
63
+ ### Dependencies
64
+
65
+ - openai
66
+ - python-dotenv
67
+ - ftfy
68
+ - pypdf
69
+ - gradio
70
+ - vllm (for hosting an inference server)
71
+ - pillow
72
+
73
+ ### Debug
74
+
75
+ - If `Error processing document` occurs, make sure Poppler is installed (`brew install poppler` on macOS or `apt-get install poppler-utils` on Linux). See the check below.
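
A quick way to confirm that the required Poppler binaries are on your `PATH`:

```bash
command -v pdfinfo pdftoppm || echo "poppler is missing"
```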
76
+
77
+ ### License
78
+
79
+ This project is licensed under the Apache 2.0 License. See individual datasets and checkpoints for their respective licenses.
app.py CHANGED
@@ -10,11 +10,7 @@ from PIL import Image
10
 
11
  load_dotenv()
12
 
13
- # Load API (OpenAI-compatible)
14
- openai = OpenAI(
15
- base_url=os.getenv("TYPHOON_BASE_URL"),
16
- api_key=os.getenv("TYPHOON_API_KEY")
17
- )
18
 
19
  theme = gr.themes.Soft(
20
  primary_hue=gr.themes.Color(
@@ -37,11 +33,11 @@ theme = gr.themes.Soft(
37
  def process_pdf(pdf_or_image_file, task_type, page_number):
38
  if pdf_or_image_file is None:
39
  return None, "No file uploaded"
40
-
41
  orig_filename = pdf_or_image_file.name
42
-
43
  try:
44
- # Prepare OCR messages using Typhoon’s helper
45
  messages = prepare_ocr_messages(
46
  pdf_or_image_path=orig_filename,
47
  task_type=task_type,
@@ -49,13 +45,13 @@ def process_pdf(pdf_or_image_file, task_type, page_number):
49
  target_text_length=8000,
50
  page_num=page_number if page_number else 1
51
  )
52
-
53
- # Extract image for preview
54
  image_url = messages[0]["content"][1]["image_url"]["url"]
55
  image_base64 = image_url.replace("data:image/png;base64,", "")
56
  image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
57
-
58
- # Send the request to Typhoon API
59
  response = openai.chat.completions.create(
60
  model=os.getenv("TYPHOON_OCR_MODEL"),
61
  messages=messages,
@@ -66,58 +62,82 @@ def process_pdf(pdf_or_image_file, task_type, page_number):
66
  "top_p": 0.6,
67
  },
68
  )
69
-
70
  text_output = response.choices[0].message.content
71
-
72
- # Try to extract `natural_text`
73
  try:
74
  json_data = json.loads(text_output)
75
- markdown_out = json_data.get("natural_text", "").replace("<figure>", "").replace("</figure>", "")
76
- except:
77
- markdown_out = text_output
78
-
79
  return image_pil, markdown_out
80
-
81
  except Exception as e:
82
- return None, f"Error: {str(e)}"
83
 
84
 
 
85
  with gr.Blocks(theme=theme) as demo:
86
  title = gr.HTML("""
87
- <h1>Typhoon OCR (API version)</h1>
88
- <p>This Space uses the Typhoon OCR API, NOT the huge local model. Works on CPU!</p>
89
- """)
90
-
91
  with gr.Row():
92
  with gr.Column(scale=1):
93
- pdf_input = gr.File(
94
- label="📄 Upload Image or PDF",
95
- file_types=[".pdf", ".png", ".jpg", ".jpeg"]
96
- )
97
-
98
- task_dropdown = gr.Radio(
99
- ["default", "structure"],
100
- label="🎯 OCR Task",
101
- value="default"
102
- )
103
-
104
- page_number = gr.Number(
105
- label="📄 Page Number (PDFs only)",
106
- value=1,
107
- minimum=1,
108
- step=1
109
- )
110
-
111
- run_button = gr.Button("🚀 Run OCR")
112
  image_output = gr.Image(label="📸 Preview Image", type="pil")
113
-
114
  with gr.Column(scale=2):
115
- markdown_output = gr.Markdown(label="OCR Result", show_label=True)
116
 
 
 
117
  run_button.click(
118
  fn=process_pdf,
119
  inputs=[pdf_input, task_dropdown, page_number],
120
  outputs=[image_output, markdown_output]
121
  )
122
 
123
- demo.launch()
 
 
10
 
11
  load_dotenv()
12
 
13
+ openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
 
 
 
 
14
 
15
  theme = gr.themes.Soft(
16
  primary_hue=gr.themes.Color(
 
33
  def process_pdf(pdf_or_image_file, task_type, page_number):
34
  if pdf_or_image_file is None:
35
  return None, "No file uploaded"
36
+
37
  orig_filename = pdf_or_image_file.name
38
+
39
  try:
40
+ # Use the new simplified function to prepare OCR messages with page number
41
  messages = prepare_ocr_messages(
42
  pdf_or_image_path=orig_filename,
43
  task_type=task_type,
 
45
  target_text_length=8000,
46
  page_num=page_number if page_number else 1
47
  )
48
+
49
+ # Extract the image from the message content for display
50
  image_url = messages[0]["content"][1]["image_url"]["url"]
51
  image_base64 = image_url.replace("data:image/png;base64,", "")
52
  image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
53
+
54
+ # Send messages to OpenAI compatible API
55
  response = openai.chat.completions.create(
56
  model=os.getenv("TYPHOON_OCR_MODEL"),
57
  messages=messages,
 
62
  "top_p": 0.6,
63
  },
64
  )
 
65
  text_output = response.choices[0].message.content
66
+
67
+ # Try to parse the output as JSON containing 'natural_text'
68
  try:
69
  json_data = json.loads(text_output)
70
+ markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
71
+ except Exception as e:
72
+ markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
73
+
74
  return image_pil, markdown_out
75
+
76
  except Exception as e:
77
+ return None, f"Error processing file: {str(e)}"
78
 
79
 
80
+ # Build the Gradio UI.
81
  with gr.Blocks(theme=theme) as demo:
82
  title = gr.HTML("""
83
+ <h1>Typhoon OCR</h1>
84
+ <ul>
85
+ <li>🤗 <b>Model weights</b>: <a href="https://huggingface.co/scb10x/typhoon-ocr-7b" target="_blank">https://huggingface.co/scb10x/typhoon-ocr-7b</a></li>
86
+ </ul>
87
+ <br />
88
+ <details>
89
+ <summary><strong>Disclaimer</strong></summary>
90
+ The responses generated by this Artificial Intelligence (AI) system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
91
+ </details>
92
+ <br />
93
+ <details>
94
+ <summary><strong>Terms of use</strong></summary>
95
+ By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Vision language models are prone to hallucinations to a greater extent compared to text-only LLMs.
96
+ </details>
97
+ <br />
98
+ <details>
99
+ <summary><strong>License</strong></summary>
100
+ This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
101
+ </details>
102
+ """)
103
  with gr.Row():
104
  with gr.Column(scale=1):
105
+ # Update file_types to accept PDF as well as common image formats.
106
+ pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
107
+
108
+ with gr.Group(elem_classes=["task-background"]):
109
+ task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
110
+ gr.HTML("""
111
+ <p><b>default</b>: This mode works for most cases and is recommended for files without a clear template such as infographics.</p>
112
+ <p><b>structure</b>: This mode offers improved performance for complex layout documents such as those containing images, tables and forms.</p>
113
+ <p>We recommend trying both to see which one works better for your use case.</p>
114
+ """, elem_classes=["task-dropdown-info"])
115
+ demo.css = """
116
+ .task-background {
117
+ background: var(--block-background-fill) !important;
118
+
119
+ }
120
+ .task-background > * {
121
+ background: var(--block-background-fill) !important;
122
+ }
123
+ .task-dropdown-info {
124
+ padding: 0 16px;
125
+ font-size: 12px;
126
+ }
127
+ """
128
+ page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
129
+ run_button = gr.Button("🚀 Run")
130
  image_output = gr.Image(label="📸 Preview Image", type="pil")
 
131
  with gr.Column(scale=2):
132
+ markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
133
 
134
+
135
+ # Connect the UI inputs to the processing function.
136
  run_button.click(
137
  fn=process_pdf,
138
  inputs=[pdf_input, task_dropdown, page_number],
139
  outputs=[image_output, markdown_output]
140
  )
141
 
142
+ # Launch the Gradio demo (set share=True to create a temporary public link)
143
+ demo.launch(share=False)
examples/simple_ocr.py ADDED
@@ -0,0 +1,10 @@
1
+ from typhoon_ocr import ocr_document
2
+ import os
3
+
4
+ # please set env TYPHOON_API_KEY or OPENAI_API_KEY to use this function
5
+
6
+ script_dir = os.path.dirname(os.path.abspath(__file__))
7
+ image_path = os.path.join(script_dir, "test.png")
8
+
9
+ markdown = ocr_document(image_path)
10
+ print(markdown)
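
To try this example end to end, set one of the API key variables mentioned in the script and run it against the bundled `examples/test.png` (the key shown is a placeholder):

```bash
export TYPHOON_API_KEY=sk-your-key-here  # or OPENAI_API_KEY
python examples/simple_ocr.py
```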
examples/test.png ADDED

Git LFS Details

  • SHA256: 936687db2af4cb509c8b9070c5f000caface565bf02cfc7b35addaaa94375b07
  • Pointer size: 132 Bytes
  • Size of remote file: 2.97 MB
packages/typhoon_ocr/.gitignore ADDED
@@ -0,0 +1,52 @@
1
+ # Distribution / packaging
2
+ dist/
3
+ build/
4
+ *.egg-info/
5
+ *.egg
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+ .Python
13
+
14
+ # Virtual environments
15
+ venv/
16
+ env/
17
+ ENV/
18
+ .env
19
+ .venv
20
+ env.bak/
21
+ venv.bak/
22
+
23
+ # Unit test / coverage reports
24
+ htmlcov/
25
+ .tox/
26
+ .coverage
27
+ .coverage.*
28
+ .cache
29
+ nosetests.xml
30
+ coverage.xml
31
+ *.cover
32
+ .hypothesis/
33
+ .pytest_cache/
34
+
35
+ # Sphinx documentation
36
+ docs/_build/
37
+ docs/api/
38
+
39
+ # Jupyter Notebook
40
+ .ipynb_checkpoints
41
+
42
+ # IDE specific files
43
+ .idea/
44
+ .vscode/
45
+ *.swp
46
+ *.swo
47
+ .DS_Store
48
+
49
+ # Project specific
50
+ *.bak
51
+ *.tmp
52
+ temp/
packages/typhoon_ocr/LICENSE ADDED
@@ -0,0 +1,176 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
packages/typhoon_ocr/MANIFEST.in ADDED
@@ -0,0 +1,3 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
packages/typhoon_ocr/README.md ADDED
@@ -0,0 +1,134 @@
1
+ # Typhoon OCR
2
+
3
+ Typhoon OCR is a model for extracting structured markdown from images or PDFs. It supports document layout analysis and table extraction, returning results in markdown or HTML. This package provides utilities to convert images and PDFs to the format supported by the Typhoon OCR model.
4
+
5
+ ## Languages Supported
6
+
7
+ The Typhoon OCR model supports:
8
+ - English
9
+ - Thai
10
+
11
+ ## Features
12
+
13
+ - Convert images to PDFs for unified processing
14
+ - Extract text and layout information from PDFs and images
15
+ - Generate OCR-ready messages for API processing with Typhoon OCR model
16
+ - Built-in prompt templates for different document processing tasks
17
+ - Process specific pages from multi-page PDF documents
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ pip install typhoon-ocr
23
+ ```
24
+
25
+ ### System Requirements
26
+
27
+ The package requires the Poppler utilities to be installed on your system:
28
+
29
+ #### For macOS:
30
+ ```bash
31
+ brew install poppler
32
+ ```
33
+
34
+ #### For Linux:
35
+ ```bash
36
+ sudo apt-get update
37
+ sudo apt-get install poppler-utils
38
+ ```
39
+
40
+ The following binaries are required:
41
+ - `pdfinfo`
42
+ - `pdftoppm`
43
+
44
+ ## Usage
45
+
46
+ ### Core functionality
47
+
48
+ The package provides 2 main functions:
49
+
50
+ ```python
51
+ from typhoon_ocr import ocr_document, prepare_ocr_messages
52
+ ```
53
+ * `ocr_document`: Full OCR pipeline for the Typhoon OCR model via opentyphoon.ai or an OpenAI-compatible API (such as vLLM)
54
+ * `prepare_ocr_messages`: Generate complete OCR-ready messages for the Typhoon OCR model
55
+
56
+
57
+ ### Complete OCR workflow
58
+
59
+ Use the simplified API to OCR a document, or prepare the messages yourself for an OpenAI-compatible API such as opentyphoon.ai:
60
+
61
+ ```python
62
+ from typhoon_ocr import ocr_document
63
+
64
+ markdown = ocr_document(
65
+ pdf_or_image_path="document.pdf", # Works with PDFs or images
66
+ task_type="default", # Choose between "default" or "structure"
67
+ page_num=2 # Process page 2 of a PDF (default is 1, always 1 for images)
68
+ )
69
+
70
+ # Or with image
71
+ markdown = ocr_document(
72
+ pdf_or_image_path="scan.jpg", # Works with PDFs or images
73
+ task_type="default", # Choose between "default" or "structure"
74
+ )
75
+ ```
76
+
77
+ Alternatively, prepare the messages manually:
78
+
79
+ ```python
80
+ from typhoon_ocr import prepare_ocr_messages
81
+ from openai import OpenAI
+ import json, os  # json parses the model response; os reads the API key
82
+
83
+ # Prepare messages for OCR processing
84
+ messages = prepare_ocr_messages(
85
+ pdf_or_image_path="document.pdf", # Works with PDFs or images
86
+ task_type="default", # Choose between "default" or "structure"
87
+ page_num=2 # Process page 2 of a PDF (default is 1, always 1 for images)
88
+ )
89
+
90
+ # Use with https://opentyphoon.ai/ api or self-host model via vllm
91
+ # See model list at https://huggingface.co/collections/scb10x/typhoon-ocr-682713483cb934ab0cf069bd
92
+ client = OpenAI(base_url='https://api.opentyphoon.ai/v1', api_key=os.environ["TYPHOON_API_KEY"])
93
+ response = client.chat.completions.create(
94
+ model="typhoon-ocr-preview",
95
+ messages=messages,
96
+ max_tokens=16000,
97
+ extra_body={
98
+ "repetition_penalty": 1.2,
99
+ "temperature": 0.1,
100
+ "top_p": 0.6,
101
+ },
102
+
103
+ )
104
+
105
+ # Parse the JSON response
106
+ text_output = response.choices[0].message.content
107
+ markdown = json.loads(text_output)['natural_text']
108
+ print(markdown)
109
+ ```
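
Note that `json.loads(text_output)['natural_text']` assumes the model returned valid JSON. The Gradio app in this repository guards the parse with a try/except, and the same defensive pattern works here:

```python
try:
    markdown = json.loads(text_output)["natural_text"]
except (json.JSONDecodeError, KeyError):
    markdown = text_output  # fall back to the raw model output
```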
110
+
111
+ ### Available task types
112
+
113
+ The package comes with built-in prompt templates for different OCR tasks:
114
+
115
+ - `default`: Extracts markdown representation of the document with tables in markdown format
116
+ - `structure`: Provides more structured output with HTML tables and image analysis placeholders
117
+
118
+ ## Document Extraction Capabilities
119
+
120
+ The Typhoon OCR model, when used with this package, can extract:
121
+
122
+ - Structured text with proper layout preservation
123
+ - Tables (in markdown or HTML format)
124
+ - Document hierarchy (headings, paragraphs, lists)
125
+ - Text with positional information
126
+ - Basic image analysis and placement
127
+
128
+ ## License
129
+
130
+ This project code is licensed under the Apache 2.0 License.
131
+
132
+ ## Acknowledgments
133
+
134
+ The code is based on work from [OlmoCR](https://github.com/allenai/olmocr) under the Apache 2.0 license.
packages/typhoon_ocr/build_and_upload.sh ADDED
@@ -0,0 +1 @@
1
+ python -m build && python -m twine upload dist/* --verbose
packages/typhoon_ocr/pyproject.toml ADDED
@@ -0,0 +1,12 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.black]
6
+ line-length = 88
7
+ target-version = ["py37"]
8
+ include = '\.pyi?$'
9
+
10
+ [tool.isort]
11
+ profile = "black"
12
+ line_length = 88
packages/typhoon_ocr/setup.py ADDED
@@ -0,0 +1,39 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ with open("README.md", "r", encoding="utf-8") as fh:
4
+ long_description = fh.read()
5
+
6
+ setup(
7
+ name="typhoon-ocr",
8
+ version="0.3.8",
9
+ author="Typhoon OCR Contributors",
10
+ author_email="contact@opentyphoon.ai",
11
+ description="A package for extracting structured content from PDFs and images using Typhoon OCR models",
12
+ long_description=long_description,
13
+ long_description_content_type="text/markdown",
14
+ url="https://github.com/scb-10x/typhoon-ocr",
15
+ packages=find_packages(),
16
+ classifiers=[
17
+ "Programming Language :: Python :: 3",
18
+ "License :: OSI Approved :: Apache Software License",
19
+ "Operating System :: OS Independent",
20
+ ],
21
+ python_requires=">=3.7",
22
+ install_requires=[
23
+ "ftfy",
24
+ "pypdf",
25
+ "pillow",
26
+ "openai",
27
+ ],
28
+ extras_require={
29
+ "dev": [
30
+ "pytest",
31
+ "black",
32
+ "flake8",
33
+ ],
34
+ "app": [
35
+ "gradio",
36
+ "python-dotenv",
37
+ ],
38
+ },
39
+ )
packages/typhoon_ocr/typhoon_ocr/__init__.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ Typhoon OCR is a model for extracting structured markdown from images or PDFs.
3
+
4
+ This package provides utilities for document analysis, layout extraction, and OCR processing.
5
+ It focuses on structured text extraction with proper formatting and layout preservation.
6
+
7
+ Main Functions:
8
+ - ocr_document: Run the full OCR pipeline against an OpenAI-compatible API
+ - prepare_ocr_messages: Generate OCR-ready messages from PDFs or images
9
+ - get_prompt: Access built-in prompt templates for different OCR tasks
10
+ - image_to_pdf: Convert image files to PDF format
11
+
12
+ Requirements:
13
+ - Poppler utilities (pdfinfo, pdftoppm) must be installed on the system
14
+ - Appropriate dependencies (ftfy, pypdf, pillow) for text processing
15
+
16
+ Example Usage:
17
+ >>> from typhoon_ocr import prepare_ocr_messages
18
+ >>> messages = prepare_ocr_messages("document.pdf", task_type="default", page_num=1)
19
+ >>> # Use messages with LLM API for OCR processing
20
+ """
21
+ from .pdf_utils import pdf_utils_available
22
+ from .ocr_utils import (
23
+ prepare_ocr_messages,
24
+ get_prompt,
25
+ get_anchor_text,
26
+ image_to_pdf,
27
+ ocr_document,
28
+ )
29
+
30
+ __version__ = "0.3.8"
31
+
32
+ __all__ = [
33
+ "pdf_utils_available",
34
+ "prepare_ocr_messages",
35
+ "get_prompt",
36
+ "get_anchor_text",
37
+ "image_to_pdf",
38
+ "ocr_document",
39
+ ]
packages/typhoon_ocr/typhoon_ocr/ocr_utils.py ADDED
@@ -0,0 +1,623 @@
1
+ """
2
+ Utility functions for Typhoon OCR.
3
+
4
+ This code is adapted from https://github.com/allenai/olmocr
5
+ Under the Apache 2.0 license.
6
+ Edited by Typhoon OCR Contributors.
7
+ """
8
+ from dataclasses import dataclass
9
+ import json
10
+ from openai import OpenAI
11
+ import os
12
+ import re
13
+ import io
14
+ import tempfile
15
+ from PIL import Image
16
+ import subprocess
17
+ import base64
18
+ from typing import Any, Callable, Dict, List, Literal
19
+ import random
20
+ import ftfy
21
+ from pypdf.generic import RectangleObject
22
+ from pypdf import PdfReader
23
+
24
+
25
+ @dataclass(frozen=True)
26
+ class Element:
27
+ pass
28
+
29
+
30
+ @dataclass(frozen=True)
31
+ class BoundingBox:
32
+ x0: float
33
+ y0: float
34
+ x1: float
35
+ y1: float
36
+
37
+ @staticmethod
38
+ def from_rectangle(rect: RectangleObject) -> "BoundingBox":
39
+ return BoundingBox(rect[0], rect[1], rect[2], rect[3])
40
+
41
+
42
+ @dataclass(frozen=True)
43
+ class TextElement(Element):
44
+ text: str
45
+ x: float
46
+ y: float
47
+
48
+
49
+ @dataclass(frozen=True)
50
+ class ImageElement(Element):
51
+ name: str
52
+ bbox: BoundingBox
53
+
54
+
55
+ @dataclass(frozen=True)
56
+ class PageReport:
57
+ mediabox: BoundingBox
58
+ text_elements: List[TextElement]
59
+ image_elements: List[ImageElement]
60
+
61
+ def image_to_pdf(image_path):
62
+ try:
63
+ # Open the image file.
64
+ img = Image.open(image_path)
65
+ # Create a temporary file to store the PDF.
66
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
67
+ filename = tmp.name
68
+ temp_pdf_created = True
69
+ # Convert image to RGB if necessary and save as PDF.
70
+ if img.mode != "RGB":
71
+ img = img.convert("RGB")
72
+ img.save(filename, "PDF")
73
+ return filename
74
+ except Exception as conv_err:
75
+ return None
76
+
77
+ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[float, float]:
78
+ """
79
+ Get the MediaBox dimensions for a specific page in a PDF file using the pdfinfo command.
80
+
81
+ :param pdf_file: Path to the PDF file
82
+ :param page_num: The page number for which to extract MediaBox dimensions
83
+ :return: A dictionary containing MediaBox dimensions or None if not found
84
+ """
85
+
86
+ # Construct the pdfinfo command to extract info for the specific page
87
+ command = ["pdfinfo", "-f", str(page_num), "-l", str(page_num), "-box", "-enc", "UTF-8", local_pdf_path]
88
+ # Run the command using subprocess
89
+ result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
90
+
91
+ # Check if there is any error in executing the command
92
+ if result.returncode != 0:
93
+ raise ValueError(f"Error running pdfinfo: {result.stderr}")
94
+
95
+ # Parse the output to find MediaBox
96
+ output = result.stdout
97
+
98
+ for line in output.splitlines():
99
+ if "MediaBox" in line:
100
+ media_box_str: List[str] = line.split(":")[1].strip().split()
101
+ media_box: List[float] = [float(x) for x in media_box_str]
102
+ return abs(media_box[0] - media_box[2]), abs(media_box[3] - media_box[1])
103
+
104
+ raise ValueError("MediaBox not found in the PDF info.")
105
+
106
+ def render_pdf_to_base64png(local_pdf_path: str, page_num: int, target_longest_image_dim: int = 2048) -> str:
107
+
108
+ longest_dim = max(get_pdf_media_box_width_height(local_pdf_path, page_num))
109
+
110
+ # Convert PDF page to PNG using pdftoppm
111
+ pdftoppm_result = subprocess.run(
112
+ [
113
+ "pdftoppm",
114
+ "-png",
115
+ "-f",
116
+ str(page_num),
117
+ "-l",
118
+ str(page_num),
119
+ "-r",
120
+ str(target_longest_image_dim * 72 / longest_dim), # 72 pixels per point is the conversion factor
121
+ local_pdf_path,
122
+ ],
123
+ timeout=120,
124
+ stdout=subprocess.PIPE,
125
+ stderr=subprocess.PIPE,
126
+ )
127
+ assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
128
+ return base64.b64encode(pdftoppm_result.stdout).decode("utf-8")
129
+
130
+
131
+ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
132
+ result = ""
133
+ result += f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"
134
+
135
+ if max_length < 20:
136
+ return result
137
+
138
+ images = _merge_image_elements(report.image_elements)
139
+
140
+ # Process image elements
141
+ image_strings = []
142
+ for element in images:
143
+ image_str = f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]\n"
144
+ # Use element's unique identifier (e.g., id or position) for comparison
145
+ image_strings.append((element, image_str))
146
+
147
+ # Process text elements
148
+ text_strings = []
149
+ for element in report.text_elements: # type: ignore
150
+ if len(element.text.strip()) == 0: # type: ignore
151
+ continue
152
+
153
+ element_text = _cleanup_element_text(element.text) # type: ignore
154
+ text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n" # type: ignore
155
+ text_strings.append((element, text_str))
156
+
157
+ # Combine all elements with their positions for sorting
158
+ all_elements: list[tuple[str, ImageElement, str, tuple[float, float]]] = []
159
+ for elem, s in image_strings:
160
+ position = (elem.bbox.x0, elem.bbox.y0)
161
+ all_elements.append(("image", elem, s, position))
162
+ for elem, s in text_strings:
163
+ position = (elem.x, elem.y) # type: ignore
164
+ all_elements.append(("text", elem, s, position))
165
+
166
+ # Calculate total length
167
+ total_length = len(result) + sum(len(s) for _, _, s, _ in all_elements)
168
+
169
+ if total_length <= max_length:
170
+ # Include all elements
171
+ for _, _, s, _ in all_elements:
172
+ result += s
173
+ return result
174
+
175
+ # Identify elements with min/max coordinates
176
+ edge_elements = set()
177
+
178
+ if images:
179
+ min_x0_image = min(images, key=lambda e: e.bbox.x0)
180
+ max_x1_image = max(images, key=lambda e: e.bbox.x1)
181
+ min_y0_image = min(images, key=lambda e: e.bbox.y0)
182
+ max_y1_image = max(images, key=lambda e: e.bbox.y1)
183
+ edge_elements.update([min_x0_image, max_x1_image, min_y0_image, max_y1_image])
184
+
185
+ if report.text_elements:
186
+ text_elements = [e for e in report.text_elements if len(e.text.strip()) > 0]
187
+ if text_elements:
188
+ min_x_text = min(text_elements, key=lambda e: e.x)
189
+ max_x_text = max(text_elements, key=lambda e: e.x)
190
+ min_y_text = min(text_elements, key=lambda e: e.y)
191
+ max_y_text = max(text_elements, key=lambda e: e.y)
192
+ edge_elements.update([min_x_text, max_x_text, min_y_text, max_y_text]) # type: ignore
193
+
194
+ # Keep track of element IDs to prevent duplication
195
+ selected_element_ids = set()
196
+ selected_elements = []
197
+
198
+ # Include edge elements first
199
+ for elem_type, elem, s, position in all_elements:
200
+ if elem in edge_elements and id(elem) not in selected_element_ids:
201
+ selected_elements.append((elem_type, elem, s, position))
202
+ selected_element_ids.add(id(elem))
203
+
204
+ # Calculate remaining length
205
+ current_length = len(result) + sum(len(s) for _, _, s, _ in selected_elements)
206
+ _remaining_length = max_length - current_length
207
+
208
+ # Exclude edge elements from the pool
209
+ remaining_elements = [(elem_type, elem, s, position) for elem_type, elem, s, position in all_elements if id(elem) not in selected_element_ids]
210
+
211
+ # Sort remaining elements by their positions (e.g., x-coordinate and then y-coordinate)
212
+ # remaining_elements.sort(key=lambda x: (x[3][0], x[3][1]))
213
+
214
+ # Shuffle remaining elements randomly
215
+ random.shuffle(remaining_elements)
216
+
217
+ # Add elements until reaching max_length
218
+ for elem_type, elem, s, position in remaining_elements:
219
+ if current_length + len(s) > max_length:
220
+ break
221
+ selected_elements.append((elem_type, elem, s, position))
222
+ selected_element_ids.add(id(elem))
223
+ current_length += len(s)
224
+
225
+ # Sort selected elements by their positions to maintain logical order
226
+ selected_elements.sort(key=lambda x: (x[3][0], x[3][1]))
227
+
228
+ # Build the final result
229
+ for _, _, s, _ in selected_elements:
230
+ result += s
231
+
232
+ return result
233
+
234
+
235
+ def _cap_split_string(text: str, max_length: int) -> str:
236
+ if len(text) <= max_length:
237
+ return text
238
+
239
+ head_length = max_length // 2 - 3
240
+ tail_length = head_length
241
+
242
+ head = text[:head_length].rsplit(" ", 1)[0] or text[:head_length]
243
+ tail = text[-tail_length:].split(" ", 1)[-1] or text[-tail_length:]
244
+
245
+ return f"{head} ... {tail}"
246
+
247
+
248
+ def _cleanup_element_text(element_text: str) -> str:
249
+ MAX_TEXT_ELEMENT_LENGTH = 250
250
+ TEXT_REPLACEMENTS = {"[": "\\[", "]": "\\]", "\n": "\\n", "\r": "\\r", "\t": "\\t"}
251
+ text_replacement_pattern = re.compile("|".join(re.escape(key) for key in TEXT_REPLACEMENTS.keys()))
252
+
253
+ element_text = ftfy.fix_text(element_text).strip()
254
+
255
+ # Replace square brackets with escaped brackets and other escaped chars
256
+ element_text = text_replacement_pattern.sub(lambda match: TEXT_REPLACEMENTS[match.group(0)], element_text)
257
+
258
+ return _cap_split_string(element_text, MAX_TEXT_ELEMENT_LENGTH)
259
+
260
+ def _merge_image_elements(images: List[ImageElement], tolerance: float = 0.5) -> List[ImageElement]:
261
+ n = len(images)
262
+ parent = list(range(n)) # Initialize Union-Find parent pointers
263
+
264
+ def find(i):
265
+ # Find with path compression
266
+ root = i
267
+ while parent[root] != root:
268
+ root = parent[root]
269
+ while parent[i] != i:
270
+ parent_i = parent[i]
271
+ parent[i] = root
272
+ i = parent_i
273
+ return root
274
+
275
+ def union(i, j):
276
+ # Union by attaching root of one tree to another
277
+ root_i = find(i)
278
+ root_j = find(j)
279
+ if root_i != root_j:
280
+ parent[root_i] = root_j
281
+
282
+ def bboxes_overlap(b1: BoundingBox, b2: BoundingBox, tolerance: float) -> bool:
283
+ # Compute horizontal and vertical distances between boxes
284
+ h_dist = max(0, max(b1.x0, b2.x0) - min(b1.x1, b2.x1))
285
+ v_dist = max(0, max(b1.y0, b2.y0) - min(b1.y1, b2.y1))
286
+ # Check if distances are within tolerance
287
+ return h_dist <= tolerance and v_dist <= tolerance
288
+
289
+ # Union overlapping images
290
+ for i in range(n):
291
+ for j in range(i + 1, n):
292
+ if bboxes_overlap(images[i].bbox, images[j].bbox, tolerance):
293
+ union(i, j)
294
+
295
+ # Group images by their root parent
296
+ groups: dict[int, list[int]] = {}
297
+ for i in range(n):
298
+ root = find(i)
299
+ groups.setdefault(root, []).append(i)
300
+
301
+ # Merge images in the same group
302
+ merged_images = []
303
+ for indices in groups.values():
304
+ # Initialize merged bounding box
305
+ merged_bbox = images[indices[0]].bbox
306
+ merged_name = images[indices[0]].name
307
+
308
+ for idx in indices[1:]:
309
+ bbox = images[idx].bbox
310
+ # Expand merged_bbox to include the current bbox
311
+ merged_bbox = BoundingBox(
312
+ x0=min(merged_bbox.x0, bbox.x0),
313
+ y0=min(merged_bbox.y0, bbox.y0),
314
+ x1=max(merged_bbox.x1, bbox.x1),
315
+ y1=max(merged_bbox.y1, bbox.y1),
316
+ )
317
+ # Optionally, update the name
318
+ merged_name += f"+{images[idx].name}"
319
+
320
+ merged_images.append(ImageElement(name=merged_name, bbox=merged_bbox))
321
+
322
+ # Return the merged images along with other elements
323
+ return merged_images
324
+
325
+ def _transform_point(x, y, m):
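+ # Map the point (x, y) through the 6-element PDF transformation matrix m = [a, b, c, d, e, f]: x' = a*x + c*y + e, y' = b*x + d*y + f.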
326
+ x_new = m[0] * x + m[2] * y + m[4]
327
+ y_new = m[1] * x + m[3] * y + m[5]
328
+ return x_new, y_new
329
+
330
+ def _mult(m: List[float], n: List[float]) -> List[float]:
331
+ return [
332
+ m[0] * n[0] + m[1] * n[2],
333
+ m[0] * n[1] + m[1] * n[3],
334
+ m[2] * n[0] + m[3] * n[2],
335
+ m[2] * n[1] + m[3] * n[3],
336
+ m[4] * n[0] + m[5] * n[2] + n[4],
337
+ m[4] * n[1] + m[5] * n[3] + n[5],
338
+ ]
339
+
340
+ def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport:
341
+ reader = PdfReader(local_pdf_path)
342
+ page = reader.pages[page_num - 1]
343
+ resources = page.get("/Resources", {})
344
+ xobjects = resources.get("/XObject", {})
345
+ text_elements, image_elements = [], []
346
+
347
+ def visitor_body(text, cm, tm, font_dict, font_size):
348
+ txt2user = _mult(tm, cm)
349
+ text_elements.append(TextElement(text, txt2user[4], txt2user[5]))
350
+
351
+ def visitor_op(op, args, cm, tm):
352
+ if op == b"Do":
353
+ xobject_name = args[0]
354
+ xobject = xobjects.get(xobject_name)
355
+ if xobject and xobject["/Subtype"] == "/Image":
356
+ # Compute image bbox
357
+ # The image is placed according to the CTM
358
+ _width = xobject.get("/Width")
359
+ _height = xobject.get("/Height")
360
+ x0, y0 = _transform_point(0, 0, cm)
361
+ x1, y1 = _transform_point(1, 1, cm)
362
+ image_elements.append(ImageElement(xobject_name, BoundingBox(min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1))))
363
+
364
+ page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)
365
+
366
+ return PageReport(
367
+ mediabox=BoundingBox.from_rectangle(page.mediabox),
368
+ text_elements=text_elements,
369
+ image_elements=image_elements,
370
+ )
371
+
372
+ def get_anchor_text(
373
+ local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
374
+ ) -> str:
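+ # Only the "pdfreport" engine is implemented below; the other literals raise NotImplementedError.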
375
+ assert page > 0, "Pages are 1-indexed in pdf-land"
376
+
377
+
378
+ if pdf_engine == "pdfreport":
379
+ return _linearize_pdf_report(_pdf_report(local_pdf_path, page), max_length=target_length)
380
+ else:
381
+ raise NotImplementedError("Unknown engine")
382
+
383
+ PROMPTS_SYS = {
384
+ "default": lambda base_text: (f"Below is an image of a document page along with its dimensions. "
385
+ f"Simply return the markdown representation of this document, presenting tables in markdown format as they naturally appear.\n"
386
+ f"If the document contains images, use a placeholder like dummy.png for each image.\n"
387
+ f"Your final output must be in JSON format with a single key `natural_text` containing the response.\n"
388
+ f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"),
389
+ "structure": lambda base_text: (
390
+ f"Below is an image of a document page, along with its dimensions and possibly some raw textual content previously extracted from it. "
391
+ f"Note that the text extraction may be incomplete or partially missing. Carefully consider both the layout and any available text to reconstruct the document accurately.\n"
392
+ f"Your task is to return the markdown representation of this document, presenting tables in HTML format as they naturally appear.\n"
393
+ f"If the document contains images or figures, analyze them and include the tag <figure>IMAGE_ANALYSIS</figure> in the appropriate location.\n"
394
+ f"Your final output must be in JSON format with a single key `natural_text` containing the response.\n"
395
+ f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
396
+ ),
397
+ }
398
+
399
+ def get_prompt(prompt_name: str) -> Callable[[str], str]:
400
+ """
401
+ Get a prompt template function for the specified prompt type.
402
+
403
+ This function returns a callable that generates a prompt template based on the provided prompt name.
404
+ The returned function takes extracted text as input and returns a formatted prompt string
405
+ that can be used with OCR/vision models.
406
+
407
+ Available prompt types:
408
+ - "default": Creates a prompt for extracting text with tables in markdown format.
409
+ - "structure": Creates a prompt for extracting text with tables in HTML format and image analysis.
410
+
411
+ Args:
412
+ prompt_name (str): The identifier for the desired prompt template ("default" or "structure").
413
+
414
+ Returns:
415
+ Callable[[str], str]: A function that takes extracted text and returns a formatted prompt.
416
+
417
+ Examples:
418
+ >>> prompt_fn = get_prompt("default")
419
+ >>> formatted_prompt = prompt_fn("Sample extracted text")
420
+ >>> print(formatted_prompt[:50]) # Print first 50 chars
421
+ Below is an image of a document page along with its
422
+ """
423
+ return PROMPTS_SYS.get(prompt_name, lambda x: "Invalid PROMPT_NAME provided.")
424
+
425
+ def image_to_base64png(img: Image.Image):
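+ # Note: despite the function name, the buffer is JPEG-encoded; callers wrap it in a data:image/png URL.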
426
+ buffered = io.BytesIO()
427
+ img = img.convert("RGB")
428
+ img.save(buffered, format="JPEG")
429
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
430
+
431
+ def get_anchor_text_from_image(img: Image.Image):
432
+ width = float(img.width)
433
+ height = float(img.height)
434
+ text = f"""Page dimensions: {width:.1f}x{height:.1f}\n[Image 0x0 to {width:.0f}x{height:.0f}]\n"""
435
+ return text
436
+
437
+ def prepare_ocr_messages(
438
+ pdf_or_image_path: str,
439
+ task_type: str = "default",
440
+ target_image_dim: int = 1800,
441
+ target_text_length: int = 8000,
442
+ page_num: int = 1,
443
+ ) -> List[Dict[str, Any]]:
444
+ """
445
+ Prepare messages for OCR processing from a PDF or image file.
446
+
447
+ This function provides an end-to-end workflow that combines multiple processing steps
448
+ into a single call, creating messages ready for OCR processing with language models.
449
+ It handles both image and PDF inputs, with appropriate page selection for PDFs.
450
+
451
+ Processing Steps:
452
+ 1. Convert image to PDF if necessary (images are always treated as single pages)
453
+ 2. Render the selected PDF page to base64 PNG
454
+ 3. Extract anchor text from the page with position information
455
+ 4. Apply appropriate prompt template based on task type
456
+ 5. Create a messages structure ready for LLM API submission
457
+
458
+ Args:
459
+ pdf_or_image_path (str): Path to a PDF or image file to process
460
+ task_type (str): Type of OCR task - "default" for standard markdown extraction,
461
+ "structure" for enhanced layout analysis with HTML tables
462
+ target_image_dim (int): Target longest dimension for the rendered image in pixels
463
+ target_text_length (int): Maximum length of extracted text to include
464
+ page_num (int): Page number to process (default=1, for images always 1)
465
+
466
+ Returns:
467
+ List[Dict[str, Any]]: Messages structure ready for OCR processing with an LLM API,
468
+ containing both text prompt and image data
469
+
470
+ Raises:
471
+ ValueError: If image conversion fails, page number is out of range, or other processing errors occur
472
+
473
+ Examples:
474
+ >>> # Process the first page of a PDF
475
+ >>> messages = prepare_ocr_messages("document.pdf")
476
+ >>>
477
+ >>> # Process page 5 of a PDF with structure analysis
478
+ >>> messages = prepare_ocr_messages(
479
+ ... pdf_or_image_path="multipage.pdf",
480
+ ... task_type="structure",
481
+ ... page_num=5
482
+ ... )
483
+ >>>
484
+ >>> # Process an image file (always page 1)
485
+ >>> messages = prepare_ocr_messages("scan.jpg")
486
+ """
487
+ # Check for required PDF utilities
488
+ ext = os.path.splitext(pdf_or_image_path)[1].lower()
489
+ is_image = ext not in [".pdf"]
490
+
491
+ # Determine if the file is a PDF or image
492
+ filename = pdf_or_image_path
493
+
494
+ try:
495
+ if is_image:
496
+ page_num = 1
497
+ img = Image.open(pdf_or_image_path)
498
+ # Render the image to base64 PNG
499
+ image_base64 = image_to_base64png(img)
500
+ # Get anchor text from the image
501
+ anchor_text = get_anchor_text_from_image(img)
502
+ else:
503
+ if page_num < 1:
504
+ page_num = 1
505
+ else:
506
+ page_num = int(page_num) # cast to int
507
+ # Render the selected page to base64 PNG
508
+ image_base64 = render_pdf_to_base64png(
509
+ filename, page_num, target_longest_image_dim=target_image_dim
510
+ )
511
+ # Extract anchor text from the selected PDF page
512
+ anchor_text = get_anchor_text(
513
+ filename,
514
+ page_num,
515
+ pdf_engine="pdfreport",
516
+ target_length=target_text_length,
517
+ )
518
+
519
+
520
+ # Get the prompt template function for the specified task type
521
+ prompt_fn = get_prompt(task_type)
522
+
523
+ # Apply the prompt template to the extracted anchor text
524
+ prompt_text = prompt_fn(anchor_text)
525
+
526
+ # Create messages structure
527
+ messages = [
528
+ {
529
+ "role": "user",
530
+ "content": [
531
+ {"type": "text", "text": prompt_text},
532
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
533
+ ],
534
+ }
535
+ ]
536
+
537
+ return messages
538
+ except IndexError:
539
+ raise ValueError(f"Page number {page_num} is out of range for the document {pdf_or_image_path}")
540
+ except Exception as e:
541
+ raise ValueError(f"Error processing document: {str(e)}")
542
+
543
+ def is_base64_string(input_string: str) -> bool:
544
+ try:
545
+ # Try to decode and re-encode to check validity
546
+ return base64.b64encode(base64.b64decode(input_string))[:10] == input_string.encode()[:10]
547
+ except Exception:
548
+ return False
549
+
550
+ def ensure_image_in_path(input_string: str) -> str:
551
+ """
552
+ Detect whether the input is a base64-encoded image or a file path.
553
+
554
+ - If it's base64, decode and save it as a temporary image file.
555
+ - If it's a valid image format (e.g. JPEG, PNG), preserve the format.
556
+ - If it's not base64, return the input as-is (assumed to be a path).
557
+
558
+ Returns:
559
+ str: A file path (either the original or a temp file path if base64).
560
+ """
561
+ if input_string.endswith(".png") or input_string.endswith(".jpg") or input_string.endswith(".jpeg") or input_string.endswith(".pdf"):
562
+ return input_string
563
+ elif is_base64_string(input_string):
564
+ try:
565
+ image_data = base64.b64decode(input_string)
566
+ image = Image.open(io.BytesIO(image_data))
567
+ image_format = image.format.lower() # e.g. 'jpeg', 'png'
568
+ # Save image to a temporary file with correct extension
569
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{image_format}")
570
+ image.save(temp_file.name, format=image_format)
571
+ return temp_file.name
572
+ except Exception:
573
+ return input_string
574
+ return input_string
575
+
576
+ def ocr_document(pdf_or_image_path: str, task_type: str = "default", target_image_dim: int = 1800, target_text_length: int = 8000, page_num: int = 1, base_url: str = os.getenv("TYPHOON_BASE_URL", 'https://api.opentyphoon.ai/v1'), api_key: str = None, model: str = "typhoon-ocr-preview") -> str:
577
+ """
578
+ OCR a PDF or image file.
579
+
580
+ This function provides an end-to-end workflow that combines multiple processing steps
581
+ into a single call, creating messages ready for OCR processing with language models.
582
+ It handles both image and PDF inputs, with appropriate page selection for PDFs.
583
+
584
+ Args:
585
+ pdf_or_image_path (str): Path to a PDF or image file to process
586
+ task_type (str): Type of OCR task - "default" for standard markdown extraction,
587
+ "structure" for enhanced layout analysis with HTML tables
588
+ target_image_dim (int): Target longest dimension for the rendered image in pixels
589
+ target_text_length (int): Maximum length of extracted text to include
590
+ page_num (int): Page number to process (default=1, for images always 1)
591
+ base_url (str): API base URL
592
+ api_key (str): API key for authentication (will also check environment variables if None)
593
+ model (str): Model identifier to use for OCR
594
+
595
+ Returns:
596
+ str: Extracted text content in the specified format
597
+
598
+ Raises:
599
+ ValueError: If image conversion fails, page number is out of range, or other processing errors occur
600
+ """
601
+ pdf_or_image_path = ensure_image_in_path(pdf_or_image_path)
602
+
603
+ openai = OpenAI(base_url=base_url, api_key=api_key or os.getenv("TYPHOON_OCR_API_KEY") or os.getenv('TYPHOON_API_KEY') or os.getenv("OPENAI_API_KEY"))
604
+ messages = prepare_ocr_messages(
605
+ pdf_or_image_path=pdf_or_image_path,
606
+ task_type=task_type,
607
+ target_image_dim=target_image_dim,
608
+ target_text_length=target_text_length,
609
+ page_num=page_num if page_num else 1
610
+ )
611
+ response = openai.chat.completions.create(
612
+ model=model,
613
+ messages=messages,
614
+ max_tokens=16384,
615
+ extra_body={
616
+ "repetition_penalty": 1.2,
617
+ "temperature": 0.1,
618
+ "top_p": 0.6,
619
+ },
620
+ )
621
+ text_output = response.choices[0].message.content
622
+ text = json.loads(text_output)['natural_text']
623
+ return text
packages/typhoon_ocr/typhoon_ocr/pdf_utils.py ADDED
@@ -0,0 +1,46 @@
1
+ import shutil
2
+ import warnings
3
+
4
+ def check_pdf_utilities():
5
+ """
6
+ Check if the required Poppler utilities (pdfinfo and pdftoppm) are installed.
7
+
8
+ This function verifies if the necessary PDF utilities are available on the system
9
+ and provides helpful instructions if they are missing.
10
+
11
+ Returns:
12
+ bool: True if all required utilities are available, False otherwise.
13
+ """
14
+ missing_utils = []
15
+
16
+ # Check for pdfinfo
17
+ if shutil.which("pdfinfo") is None:
18
+ missing_utils.append("pdfinfo")
19
+
20
+ # Check for pdftoppm
21
+ if shutil.which("pdftoppm") is None:
22
+ missing_utils.append("pdftoppm")
23
+
24
+ if missing_utils:
25
+ warning_message = (
26
+ f"WARNING: The following required Poppler utilities are missing: {', '.join(missing_utils)}.\n"
27
+ "These utilities are required for PDF processing in Typhoon OCR.\n\n"
28
+ "Installation instructions:\n"
29
+ "- macOS: Run 'brew install poppler'\n"
30
+ "- Ubuntu/Debian: Run 'apt-get install poppler-utils'\n"
31
+ "- Windows: Install from https://github.com/oschwartz10612/poppler-windows/releases/ and add to PATH\n"
32
+ )
33
+ warnings.warn(warning_message, ImportWarning)
34
+ return False
35
+
36
+ return True
37
+
38
+ pdf_utils_available = check_pdf_utilities()
39
+ if not pdf_utils_available:
40
+ message = ('PDF utilities are not available.\n'
41
+ "Installation instructions for Poppler utilities:\n"
42
+ "- macOS: Run 'brew install poppler'\n"
43
+ "- Ubuntu/Debian: Run 'apt-get install poppler-utils'\n"
44
+ "- Windows: Install from https://github.com/oschwartz10612/poppler-windows/releases/ and add to PATH"
45
+ )
46
+ raise ImportError(message)
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
  openai
2
  python-dotenv
 
 
3
  gradio
4
- Pillow
5
- typhoon-ocr
 
1
  openai
2
  python-dotenv
3
+ ftfy
4
+ pypdf
5
  gradio
6
+ pillow
7
+ typhoon-ocr