"""
Example: Document Processing Pipeline

Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
"""
|
|
import asyncio
from collections import Counter
from pathlib import Path

from loguru import logger

from src.document.pipeline import (
    PipelineConfig,
    DocumentProcessor,
    process_document,
)
from src.document.ocr import OCRConfig
|
|
|
|
def example_basic_processing(sample_doc: Path = Path("./data/sample.pdf")) -> None:
    """Process one document with the basic pipeline and print a short report.

    The pipeline is configured with the ``paddleocr`` OCR engine,
    ``render_dpi=300`` and ``max_pages=5``. After processing, the document
    metadata and a preview of the first three chunks are printed.

    Args:
        sample_doc: Path of the document to process. Defaults to
            ``./data/sample.pdf``.

    Returns:
        None. When ``sample_doc`` does not exist, a hint is printed and the
        function returns before any pipeline object is created.
    """
    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Guard first: there is no point building the OCR pipeline when there is
    # nothing to feed into it.
    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print(f"Create a sample PDF at {sample_doc} to run this example")
        return

    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,
    )
    processor = DocumentProcessor(config)
    result = processor.process(sample_doc)

    # Document-level summary.
    meta = result.metadata
    print(f"\nDocument: {meta.filename}")
    print(f"Pages: {meta.num_pages}")
    print(f"Chunks: {meta.total_chunks}")
    print(f"Characters: {meta.total_characters}")
    print(f"OCR Confidence: {meta.ocr_confidence_avg:.2%}")

    print("\n--- Sample Chunks ---")
    for number, chunk in enumerate(result.chunks[:3], start=1):
        box = chunk.bbox
        # Append the ellipsis only when the text was actually cut, so short
        # chunks are not shown as if they had been truncated.
        preview = chunk.text[:200] + ("..." if len(chunk.text) > 200 else "")
        # chunk.page is shown as page + 1, i.e. as a 1-based page number.
        print(f"\n[Chunk {number}] Type: {chunk.chunk_type.value}, Page: {chunk.page + 1}")
        print(f"Text: {preview}")
        print(
            f"BBox: ({box.x_min:.0f}, {box.y_min:.0f}) -> "
            f"({box.x_max:.0f}, {box.y_max:.0f})"
        )
|
|
|
|
def example_with_layout(sample_doc: Path = Path("./data/sample.pdf")) -> None:
    """Process one document with layout analysis enabled and summarise it.

    The pipeline is configured with the ``paddleocr`` OCR engine, a
    ``rule_based`` layout config and ``include_layout_regions=True``. The
    function prints how many regions of each layout type were found and,
    for up to two table regions, their page, position and size.

    Args:
        sample_doc: Path of the document to process. Defaults to
            ``./data/sample.pdf``.

    Returns:
        None. When ``sample_doc`` does not exist, a message is printed and
        the function returns before the layout module is imported or any
        pipeline object is created.
    """
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    # Guard first so a missing sample never triggers the import or the
    # pipeline construction below.
    if not sample_doc.exists():
        print("Sample document not found")
        return

    # Kept function-local, as in the original example; only this example
    # needs the layout module.
    from src.document.layout import LayoutConfig, LayoutType

    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )
    processor = DocumentProcessor(config)
    result = processor.process(sample_doc)

    # Tally regions per layout type (keyed by the enum's value).
    layout_counts = Counter(
        region.layout_type.value for region in result.layout_regions
    )

    print("\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f" {layout_type}: {count} regions")

    # Show details for at most the first two table regions.
    tables = [r for r in result.layout_regions if r.layout_type == LayoutType.TABLE]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        for number, table in enumerate(tables[:2], start=1):
            box = table.bbox
            print(f"\nTable {number}: Page {table.page + 1}")
            print(f" Position: ({box.x_min:.0f}, {box.y_min:.0f})")
            print(f" Size: {box.width:.0f} x {box.height:.0f}")
|
|
|
|
def example_convenience_function(sample_doc: Path = Path("./data/sample.pdf")) -> None:
    """Process one document through ``process_document`` and print a preview.

    Calls the ``process_document`` helper with no explicit configuration,
    then prints the file name, the number of chunks and up to the first 500
    characters of the extracted text.

    Args:
        sample_doc: Path of the document to process. Defaults to
            ``./data/sample.pdf``.

    Returns:
        None. When ``sample_doc`` does not exist, a message is printed and
        the function returns without calling ``process_document``.
    """
    print("\n" + "=" * 50)
    print("Using Convenience Function")
    print("=" * 50)

    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = process_document(sample_doc)

    print(f"Processed: {result.metadata.filename}")
    print(f"Chunks: {len(result.chunks)}")
    print("\nFull text preview:")

    # Add the ellipsis only when the text is longer than the preview window.
    full_text = result.full_text
    if len(full_text) > 500:
        preview = full_text[:500] + "..."
    else:
        preview = full_text
    print(preview)
|
|
|
|
if __name__ == "__main__":
    # Run the examples in order: basic processing, layout analysis, then the
    # one-call convenience helper. Each prints its own banner.
    for run_example in (
        example_basic_processing,
        example_with_layout,
        example_convenience_function,
    ):
        run_example()
|
|