| """ |
| Document Processing CLI Commands |
| |
| Commands: |
| sparknet document parse <file> - Parse and extract text from document |
| sparknet document extract <file> - Extract structured fields |
| sparknet document classify <file> - Classify document type |
| sparknet document analyze <file> - Full document analysis |
| """ |
|
|
| import typer |
| from typing import Optional, List |
| from pathlib import Path |
| import json |
| import sys |
|
|
| |
# Typer sub-application that groups the document-processing commands
# (parse, extract, classify, ask) under the `document` name.
document_app = typer.Typer(
    help="Document processing commands",
    name="document",
)
|
|
|
|
| @document_app.command("parse") |
| def parse_document( |
| file_path: Path = typer.Argument(..., help="Path to document file"), |
| output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
| ocr_engine: str = typer.Option("paddleocr", "--ocr", help="OCR engine: paddleocr, tesseract"), |
| dpi: int = typer.Option(300, "--dpi", help="Rendering DPI for PDFs"), |
| max_pages: Optional[int] = typer.Option(None, "--max-pages", help="Maximum pages to process"), |
| include_images: bool = typer.Option(False, "--images", help="Include cropped region images"), |
| ): |
| """ |
| Parse a document and extract text with layout information. |
| |
| Example: |
| sparknet document parse invoice.pdf -o result.json |
| """ |
| from loguru import logger |
|
|
| if not file_path.exists(): |
| typer.echo(f"Error: File not found: {file_path}", err=True) |
| raise typer.Exit(1) |
|
|
| typer.echo(f"Parsing document: {file_path}") |
|
|
| try: |
| from ..document.pipeline import ( |
| PipelineConfig, |
| get_document_processor, |
| ) |
| from ..document.ocr import OCRConfig |
|
|
| |
| ocr_config = OCRConfig(engine=ocr_engine) |
| config = PipelineConfig( |
| ocr=ocr_config, |
| render_dpi=dpi, |
| max_pages=max_pages, |
| ) |
|
|
| |
| processor = get_document_processor(config) |
| result = processor.process(str(file_path)) |
|
|
| |
| output_data = { |
| "document_id": result.metadata.document_id, |
| "filename": result.metadata.filename, |
| "num_pages": result.metadata.num_pages, |
| "total_chunks": result.metadata.total_chunks, |
| "total_characters": result.metadata.total_characters, |
| "ocr_confidence": result.metadata.ocr_confidence_avg, |
| "chunks": [ |
| { |
| "chunk_id": c.chunk_id, |
| "type": c.chunk_type.value, |
| "page": c.page, |
| "text": c.text[:500] + "..." if len(c.text) > 500 else c.text, |
| "confidence": c.confidence, |
| "bbox": { |
| "x_min": c.bbox.x_min, |
| "y_min": c.bbox.y_min, |
| "x_max": c.bbox.x_max, |
| "y_max": c.bbox.y_max, |
| }, |
| } |
| for c in result.chunks |
| ], |
| "full_text": result.full_text[:2000] + "..." if len(result.full_text) > 2000 else result.full_text, |
| } |
|
|
| |
| if output: |
| with open(output, "w") as f: |
| json.dump(output_data, f, indent=2) |
| typer.echo(f"Results written to: {output}") |
| else: |
| typer.echo(json.dumps(output_data, indent=2)) |
|
|
| typer.echo(f"\nProcessed {result.metadata.num_pages} pages, {len(result.chunks)} chunks") |
|
|
| except ImportError as e: |
| typer.echo(f"Error: Missing dependency - {e}", err=True) |
| raise typer.Exit(1) |
| except Exception as e: |
| typer.echo(f"Error processing document: {e}", err=True) |
| raise typer.Exit(1) |
|
|
|
|
| @document_app.command("extract") |
| def extract_fields( |
| file_path: Path = typer.Argument(..., help="Path to document file"), |
| schema: Optional[Path] = typer.Option(None, "--schema", "-s", help="Extraction schema YAML file"), |
| fields: Optional[List[str]] = typer.Option(None, "--field", "-f", help="Fields to extract (can use multiple)"), |
| output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
| validate: bool = typer.Option(True, "--validate/--no-validate", help="Validate extraction"), |
| ): |
| """ |
| Extract structured fields from a document. |
| |
| Example: |
| sparknet document extract invoice.pdf -f "invoice_number" -f "total_amount" |
| sparknet document extract contract.pdf --schema contract_schema.yaml |
| """ |
| from loguru import logger |
|
|
| if not file_path.exists(): |
| typer.echo(f"Error: File not found: {file_path}", err=True) |
| raise typer.Exit(1) |
|
|
| if not schema and not fields: |
| typer.echo("Error: Provide --schema or --field options", err=True) |
| raise typer.Exit(1) |
|
|
| typer.echo(f"Extracting fields from: {file_path}") |
|
|
| try: |
| from ..document.schemas.extraction import ExtractionSchema, FieldDefinition |
| from ..agents.document_agent import DocumentAgent |
|
|
| |
| if schema: |
| import yaml |
| with open(schema) as f: |
| schema_data = yaml.safe_load(f) |
| extraction_schema = ExtractionSchema(**schema_data) |
| else: |
| |
| field_defs = [ |
| FieldDefinition( |
| name=f, |
| field_type="string", |
| required=True, |
| ) |
| for f in fields |
| ] |
| extraction_schema = ExtractionSchema( |
| name="cli_extraction", |
| fields=field_defs, |
| ) |
|
|
| |
| import asyncio |
| agent = DocumentAgent() |
| asyncio.run(agent.load_document(str(file_path))) |
| result = asyncio.run(agent.extract_fields(extraction_schema)) |
|
|
| |
| output_data = { |
| "document": str(file_path), |
| "fields": result.fields, |
| "confidence": result.confidence, |
| "evidence": [ |
| { |
| "chunk_id": e.chunk_id, |
| "page": e.page, |
| "snippet": e.snippet, |
| } |
| for e in result.evidence |
| ] if result.evidence else [], |
| } |
|
|
| |
| if validate and result.fields: |
| from ..document.validation import get_extraction_critic |
| critic = get_extraction_critic() |
|
|
| evidence_chunks = [ |
| {"text": e.snippet, "page": e.page, "chunk_id": e.chunk_id} |
| for e in result.evidence |
| ] if result.evidence else [] |
|
|
| validation = critic.validate_extraction(result.fields, evidence_chunks) |
| output_data["validation"] = { |
| "status": validation.overall_status.value, |
| "confidence": validation.overall_confidence, |
| "should_accept": validation.should_accept, |
| "abstain_reason": validation.abstain_reason, |
| } |
|
|
| |
| if output: |
| with open(output, "w") as f: |
| json.dump(output_data, f, indent=2) |
| typer.echo(f"Results written to: {output}") |
| else: |
| typer.echo(json.dumps(output_data, indent=2)) |
|
|
| except ImportError as e: |
| typer.echo(f"Error: Missing dependency - {e}", err=True) |
| raise typer.Exit(1) |
| except Exception as e: |
| typer.echo(f"Error extracting fields: {e}", err=True) |
| raise typer.Exit(1) |
|
|
|
|
| @document_app.command("classify") |
| def classify_document( |
| file_path: Path = typer.Argument(..., help="Path to document file"), |
| output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
| ): |
| """ |
| Classify document type. |
| |
| Example: |
| sparknet document classify document.pdf |
| """ |
| from loguru import logger |
|
|
| if not file_path.exists(): |
| typer.echo(f"Error: File not found: {file_path}", err=True) |
| raise typer.Exit(1) |
|
|
| typer.echo(f"Classifying document: {file_path}") |
|
|
| try: |
| from ..agents.document_agent import DocumentAgent |
| import asyncio |
|
|
| agent = DocumentAgent() |
| asyncio.run(agent.load_document(str(file_path))) |
| classification = asyncio.run(agent.classify()) |
|
|
| output_data = { |
| "document": str(file_path), |
| "document_type": classification.document_type.value, |
| "confidence": classification.confidence, |
| "reasoning": classification.reasoning, |
| "metadata": classification.metadata, |
| } |
|
|
| if output: |
| with open(output, "w") as f: |
| json.dump(output_data, f, indent=2) |
| typer.echo(f"Results written to: {output}") |
| else: |
| typer.echo(json.dumps(output_data, indent=2)) |
|
|
| except Exception as e: |
| typer.echo(f"Error classifying document: {e}", err=True) |
| raise typer.Exit(1) |
|
|
|
|
| @document_app.command("ask") |
| def ask_document( |
| file_path: Path = typer.Argument(..., help="Path to document file"), |
| question: str = typer.Argument(..., help="Question to ask about the document"), |
| output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
| ): |
| """ |
| Ask a question about a document. |
| |
| Example: |
| sparknet document ask invoice.pdf "What is the total amount?" |
| """ |
| from loguru import logger |
|
|
| if not file_path.exists(): |
| typer.echo(f"Error: File not found: {file_path}", err=True) |
| raise typer.Exit(1) |
|
|
| typer.echo(f"Processing question for: {file_path}") |
|
|
| try: |
| from ..agents.document_agent import DocumentAgent |
| import asyncio |
|
|
| agent = DocumentAgent() |
| asyncio.run(agent.load_document(str(file_path))) |
| answer, evidence = asyncio.run(agent.answer_question(question)) |
|
|
| output_data = { |
| "document": str(file_path), |
| "question": question, |
| "answer": answer, |
| "evidence": [ |
| { |
| "chunk_id": e.chunk_id, |
| "page": e.page, |
| "snippet": e.snippet, |
| "confidence": e.confidence, |
| } |
| for e in evidence |
| ] if evidence else [], |
| } |
|
|
| if output: |
| with open(output, "w") as f: |
| json.dump(output_data, f, indent=2) |
| typer.echo(f"Results written to: {output}") |
| else: |
| typer.echo(f"\nQuestion: {question}") |
| typer.echo(f"\nAnswer: {answer}") |
| if evidence: |
| typer.echo(f"\nEvidence ({len(evidence)} sources):") |
| for e in evidence[:3]: |
| typer.echo(f" - Page {e.page + 1}: {e.snippet[:100]}...") |
|
|
| except Exception as e: |
| typer.echo(f"Error processing question: {e}", err=True) |
| raise typer.Exit(1) |
|
|