Spaces:
Starting
Starting
| # DEPENDENCIES | |
| import hashlib | |
| from typing import Any | |
| from typing import List | |
| from typing import Dict | |
| from typing import Tuple | |
| from pathlib import Path | |
| from typing import Optional | |
| from datetime import datetime | |
| from config.models import DocumentType | |
| from utils.text_cleaner import TextCleaner | |
| from config.models import DocumentMetadata | |
| from config.logging_config import get_logger | |
| from utils.error_handler import PDFParseError | |
| from utils.error_handler import handle_errors | |
| from document_parser.ocr_engine import OCREngine | |
| try: | |
| import fitz | |
| PYMUPDF_AVAILABLE = True | |
| except ImportError: | |
| PYMUPDF_AVAILABLE = False | |
| try: | |
| import PyPDF2 | |
| from PyPDF2 import PdfReader | |
| PYPdf2_AVAILABLE = True | |
| except ImportError: | |
| PYPdf2_AVAILABLE = False | |
| # Setup Logging | |
| logger = get_logger(__name__) | |
| class PDFParser: | |
| """ | |
| Comprehensive PDF parsing with metadata extraction: Uses PyMuPDF (fitz) as primary parser with PyPDF2 fallback | |
| Handles various PDF formats including encrypted and scanned documents | |
| """ | |
| def __init__(self, prefer_pymupdf: bool = True): | |
| """ | |
| Initialize PDF parser. | |
| Arguments: | |
| ---------- | |
| prefer_pymupdf { bool } : Use PyMuPDF as primary parser if available | |
| """ | |
| self.logger = logger | |
| self.prefer_pymupdf = prefer_pymupdf and PYMUPDF_AVAILABLE | |
| self.ocr_engine = None | |
| try: | |
| from document_parser.ocr_engine import OCREngine | |
| self.ocr_available = True | |
| except ImportError: | |
| self.ocr_available = False | |
| self.logger.warning("OCR engine not available - scanned PDFs may not be processed") | |
| if (not PYMUPDF_AVAILABLE and not PYPdf2_AVAILABLE): | |
| raise ImportError("Neither PyMuPDF nor PyPDF2 are available. Please install at least one.") | |
| self.logger.info(f"PDF Parser initialized - Primary: {'PyMuPDF' if self.prefer_pymupdf else 'PyPDF2'}, PyMuPDF available: {PYMUPDF_AVAILABLE}, PyPDF2 available: {PYPdf2_AVAILABLE}") | |
| def parse(self, file_path: Path, extract_metadata: bool = True, clean_text: bool = True, password: Optional[str] = None) -> tuple[str, Optional[DocumentMetadata]]: | |
| """ | |
| Parse PDF and extract text and metadata : tries PyMuPDF first, falls back to PyPDF2 if needed | |
| Arguments: | |
| ---------- | |
| file_path { Path } : Path to PDF file | |
| extract_metadata { bool } : Extract document metadata | |
| clean_text { bool } : Clean extracted text | |
| password { str } : Password for encrypted PDFs | |
| Returns: | |
| -------- | |
| { tuple } : Tuple of (extracted_text, metadata) | |
| Raises: | |
| ------- | |
| PDFParseError : If parsing fails | |
| """ | |
| file_path = Path(file_path) | |
| if not file_path.exists(): | |
| raise PDFParseError(str(file_path), original_error = FileNotFoundError(f"PDF file not found: {file_path}")) | |
| self.logger.info(f"Parsing PDF: {file_path}") | |
| # Try PyMuPDF first if preferred and available | |
| if (self.prefer_pymupdf and PYMUPDF_AVAILABLE): | |
| try: | |
| parsed_text = self._parse_with_pymupdf(file_path = file_path, | |
| extract_metadata = extract_metadata, | |
| clean_text = clean_text, | |
| password = password, | |
| ) | |
| return parsed_text | |
| except Exception as e: | |
| self.logger.warning(f"PyMuPDF parsing failed for {file_path}, falling back to PyPDF2: {repr(e)}") | |
| # Fall back to PyPDF2 | |
| if PYPdf2_AVAILABLE: | |
| try: | |
| parsed_text = self._parse_with_pypdf2(file_path = file_path, | |
| extract_metadata = extract_metadata, | |
| clean_text = clean_text, | |
| password = password, | |
| ) | |
| return parsed_text | |
| except Exception as e: | |
| self.logger.error(f"PyPDF2 parsing also failed for {file_path}: {repr(e)}") | |
| raise PDFParseError(str(file_path), original_error = e) | |
| else: | |
| raise PDFParseError(str(file_path), original_error = RuntimeError("No PDF parsing libraries available")) | |
| def _parse_with_pymupdf(self, file_path: Path, extract_metadata: bool = True, clean_text: bool = True, password: Optional[str] = None) -> tuple[str, Optional[DocumentMetadata]]: | |
| """ | |
| Parse PDF using PyMuPDF (fitz) with OCR fallback for scanned documents | |
| """ | |
| self.logger.debug(f"Using PyMuPDF for parsing: {file_path}") | |
| doc = None | |
| try: | |
| # Open PDF with PyMuPDF | |
| self.logger.debug(f"Opening document: {file_path}") | |
| doc = fitz.open(str(file_path)) | |
| self.logger.debug(f"Document opened successfully, {len(doc)} pages") | |
| # Handle encrypted PDFs | |
| if (doc.needs_pass and password): | |
| if not doc.authenticate(password): | |
| raise PDFParseError(str(file_path), original_error = ValueError("Invalid password for encrypted PDF")) | |
| elif (doc.needs_pass and not password): | |
| raise PDFParseError(str(file_path), original_error = ValueError("PDF is encrypted but no password provided")) | |
| # Extract text with per-page OCR fallback | |
| text_content = self._extract_text_with_pymupdf(doc = doc, | |
| file_path = file_path, | |
| ) | |
| # Extract metadata | |
| metadata = None | |
| if extract_metadata: | |
| metadata = self._extract_metadata_with_pymupdf(doc = doc, | |
| file_path = file_path, | |
| ) | |
| # Clean text | |
| if clean_text: | |
| text_content = TextCleaner.clean(text_content, | |
| remove_html = True, | |
| normalize_whitespace = True, | |
| preserve_structure = True, | |
| ) | |
| self.logger.info(f"Successfully parsed PDF with PyMuPDF: {len(text_content)} characters, {len(doc)} pages") | |
| return text_content, metadata | |
| except Exception as e: | |
| self.logger.error(f"PyMuPDF parsing failed for {file_path}: {repr(e)}") | |
| raise | |
| finally: | |
| # Always close the document in finally block | |
| if doc: | |
| self.logger.debug("Closing PyMuPDF document") | |
| doc.close() | |
| def _parse_with_pypdf2(self, file_path: Path, extract_metadata: bool = True, clean_text: bool = True, password: Optional[str] = None) -> tuple[str, Optional[DocumentMetadata]]: | |
| """ | |
| Parse PDF using PyPDF2 | |
| Arguments: | |
| ---------- | |
| file_path { Path } : Path to PDF file | |
| extract_metadata { bool } : Extract document metadata | |
| clean_text { bool } : Clean extracted text | |
| password { str } : Password for encrypted PDFs | |
| Returns: | |
| -------- | |
| { tuple } : Tuple of (extracted_text, metadata) | |
| """ | |
| self.logger.debug(f"Using PyPDF2 for parsing: {file_path}") | |
| try: | |
| # Open PDF with PyPDF2 | |
| with open(file_path, 'rb') as pdf_file: | |
| reader = PdfReader(pdf_file) | |
| # Handle encrypted PDFs | |
| if reader.is_encrypted: | |
| if password: | |
| reader.decrypt(password) | |
| self.logger.info("Successfully decrypted PDF with PyPDF2") | |
| else: | |
| raise PDFParseError(str(file_path), original_error = ValueError("PDF is encrypted but no password provided")) | |
| # Extract text from all pages | |
| text_content = self._extract_text_with_pypdf2(reader = reader) | |
| # Extract metadata | |
| metadata = None | |
| if extract_metadata: | |
| metadata = self._extract_metadata_with_pypdf2(reader = reader, | |
| file_path = file_path, | |
| ) | |
| # Clean text | |
| if clean_text: | |
| text_content = TextCleaner.clean(text_content, | |
| remove_html = True, | |
| normalize_whitespace = True, | |
| preserve_structure = True, | |
| ) | |
| self.logger.info(f"Successfully parsed PDF with PyPDF2: {len(text_content)} characters, {len(reader.pages)} pages") | |
| return text_content, metadata | |
| except Exception as e: | |
| self.logger.error(f"PyPDF2 parsing failed for {file_path}: {repr(e)}") | |
| raise | |
| def _extract_text_with_pymupdf(self, doc: "fitz.Document", file_path: Path = None) -> str: | |
| """ | |
| Extract text from all pages using PyMuPDF with per-page OCR fallback. | |
| Arguments: | |
| ---------- | |
| doc : PyMuPDF document object | |
| file_path : Path to PDF file (for OCR fallback) | |
| Returns: | |
| -------- | |
| { str } : Combined text from all pages | |
| """ | |
| text_parts = list() | |
| for page_num in range(len(doc)): | |
| try: | |
| page = doc[page_num] | |
| page_text = page.get_text() | |
| if page_text and page_text.strip(): | |
| # Add page marker for citation purposes | |
| text_parts.append(f"\n[PAGE {page_num + 1}]\n{page_text}") | |
| self.logger.debug(f"Extracted {len(page_text)} chars from page {page_num + 1} with PyMuPDF") | |
| else: | |
| # No text extracted - this page might be scanned | |
| self.logger.warning(f"No text extracted from page {page_num + 1} with PyMuPDF (might be scanned)") | |
| # Try OCR for this specific page if available | |
| if self.ocr_available and file_path: | |
| try: | |
| self.logger.info(f"Attempting OCR for page {page_num + 1}") | |
| ocr_text = self._extract_page_text_with_ocr(file_path, page_num + 1) | |
| if ocr_text and ocr_text.strip(): | |
| text_parts.append(f"\n[PAGE {page_num + 1} - OCR]\n{ocr_text}") | |
| self.logger.info(f"OCR extracted {len(ocr_text)} chars from page {page_num + 1}") | |
| else: | |
| text_parts.append(f"\n[PAGE {page_num + 1} - NO TEXT]\n") | |
| self.logger.warning(f"OCR also failed to extract text from page {page_num + 1}") | |
| except Exception as ocr_error: | |
| self.logger.warning(f"OCR failed for page {page_num + 1}: {repr(ocr_error)}") | |
| text_parts.append(f"\n[PAGE {page_num + 1} - OCR FAILED]\n") | |
| else: | |
| # No OCR available or no file_path provided | |
| text_parts.append(f"\n[PAGE {page_num + 1} - NO TEXT]\n") | |
| except Exception as e: | |
| self.logger.warning(f"Error extracting text from page {page_num + 1} with PyMuPDF: {repr(e)}") | |
| text_parts.append(f"\n[PAGE {page_num + 1} - ERROR: {str(e)}]\n") | |
| continue | |
| return "\n".join(text_parts) | |
| def _extract_text_with_pypdf2(self, reader: PdfReader) -> str: | |
| """ | |
| Extract text from all pages using PyPDF2 | |
| Arguments: | |
| ---------- | |
| reader { PdfReader } : PdfReader object | |
| Returns: | |
| -------- | |
| { str } : Combined text from all pages | |
| """ | |
| text_parts = list() | |
| num_pages = len(reader.pages) | |
| for page_num in range(num_pages): | |
| try: | |
| page = reader.pages[page_num] | |
| page_text = page.extract_text() | |
| if page_text and page_text.strip(): | |
| # Add page marker for citation purposes | |
| text_parts.append(f"\n[PAGE {page_num + 1}]\n{page_text}") | |
| self.logger.debug(f"Extracted {len(page_text)} chars from page {page_num + 1} with PyPDF2") | |
| else: | |
| self.logger.warning(f"No text extracted from page {page_num + 1} with PyPDF2 (might be scanned)") | |
| except Exception as e: | |
| self.logger.warning(f"Error extracting text from page {page_num + 1} with PyPDF2: {repr(e)}") | |
| continue | |
| return "\n".join(text_parts) | |
| def _extract_metadata_with_pymupdf(self, doc: "fitz.Document", file_path: Path) -> DocumentMetadata: | |
| """ | |
| Extract metadata using PyMuPDF | |
| Arguments: | |
| ----------- | |
| doc { fitz.Document } : PyMuPDF document object | |
| file_path { Path } : Path to PDF file | |
| Returns: | |
| -------- | |
| { DocumentMetadata } : DocumentMetadata object | |
| """ | |
| # Get PDF metadata | |
| pdf_metadata = doc.metadata | |
| # Extract common fields | |
| title = pdf_metadata.get('title', '').strip() | |
| author = pdf_metadata.get('author', '').strip() | |
| # Parse dates | |
| created_date = self._parse_pdf_date(pdf_metadata.get('creationDate')) | |
| modified_date = self._parse_pdf_date(pdf_metadata.get('modDate')) | |
| # Get file size | |
| file_size = file_path.stat().st_size | |
| # Count pages | |
| num_pages = len(doc) | |
| # Generate document ID | |
| doc_hash = hashlib.md5(str(file_path).encode()).hexdigest() | |
| doc_id = f"doc_{int(datetime.now().timestamp())}_{doc_hash}" | |
| # Create metadata object | |
| metadata = DocumentMetadata(document_id = doc_id, | |
| filename = file_path.name, | |
| file_path = file_path, | |
| document_type = DocumentType.PDF, | |
| title = title or file_path.stem, | |
| author = author, | |
| created_date = created_date, | |
| modified_date = modified_date, | |
| file_size_bytes = file_size, | |
| num_pages = num_pages, | |
| extra = {"pdf_version" : pdf_metadata.get('producer', ''), | |
| "pdf_metadata" : {k: str(v) for k, v in pdf_metadata.items() if v}, | |
| "parser_used" : "pymupdf" | |
| } | |
| ) | |
| return metadata | |
| def _extract_metadata_with_pypdf2(self, reader: PdfReader, file_path: Path) -> DocumentMetadata: | |
| """ | |
| Extract metadata using PyPDF2 | |
| Arguments: | |
| ---------- | |
| reader { PdfReader } : PdfReader object | |
| file_path { Path } : Path to PDF file | |
| Returns: | |
| -------- | |
| { DocumentMetadata } : DocumentMetadata object | |
| """ | |
| # Get PDF metadata | |
| pdf_info = reader.metadata if reader.metadata else {} | |
| # Extract common fields | |
| title = self._get_metadata_field(pdf_info, ['/Title', 'title']) | |
| author = self._get_metadata_field(pdf_info, ['/Author', 'author']) | |
| # Parse dates | |
| created_date = self._parse_pdf_date(self._get_metadata_field(pdf_info, ['/CreationDate', 'creation_date'])) | |
| modified_date = self._parse_pdf_date(self._get_metadata_field(pdf_info, ['/ModDate', 'mod_date'])) | |
| # Get file size | |
| file_size = file_path.stat().st_size | |
| # Count pages | |
| num_pages = len(reader.pages) | |
| # Generate document ID | |
| doc_hash = hashlib.md5(str(file_path).encode()).hexdigest() | |
| doc_id = f"doc_{int(datetime.now().timestamp())}_{doc_hash}" | |
| # Create metadata object | |
| metadata = DocumentMetadata(document_id = doc_id, | |
| filename = file_path.name, | |
| file_path = file_path, | |
| document_type = DocumentType.PDF, | |
| title = title or file_path.stem, | |
| author = author, | |
| created_date = created_date, | |
| modified_date = modified_date, | |
| file_size_bytes = file_size, | |
| num_pages = num_pages, | |
| extra = {"pdf_version" : self._get_metadata_field(pdf_info, ['/Producer', 'producer']), | |
| "pdf_metadata" : {k: str(v) for k, v in pdf_info.items() if v}, | |
| "parser_used" : "pypdf2", | |
| } | |
| ) | |
| return metadata | |
| def _extract_text_with_ocr(self, file_path: Path) -> str: | |
| """ | |
| Extract text from scanned PDF using OCR | |
| """ | |
| if not self.ocr_available: | |
| raise PDFParseError(str(file_path), original_error = RuntimeError("OCR engine not available")) | |
| if self.ocr_engine is None: | |
| self.ocr_engine = OCREngine() | |
| return self.ocr_engine.extract_text_from_pdf(file_path) | |
| def _extract_page_text_with_ocr(self, file_path: Path, page_number: int) -> str: | |
| """ | |
| Extract text from a specific page using OCR | |
| Arguments: | |
| ---------- | |
| file_path { Path } : Path to PDF file | |
| page_number { int } : Page number (1-indexed) | |
| Returns: | |
| -------- | |
| { str } : Extracted text from the page | |
| """ | |
| if not self.ocr_available: | |
| raise PDFParseError(str(file_path), original_error = RuntimeError("OCR engine not available")) | |
| if self.ocr_engine is None: | |
| self.ocr_engine = OCREngine() | |
| try: | |
| # Use OCR engine to extract text from specific page | |
| return self.ocr_engine.extract_text_from_pdf(pdf_path = file_path, | |
| pages = [page_number], | |
| ) | |
| except Exception as e: | |
| self.logger.error(f"OCR failed for page {page_number}: {repr(e)}") | |
| return "" | |
| def _get_metadata_field(metadata: Dict, field_names: List[str]) -> Optional[str]: | |
| """ | |
| Get metadata field with fallback names | |
| Arguments: | |
| ---------- | |
| metadata { dict } : Metadata dictionary | |
| field_names { list } : List of possible field names | |
| Returns: | |
| -------- | |
| { str } : Field value or None | |
| """ | |
| for field_name in field_names: | |
| if field_name in metadata: | |
| value = metadata[field_name] | |
| if value: | |
| return str(value).strip() | |
| return None | |
| def _parse_pdf_date(date_str: Optional[str]) -> Optional[datetime]: | |
| """ | |
| Parse PDF date format : PDF dates are in format: D:YYYYMMDDHHmmSSOHH'mm' | |
| Arguments: | |
| ---------- | |
| date_str { str } : PDF date string | |
| Returns: | |
| -------- | |
| { datetime } : Datetime object or None | |
| """ | |
| if not date_str: | |
| return None | |
| try: | |
| # Remove 'D:' prefix if present | |
| if date_str.startswith('D:'): | |
| date_str = date_str[2:] | |
| # Parse basic format: YYYYMMDDHHMMSS | |
| date_str = date_str[:14] | |
| return datetime.strptime(date_str, '%Y%m%d%H%M%S') | |
| except Exception: | |
| return None | |
| def extract_page_text(self, file_path: Path, page_number: int, clean_text: bool = True) -> str: | |
| """ | |
| Extract text from a specific page | |
| Arguments: | |
| ---------- | |
| file_path { Path } : Path to PDF file | |
| page_number { int } : Page number (1-indexed) | |
| clean_text { bool } : Clean extracted text | |
| Returns: | |
| -------- | |
| { str } : Page text | |
| """ | |
| # Try PyMuPDF first if preferred and available | |
| if self.prefer_pymupdf and PYMUPDF_AVAILABLE: | |
| try: | |
| page_text = self._extract_page_text_pymupdf(file_path = file_path, | |
| page_number = page_number, | |
| clean_text = clean_text, | |
| ) | |
| return page_text | |
| except Exception as e: | |
| self.logger.warning(f"PyMuPDF page extraction failed, falling back to PyPDF2: {repr(e)}") | |
| # Fall back to PyPDF2 | |
| if PYPdf2_AVAILABLE: | |
| page_text = self._extract_page_text_pypdf2(file_path = file_path, | |
| pagse_number = page_number, | |
| clean_text = clean_text, | |
| ) | |
| return page_text | |
| else: | |
| raise PDFParseError(str(file_path), original_error = RuntimeError("No PDF parsing libraries available")) | |
| def _extract_page_text_pymupdf(self, file_path: Path, page_number: int, clean_text: bool = True) -> str: | |
| """ | |
| Extract page text using PyMuPDF | |
| """ | |
| doc = None | |
| try: | |
| doc = fitz.open(str(file_path)) | |
| num_pages = len(doc) | |
| if ((page_number < 1) or (page_number > num_pages)): | |
| raise ValueError(f"Page number {page_number} out of range (1-{num_pages})") | |
| page = doc[page_number - 1] | |
| page_text = page.get_text() | |
| if clean_text: | |
| page_text = TextCleaner.clean(page_text) | |
| return page_text | |
| except Exception as e: | |
| self.logger.error(f"Failed to extract page {page_number} with PyMuPDF: {repr(e)}") | |
| raise PDFParseError(str(file_path), original_error = e) | |
| finally: | |
| if doc: | |
| doc.close() | |
| def _extract_page_text_pypdf2(self, file_path: Path, page_number: int, clean_text: bool = True) -> str: | |
| """ | |
| Extract page text using PyPDF2 | |
| """ | |
| try: | |
| with open(file_path, 'rb') as pdf_file: | |
| reader = PdfReader(pdf_file) | |
| num_pages = len(reader.pages) | |
| if ((page_number < 1) or (page_number > num_pages)): | |
| raise ValueError(f"Page number {page_number} out of range (1-{num_pages})") | |
| page = reader.pages[page_number - 1] | |
| page_text = page.extract_text() | |
| if clean_text: | |
| page_text = TextCleaner.clean(page_text) | |
| return page_text | |
| except Exception as e: | |
| self.logger.error(f"Failed to extract page {page_number} with PyPDF2: {repr(e)}") | |
| raise PDFParseError(str(file_path), original_error = e) | |
| def get_page_count(self, file_path: Path) -> int: | |
| """ | |
| Get number of pages in PDF | |
| Arguments: | |
| ---------- | |
| file_path { Path } : Path to PDF file | |
| Returns: | |
| -------- | |
| { int } : Number of pages | |
| """ | |
| # Try PyMuPDF first if available | |
| if PYMUPDF_AVAILABLE: | |
| doc = None | |
| try: | |
| doc = fitz.open(str(file_path)) | |
| page_count = len(doc) | |
| return page_count | |
| except Exception as e: | |
| self.logger.warning(f"PyMuPDF page count failed, trying PyPDF2: {repr(e)}") | |
| finally: | |
| if doc: | |
| doc.close() | |
| # Fall back to PyPDF2 | |
| if PYPdf2_AVAILABLE: | |
| try: | |
| with open(file_path, 'rb') as pdf_file: | |
| reader = PdfReader(pdf_file) | |
| return len(reader.pages) | |
| except Exception as e: | |
| self.logger.error(f"Failed to get page count: {repr(e)}") | |
| raise PDFParseError(str(file_path), original_error = e) | |
| else: | |
| raise PDFParseError(str(file_path), original_error = RuntimeError("No PDF parsing libraries available")) | |
| def extract_page_range(self, file_path: Path, start_page: int, end_page: int, clean_text: bool = True) -> str: | |
| """ | |
| Extract text from a range of pages | |
| Arguments: | |
| ---------- | |
| file_path { Path } : Path to PDF file | |
| start_page { int } : Starting page (1-indexed, inclusive) | |
| end_page { int } : Ending page (1-indexed, inclusive) | |
| clean_text { bool } : Clean extracted text | |
| Returns: | |
| -------- | |
| { str } : Combined text from pages | |
| """ | |
| # Try PyMuPDF first if preferred and available | |
| if self.prefer_pymupdf and PYMUPDF_AVAILABLE: | |
| try: | |
| page_range = self._extract_page_range_pymupdf(file_path = file_path, | |
| start_page = start_page, | |
| end_page = end_page, | |
| clean_text = clean_text, | |
| ) | |
| return page_range | |
| except Exception as e: | |
| self.logger.warning(f"PyMuPDF page range extraction failed, falling back to PyPDF2: {repr(e)}") | |
| # Fall back to PyPDF2 | |
| if PYPdf2_AVAILABLE: | |
| page_range = self._extract_page_range_pypdf2(file_path = file_path, | |
| start_page = start_page, | |
| end_page = end_page, | |
| clean_text = clean_text, | |
| ) | |
| return page_range | |
| else: | |
| raise PDFParseError(str(file_path), original_error = RuntimeError("No PDF parsing libraries available")) | |
| def _extract_page_range_pymupdf(self, file_path: Path, start_page: int, end_page: int, clean_text: bool = True) -> str: | |
| """ | |
| Extract page range using PyMuPDF | |
| """ | |
| doc = None | |
| try: | |
| doc = fitz.open(str(file_path)) | |
| num_pages = len(doc) | |
| if ((start_page < 1) or (end_page > num_pages) or (start_page > end_page)): | |
| raise ValueError(f"Invalid page range {start_page}-{end_page} for PDF with {num_pages} pages") | |
| text_parts = list() | |
| for page_num in range(start_page - 1, end_page): | |
| page = doc[page_num] | |
| page_text = page.get_text() | |
| if page_text: | |
| text_parts.append(f"\n[PAGE {page_num + 1}]\n{page_text}") | |
| combined_text = "\n".join(text_parts) | |
| if clean_text: | |
| combined_text = TextCleaner.clean(combined_text) | |
| return combined_text | |
| except Exception as e: | |
| self.logger.error(f"Failed to extract page range with PyMuPDF: {repr(e)}") | |
| raise PDFParseError(str(file_path), original_error = e) | |
| finally: | |
| if doc: | |
| doc.close() | |
| def _extract_page_range_pypdf2(self, file_path: Path, start_page: int, end_page: int, clean_text: bool = True) -> str: | |
| """ | |
| Extract page range using PyPDF2 | |
| """ | |
| try: | |
| with open(file_path, 'rb') as pdf_file: | |
| reader = PdfReader(pdf_file) | |
| num_pages = len(reader.pages) | |
| if ((start_page < 1) or (end_page > num_pages) or (start_page > end_page)): | |
| raise ValueError(f"Invalid page range {start_page}-{end_page} for PDF with {num_pages} pages") | |
| text_parts = list() | |
| for page_num in range(start_page - 1, end_page): | |
| page = reader.pages[page_num] | |
| page_text = page.extract_text() | |
| if page_text: | |
| text_parts.append(f"\n[PAGE {page_num + 1}]\n{page_text}") | |
| combined_text = "\n".join(text_parts) | |
| if clean_text: | |
| combined_text = TextCleaner.clean(combined_text) | |
| return combined_text | |
| except Exception as e: | |
| self.logger.error(f"Failed to extract page range with PyPDF2: {repr(e)}") | |
| raise PDFParseError(str(file_path), original_error = e) | |
| def is_scanned(self, file_path: Path) -> bool: | |
| """ | |
| Check if PDF is scanned (image-based): Scanned PDFs have very little or no extractable text | |
| Arguments: | |
| ---------- | |
| file_path { Path } : Path to PDF file | |
| Returns: | |
| -------- | |
| { bool } : True if appears to be scanned | |
| """ | |
| # Try PyMuPDF first if available for better detection | |
| if PYMUPDF_AVAILABLE: | |
| try: | |
| return self._is_scanned_pymupdf(file_path = file_path) | |
| except Exception as e: | |
| self.logger.warning(f"PyMuPDF scanned detection failed, trying PyPDF2: {repr(e)}") | |
| # Fall back to PyPDF2 | |
| if PYPdf2_AVAILABLE: | |
| return self._is_scanned_pypdf2(file_path = file_path) | |
| else: | |
| self.logger.warning("No PDF parsing libraries available for scanned detection") | |
| return False | |
| def _is_scanned_pymupdf(self, file_path: Path) -> bool: | |
| """ | |
| Check if PDF is scanned using PyMuPDF | |
| """ | |
| doc = None | |
| try: | |
| doc = fitz.open(str(file_path)) | |
| # Sample first 3 pages | |
| pages_to_check = min(3, len(doc)) | |
| total_text_length = 0 | |
| for i in range(pages_to_check): | |
| page = doc[i] | |
| text = page.get_text() | |
| total_text_length += len(text.strip()) | |
| # If average text per page is very low, likely scanned | |
| avg_text_per_page = total_text_length / pages_to_check | |
| # characters per page | |
| threshold = 100 | |
| is_scanned = (avg_text_per_page < threshold) | |
| if is_scanned: | |
| self.logger.info(f"PDF appears to be scanned (avg {avg_text_per_page:.0f} chars/page)") | |
| return is_scanned | |
| except Exception as e: | |
| self.logger.warning(f"Could not determine if PDF is scanned with PyMuPDF: {repr(e)}") | |
| return False | |
| finally: | |
| if doc: | |
| doc.close() | |
| def _is_scanned_pypdf2(self, file_path: Path) -> bool: | |
| """ | |
| Check if PDF is scanned using PyPDF2 | |
| """ | |
| try: | |
| with open(file_path, 'rb') as pdf_file: | |
| reader = PdfReader(pdf_file) | |
| # Sample first 3 pages | |
| pages_to_check = min(3, len(reader.pages)) | |
| total_text_length = 0 | |
| for i in range(pages_to_check): | |
| page = reader.pages[i] | |
| text = page.extract_text() | |
| total_text_length += len(text.strip()) | |
| # If average text per page is very low, likely scanned | |
| avg_text_per_page = total_text_length / pages_to_check | |
| # characters per page | |
| threshold = 100 | |
| is_scanned = (avg_text_per_page < threshold) | |
| if is_scanned: | |
| self.logger.info(f"PDF appears to be scanned (avg {avg_text_per_page:.0f} chars/page)") | |
| return is_scanned | |
| except Exception as e: | |
| self.logger.warning(f"Could not determine if PDF is scanned with PyPDF2: {repr(e)}") | |
| return False |