Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| import fitz # PyMuPDF | |
| # from ...core.processor import BaseFileProcessor | |
| from ...core.file_type import FileType | |
| from ...core.decorators import processor_register | |
| from ...core.exceptions import FileProcessingError | |
| from ...document import Document, ProcessStatus | |
| from lpm_kernel.file_data.processors.processor import BaseFileProcessor | |
| class PDFProcessor(BaseFileProcessor): | |
| SUPPORTED_TYPES = {FileType.PDF} | |
| def _process_file(cls, file_path: Path, doc: Document) -> Document: | |
| try: | |
| with fitz.open(file_path) as pdf: | |
| text = "" | |
| for page in pdf: | |
| text += page.get_text() | |
| doc.raw_content = text | |
| doc.extract_status = ProcessStatus.SUCCESS | |
| except Exception as e: | |
| doc.extract_status = ProcessStatus.FAILED | |
| raise FileProcessingError(f"Failed to process PDF: {str(e)}") | |
| return doc | |