File size: 988 Bytes
01d5a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from pathlib import Path
import fitz  # PyMuPDF

# from ...core.processor import BaseFileProcessor
from ...core.file_type import FileType
from ...core.decorators import processor_register
from ...core.exceptions import FileProcessingError
from ...document import Document, ProcessStatus
from lpm_kernel.file_data.processors.processor import BaseFileProcessor


@processor_register
class PDFProcessor(BaseFileProcessor):
    """File processor that extracts text from PDF files using PyMuPDF."""

    SUPPORTED_TYPES = {FileType.PDF}

    @classmethod
    def _process_file(cls, file_path: Path, doc: Document) -> Document:
        """Read the text of every page of a PDF into ``doc``.

        Args:
            file_path: Path of the PDF file to open.
            doc: Document record to update with the extraction result.

        Returns:
            The same ``doc`` object, with ``raw_content`` set to the
            concatenated page text (in page order) and ``extract_status``
            set to ``ProcessStatus.SUCCESS``.

        Raises:
            FileProcessingError: If opening or reading the PDF fails for any
                reason. ``doc.extract_status`` is set to
                ``ProcessStatus.FAILED`` before raising, and the original
                exception is attached as ``__cause__``.
        """
        try:
            with fitz.open(file_path) as pdf:
                # Build the text with one join rather than repeated ``+=`` so
                # large documents do not pay for repeated string copies.
                text = "".join(page.get_text() for page in pdf)

            doc.raw_content = text
            doc.extract_status = ProcessStatus.SUCCESS

        except Exception as e:
            # Boundary handler: any failure is reported to callers as a
            # FileProcessingError, chained so the root cause stays visible.
            doc.extract_status = ProcessStatus.FAILED
            raise FileProcessingError(f"Failed to process PDF: {e}") from e

        return doc