Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from extractous import Extractor, TesseractOcrConfig | |
def extract_document(file):
    """Extract text and metadata from an uploaded document.

    Args:
        file: The value supplied by the Gradio ``File`` input. ``None``
            means nothing was uploaded. Otherwise it is expected to be a
            filesystem path, or an object exposing the path through a
            ``name`` attribute.

    Returns:
        A ``(text, metadata)`` pair of strings. When no file was uploaded,
        or when extraction raises, the first element is a human-readable
        message and the second is ``"No metadata available"``.
    """
    import os

    if file is None:
        return "Please upload a file", "No metadata available"

    # NOTE(review): depending on Gradio version/config the upload may arrive
    # as a tempfile-like wrapper instead of a plain path; unwrap it so the
    # extractor is always handed a path. Confirm against the deployed
    # Gradio version.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = getattr(file, "name", file)

    try:
        # Create an extractor with default settings
        extractor = Extractor()
        # Enable English OCR for image-based or scanned documents
        extractor = extractor.set_ocr_config(
            TesseractOcrConfig().set_language("eng")
        )
        # Extract text and metadata
        result, metadata = extractor.extract_file_to_string(path)
        return result, str(metadata)
    except Exception as e:
        # UI boundary: report the failure in the output boxes instead of
        # letting the exception propagate to the web front end.
        return f"Error extracting document: {e}", "No metadata available"
# Build the Gradio interface: one file upload in, two text boxes out
# (extracted text and stringified metadata), wired to extract_document.
demo = gr.Interface(
    fn=extract_document,
    inputs=gr.File(label="Upload Document"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=10),
        gr.Textbox(label="Metadata", lines=3),
    ],
    # Title spelling matches the library name used in the description below.
    title="Extractous Demo",
    description="""
Upload a document to extract its text content and metadata using [Extractous](https://github.com/yobix-ai/extractous).
**Supported formats include:**
- PDF files (with OCR support)
- Microsoft Office (DOC, DOCX, PPT, PPTX, etc.)
- Web Documents (HTML, XML)
- Text Files (TXT, Markdown)
- Images (with OCR capability)
- And more
""",
    article="""
This demo showcases document text and metadata extraction capabilities.
For more information, visit [Extractous on GitHub](https://github.com/yobix-ai/extractous).
""",
    # NOTE(review): this example path is relative; presumably the PDF must
    # be shipped in the app directory for the example to load — confirm.
    examples=[
        ["2412.13663v2.pdf"],  # Add example files to demo directory
    ],
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()