Spaces:
Running
Running
import os

# Redirect cache to a writable path inside container.
# NOTE(review): this is set before the third-party imports below, presumably
# so that any library resolving its cache directory at import or
# initialisation time picks it up — confirm before reordering.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr
from impresso_pipelines.ocrqa import OCRQAPipeline

# One module-level pipeline instance, created at import time and called by
# process_ocr_qa for every request.
pipeline = OCRQAPipeline()
# Stylesheet for the app: the element with id "ocr_output" (the assessment
# output box) renders its textarea in a monospaced font.
CSS = """
#ocr_output textarea {
font-family: monospace !important;
}
"""

# Language codes offered in the language dropdown.
LANGUAGES = [
    "en",
    "de",
    "fr",
    "lb",
]

# Pre-filled sample input: German text with typical OCR errors.
EXAMPLE_TEXT = """Vieles Seltsame geschieht auf Erden :
Nichts Seltsameres sieht der Mond
Als das Glück, das im Knopfloch wohnt.
Zaubrisch faßt es den ernsten Mann.
Ohne nach Weib u. Kinjd zu fragen
Reitet er aus, nach dem Glück zu jagen,
Nur nacb ihm war stets sein Vegehr.
Aber neben ihm 1reitet der Dämon her
Des Ehrgeizes mit finsterer Tücke,
Und so jagt er zuletzt auf die Brücke,
Die über dem Abgrund, d:m nächtlich schwarzen
Jählings abbricht."""
def _format_ocr_qa_result(result):
    """Render a pipeline result dict as the multi-section report text.

    Sections (each emitted only when the corresponding data is present):
    language, quality score with known/total ratio, known tokens,
    unrecognized tokens (or an "all known" note), model id, and any
    remaining top-level fields of ``result``.
    """
    output_lines = []

    # Language detection
    if "language" in result:
        output_lines.append(f"🌍 Language: {result['language']}")

    # Normalise diagnostics up front: a missing or None entry becomes an empty
    # container, so the token counts are always defined — including when the
    # result carries diagnostics but no score.
    diagnostics = result.get("diagnostics") or {}
    known_tokens = diagnostics.get("known_tokens") or []
    unknown_tokens = diagnostics.get("unknown_tokens") or []
    k = len(known_tokens)
    u = len(unknown_tokens)
    total = k + u

    # Quality score (with ratio)
    score = result.get("score")
    if score is not None:
        ratio = f"{k}/{total}" if total > 0 else "n/a"
        score_emoji = "🟢" if score >= 0.8 else "🟡" if score >= 0.5 else "🔴"
        # With no tokens at all the score is shown as "n/a" rather than a number.
        output_lines.append(
            f"{score_emoji} Quality Score: {score if total else 'n/a'} ({ratio})"
        )

    # Known tokens
    if known_tokens:
        joined_known = " ".join(known_tokens)
        output_lines.append(f"✅ Known unique tokens ({k}):\n{joined_known}")

    # Unknown tokens (potential OCR errors)
    if unknown_tokens:
        joined_unknown = " ".join(unknown_tokens)
        output_lines.append(f"❌ Unrecognized unique tokens ({u}):\n{joined_unknown}")
    elif "unknown_tokens" in diagnostics and total > 0:
        # The key is present but empty and there were tokens to check.
        output_lines.append("✨ All tokens were known – no OCR errors detected.")

    # Model information
    if "model_id" in diagnostics:
        output_lines.append(f"🤖 Model: {diagnostics['model_id']}")

    # Other fields
    for key, value in result.items():
        if key not in ("language", "score", "diagnostics"):
            output_lines.append(f"🔍 {key.replace('_', ' ').title()}: {value}")

    return "\n\n".join(output_lines)


def process_ocr_qa(text, lang_choice):
    """Run the OCR QA pipeline on ``text`` and format the outcome for display.

    Args:
        text: The OCR text to assess.
        lang_choice: Language selection passed on to the pipeline; the
            special value ``"Auto-detect"`` is forwarded as ``None``.

    Returns:
        A human-readable report string. A non-dict pipeline result is shown
        as-is under a "Processed Result" heading. Any exception raised while
        running or formatting is printed and returned as ``"Error: ..."``
        instead of propagating to the UI.
    """
    try:
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, language=lang, diagnostics=True)

        if not isinstance(result, dict):
            return f"✨ Processed Result:\n{result}"

        # Format the output for better readability
        return _format_ocr_qa_result(result)
    except Exception as e:
        # UI boundary: report the failure in the output box rather than crash.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
# Build the Gradio interface: header text, an input column, an output column
# with the project logo, a hidden "about" accordion, and the event wiring.
# NOTE(review): the widget nesting below (what sits inside which Row/Column)
# is the most plausible reading of the layout — confirm against the deployed
# app before relying on it.
with gr.Blocks(title="OCR QA Demo") as demo:
    # Page header: title plus a short description of what the demo reports.
    gr.Markdown(
        """
# 🔍 Optical Character Recognition (OCR) Quality Assessment Demo
This demo evaluates OCR quality by comparing the unique words in a text against large reference vocabularies.
It reports:
- **potential OCR errors** (unrecognized unique tokens) and known tokens
- an overall **quality score** between 0.0 (poor) and 1.0 (perfect), defined as `score = known/(known + unrecognized)`
Try the German example below or paste your own OCR text.
"""
    )
    with gr.Row():
        # Input side: text box (pre-filled with EXAMPLE_TEXT), language
        # selector and the two buttons.
        with gr.Column():
            text_input = gr.Textbox(
                label="OCR input text",
                value=EXAMPLE_TEXT,
                lines=8,
                placeholder="Paste OCR text..."
            )
            # NOTE(review): only the codes in LANGUAGES are offered here.
            # process_ocr_qa maps the value "Auto-detect" to language=None,
            # but that value is not among the choices, so auto-detection
            # cannot be selected from this UI.
            lang_dropdown = gr.Dropdown(
                choices=LANGUAGES,
                value="de",
                label="Language of the input text"
            )
            submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
            info_btn = gr.Button("Help", size="md", scale=1)
        # Output side: the formatted assessment and the project logo.
        with gr.Column():
            # elem_id "ocr_output" is the hook targeted by the CSS rule that
            # switches this box to a monospaced font.
            output = gr.Textbox(
                label="OCR Quality Assessment",
                lines=15,
                placeholder="The quality assessment will be shown here...",
                scale=10,
                elem_id="ocr_output"
            )
            # Logo image linking to the Impresso project site (opens in a new tab).
            gr.HTML(
                """
<a href="https://impresso-project.ch" target="_blank">
<img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg"
alt="Impresso Project Logo"
style="height: 42px; display: block; margin: 5px auto; background-color: white;">
</a>
"""
            )
    # Info accordion with method details. It is created hidden
    # (visible=False) and collapsed (open=False); the Help button reveals it.
    # NOTE(review): the text below contains a typo ("all digits are replace by"
    # should read "replaced by") and states that the language is auto-detected
    # when not specified, which the dropdown above does not allow.
    with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
        gr.Markdown(
            """
This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
#### How it works:
- **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
- **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
- **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
- **Diagnostics output**:
- ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
- ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
- Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replace by the digit 0.
#### ⚠️ Limitations:
- The lists of known words are **not exhaustive**, particularly for **historical vocabulary**, **Luxembourgish**, or **named entities**.
- The method may fail to flag **short OCR artifacts** (e.g., 1–3 character noise).
As such, the score should be understood as a **heuristic indicator**, best used for:
- Comparative assessments between OCR outputs
- Filtering low-quality text from large corpora
- Supporting decisions in corpus preparation and annotation workflows
It is **not a substitute for manual inspection** or ground-truth evaluation.
"""
        )
    # Run the assessment on click: text and language in, formatted report out.
    submit_btn.click(
        fn=process_ocr_qa,
        inputs=[text_input, lang_dropdown],
        outputs=output
    )
    # Show and expand the info accordion when the Help button is clicked.
    # The handler always returns visible=True/open=True, so it only ever
    # opens the panel; it does not hide it again on a second click.
    info_btn.click(
        fn=lambda: gr.Accordion(visible=True, open=True),
        outputs=info_accordion
    )
# Serve on all network interfaces, port 7860.
# NOTE(review): whether `css` is accepted by launch() or must be passed to
# gr.Blocks() depends on the installed Gradio version — confirm.
demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS)