Spaces:

impresso-project
/

ocrqa-demo

Running

App Files Files Community

ocrqa-demo / app.py

simon-clmtd

fix explanation for v2 treatment.

4083407 verified 7 days ago

raw

history blame contribute delete

7.88 kB

	import os

	# Redirect cache to a writable path inside container
	os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

	import gradio as gr
	from impresso_pipelines.ocrqa import OCRQAPipeline

	# Single OCR QA pipeline instance, created once at import time and called by
	# process_ocr_qa for every assessment.
	pipeline = OCRQAPipeline()

	# Custom stylesheet handed to demo.launch(): renders the textarea of the
	# element with elem_id "ocr_output" (the result textbox) in a monospace font.
	CSS = """
	#ocr_output textarea {
	font-family: monospace !important;
	}
	"""


	# Language codes offered in the language dropdown.
	# NOTE(review): process_ocr_qa maps the choice "Auto-detect" to language=None,
	# but that value is not listed here, so the UI cannot select it — confirm
	# whether this is intentional.
	LANGUAGES = ["en", "de", "fr","lb"]

	# Example OCR text (German text with typical OCR errors).
	# Used as the initial value of the input textbox.
	EXAMPLE_TEXT = """Vieles Seltsame geschieht auf Erden :
	Nichts Seltsameres sieht der Mond
	Als das Glück, das im Knopfloch wohnt.
	Zaubrisch faßt es den ernsten Mann.
	Ohne nach Weib u. Kinjd zu fragen
	Reitet er aus, nach dem Glück zu jagen,
	Nur nacb ihm war stets sein Vegehr.
	Aber neben ihm 1reitet der Dämon her
	Des Ehrgeizes mit finsterer Tücke,
	Und so jagt er zuletzt auf die Brücke,
	Die über dem Abgrund, d:m nächtlich schwarzen
	Jählings abbricht."""

	def process_ocr_qa(text, lang_choice):
	    """Run the OCR QA pipeline on ``text`` and format the result for display.

	    Args:
	        text: OCR text to assess; passed unchanged to the module-level
	            ``pipeline``.
	        lang_choice: Language selected in the UI. The literal
	            ``"Auto-detect"`` is translated to ``language=None``; any other
	            value is forwarded as-is.

	    Returns:
	        A display string. For a dict result it contains, separated by blank
	        lines and each only when available: the language, the quality score
	        with its known/total ratio, the known tokens, the unrecognized tokens
	        (or an "all known" message), the model id, and any remaining
	        top-level fields. For a non-dict result the value is shown verbatim.
	        If anything raises, the exception is printed and ``"Error: <exc>"``
	        is returned instead of propagating.
	    """
	    try:
	        lang = None if lang_choice == "Auto-detect" else lang_choice
	        result = pipeline(text, language=lang, diagnostics=True)

	        # Guard clause: anything that is not a dict is shown as-is.
	        if not isinstance(result, dict):
	            return f"✨ Processed Result:\n{result}"

	        output_lines = []

	        # Language detection
	        if 'language' in result:
	            output_lines.append(f"🌍 Language: {result['language']}")

	        score = result.get("score")
	        # The `or` fallbacks also cover keys that are present but hold None,
	        # which would otherwise break the .get()/len() calls below.
	        diagnostics = result.get("diagnostics") or {}
	        known_tokens = diagnostics.get("known_tokens") or []
	        unknown_tokens = diagnostics.get("unknown_tokens") or []

	        # Counts are computed unconditionally: the diagnostics section needs
	        # them even when the result carries no score (previously they were
	        # only bound inside the score branch, causing an UnboundLocalError).
	        k = len(known_tokens)
	        u = len(unknown_tokens)
	        total = k + u

	        # Quality score (with ratio)
	        if score is not None:
	            ratio = f"{k}/{total}" if total > 0 else "n/a"
	            score_emoji = "🟢" if score >= 0.8 else "🟡" if score >= 0.5 else "🔴"
	            output_lines.append(
	                f"{score_emoji} Quality Score: {score if total else 'n/a'} ({ratio})"
	            )

	        # Diagnostics section
	        if diagnostics:
	            # Known tokens
	            if known_tokens:
	                output_lines.append(f"✅ Known unique tokens ({k}):\n{' '.join(known_tokens)}")

	            # Unknown tokens (potential OCR errors)
	            if unknown_tokens:
	                output_lines.append(f"❌ Unrecognized unique tokens ({u}):\n{' '.join(unknown_tokens)}")
	            elif 'unknown_tokens' in diagnostics and total > 0:
	                # The key was reported but is empty while known tokens exist.
	                output_lines.append("✨ All tokens were known – no OCR errors detected.")

	            # Model information
	            if 'model_id' in diagnostics:
	                output_lines.append(f"🤖 Model: {diagnostics['model_id']}")

	        # Other fields: show every top-level key not already handled above.
	        for key, value in result.items():
	            if key not in ['language', 'score', 'diagnostics']:
	                output_lines.append(f"🔍 {key.replace('_', ' ').title()}: {value}")

	        return "\n\n".join(output_lines)

	    except Exception as e:
	        # UI boundary: report the failure in the output box instead of raising.
	        print("❌ Pipeline error:", e)
	        return f"Error: {e}"

	# Build the Gradio interface: intro text, an input column, an output column
	# with the project logo, and a help accordion that starts hidden.
	# NOTE(review): the block nesting below is reconstructed because the
	# indentation was lost in this copy of the file — confirm the placement of
	# the buttons (first column) and of the logo (second column) against the
	# deployed layout.
	with gr.Blocks(title="OCR QA Demo") as demo:

	    gr.Markdown(
	        """
	# 🔍 Optical Character Recognition (OCR) Quality Assessment Demo

	This demo evaluates OCR quality by comparing the unique words in a text against large reference vocabularies.

	It reports:
	- potential OCR errors (unrecognized unique tokens) and known tokens
	- an overall quality score between 0.0 (poor) and 1.0 (perfect), defined as `score = known/(known + unrecognized)`


	Try the German example below or paste your own OCR text.
	"""
	    )

	    with gr.Row():
	        # Left column: text input, language selection and action buttons.
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="OCR input text",
	                value=EXAMPLE_TEXT,
	                lines=8,
	                placeholder="Paste OCR text..."
	            )
	            lang_dropdown = gr.Dropdown(
	                choices=LANGUAGES,
	                value="de",
	                label="Language of the input text"
	            )
	            submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
	            info_btn = gr.Button("Help", size="md", scale=1)

	        # Right column: formatted assessment; elem_id ties it to the CSS rule.
	        with gr.Column():
	            output = gr.Textbox(
	                label="OCR Quality Assessment",
	                lines=15,
	                placeholder="The quality assessment will be shown here...",
	                scale=10,
	                elem_id="ocr_output"

	            )
	            gr.HTML(
	                """
	<a href="https://impresso-project.ch" target="_blank">
	<img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg"
	alt="Impresso Project Logo"
	style="height: 42px; display: block; margin: 5px auto; background-color: white;">
	</a>
	"""
	            )

	    # Info accordion with pipeline details; created invisible and collapsed,
	    # and revealed by the Help button handler below.
	    with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
	        gr.Markdown(
	            """
	This pipeline estimates OCR quality by analyzing the proportion of unique words in a text that match curated wordlists for a given language.

	#### How it works:
	- Scoring: The quality score ranges from 0.0 (poor) to 1.0 (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
	- Lexical resources: Words are matched against precompiled lists derived from Wikipedia and Wortschatz Leipzig, using Bloom filters for fast, memory-efficient lookup.
	- Multilingual support: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
	- Diagnostics output:
	- ✅ Known tokens: Words found in the reference wordlist, presumed correctly OCR’d.
	- ❌ Unrecognized tokens: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
	- Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replaced by the digit 0.

	#### ⚠️ Limitations:
	- The lists of known words are not exhaustive, particularly for historical vocabulary, Luxembourgish, or named entities.
	- The method may fail to flag short OCR artifacts (e.g., 1–3 character noise).

	As such, the score should be understood as a heuristic indicator, best used for:
	- Comparative assessments between OCR outputs
	- Filtering low-quality text from large corpora
	- Supporting decisions in corpus preparation and annotation workflows

	It is not a substitute for manual inspection or ground-truth evaluation.
	"""
	        )

	    # Run the assessment on the current text and language selection.
	    submit_btn.click(
	        fn=process_ocr_qa,
	        inputs=[text_input, lang_dropdown],
	        outputs=output
	    )

	    # Reveal and expand the info accordion when the Help button is clicked.
	    # This handler only shows the accordion; it never hides it again.
	    info_btn.click(
	        fn=lambda: gr.Accordion(visible=True, open=True),
	        outputs=info_accordion
	    )

	# Serve on all interfaces at port 7860 with the custom stylesheet.
	# NOTE(review): passing css to launch() needs a Gradio release that accepts it
	# there; earlier releases take css on gr.Blocks(...) — confirm the pinned version.
	demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS)