Spaces:
Paused
Paused
| """Convert any document to Markdown.""" | |
| import re | |
| from copy import deepcopy | |
| from pathlib import Path | |
| from typing import Any | |
| import mdformat | |
| import numpy as np | |
| from pdftext.extraction import dictionary_output | |
| from sklearn.cluster import KMeans | |
def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0915
    """Convert a PDF parsed with pdftext to Markdown.

    The pages are processed in five steps: heading levels are inferred from the font sizes,
    bold and italic emphasis is inferred from the fonts, lines that only contain a page number
    are dropped, the pages are rendered as Markdown, and finally headings that were split
    across several lines are merged. The input is not modified.

    Parameters
    ----------
    pages
        The parsed pages. Every page is read through its "blocks" list, every block through its
        "lines" list, and every line through its "spans" list. Spans are read through their
        "text", "rotation", "bbox", and "font" ("size", "name", "weight") entries.

    Returns
    -------
    list[str]
        One Markdown string per page, in the order of the input pages.
    """

    def add_heading_level_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:  # noqa: C901
        """Add heading level metadata to a PDF parsed with pdftext.

        Every span and line receives an ``md["heading_level"]`` entry: 1-6 for <h1>-<h6>, 7 for
        paragraph text, and 8 for text that is smaller than paragraph text. A line gets the
        level that covers most of its characters. Returns an annotated deep copy of the pages.
        """
        max_heading_levels = 6
        tiny_font_size = 5

        def extract_font_size(span: dict[str, Any]) -> float:
            """Extract the font size from a text span."""
            font_size: float = 1.0
            if span["font"]["size"] > 1:  # A value of 1 appears to mean "unknown" in pdftext.
                font_size = span["font"]["size"]
            elif digit_sequences := re.findall(r"\d+", span["font"]["name"] or ""):
                # Fall back to the last number in the font name.
                font_size = float(digit_sequences[-1])
            elif "\n" not in span["text"]:  # Occasionally a span can contain a newline character.
                # Fall back to the extent of the bounding box across the writing direction.
                rotation = round(span["rotation"])
                if rotation in (0.0, 180.0, -180.0):
                    font_size = span["bbox"][3] - span["bbox"][1]
                elif rotation in (90.0, -90.0, 270.0, -270.0):
                    font_size = span["bbox"][2] - span["bbox"][0]
            return font_size

        def quantize_font_size(font_size: Any) -> Any:
            """Round a font size (scalar or array) to the nearest half point."""
            return np.round(font_size * 2) / 2

        # Copy the pages.
        pages = deepcopy(pages)
        # Extract an array of all (quantized) font sizes used by the text spans.
        font_sizes = quantize_font_size(
            np.asarray(
                [
                    extract_font_size(span)
                    for page in pages
                    for block in page["blocks"]
                    for line in block["lines"]
                    for span in line["spans"]
                ],
                dtype=float,
            )
        )
        # Without any span there is nothing to measure: the defaults below are never compared
        # against a span, they only keep the annotation loop well-defined.
        mode_font_size = 1.0
        heading_font_sizes = np.empty(0)
        if font_sizes.size > 0:
            unique_font_sizes, counts = np.unique(font_sizes, return_counts=True)
            # Determine the paragraph font size as the mode font size. Tiny font sizes are
            # temporarily given negative counts so that they can't win the vote.
            tiny = unique_font_sizes < min(tiny_font_size, np.max(unique_font_sizes))
            counts[tiny] = -counts[tiny]
            mode = np.argmax(counts)
            counts[tiny] = -counts[tiny]
            mode_font_size = unique_font_sizes[mode]
            # Determine (at most) 6 heading font sizes from the font sizes larger than the mode.
            heading_font_sizes = unique_font_sizes[mode + 1 :]
            if len(heading_font_sizes) > max_heading_levels:
                # More sizes than heading levels: cluster them, weighted by how often they occur.
                heading_counts = counts[mode + 1 :]
                kmeans = KMeans(n_clusters=max_heading_levels, random_state=42)
                kmeans.fit(heading_font_sizes[:, np.newaxis], sample_weight=heading_counts)
                heading_font_sizes = np.ravel(kmeans.cluster_centers_)
            # With at most 6 distinct sizes every size is its own cluster, so no clustering is
            # needed. Sort from large to small so that index 0 corresponds to <h1>.
            heading_font_sizes = np.sort(heading_font_sizes)[::-1]
        # Add heading level information to the text spans and lines.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    heading_level = np.zeros(8)  # 0-5: <h1>-<h6>, 6: <p>, 7: <small>
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        # Quantize exactly like the sizes the mode was computed from. Comparing
                        # the raw size would classify e.g. 10.9pt body text as smaller than an
                        # 11.0pt mode, and 10.1pt body text as a heading above a 10.0pt mode.
                        span_font_size = quantize_font_size(extract_font_size(span))
                        if span_font_size < mode_font_size:
                            idx = 7
                        elif span_font_size == mode_font_size:
                            idx = 6
                        else:
                            idx = int(np.argmin(np.abs(heading_font_sizes - span_font_size)))
                        span["md"]["heading_level"] = idx + 1
                        # Vote for the line's level with the number of characters in the span.
                        heading_level[idx] += len(span["text"])
                    line["md"]["heading_level"] = int(np.argmax(heading_level)) + 1
        return pages

    def add_emphasis_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Add emphasis metadata such as bold and italic to a PDF parsed with pdftext.

        Every span and line receives ``md["bold"]`` and ``md["italic"]`` booleans. A line is
        bold (italic) when all of its non-whitespace spans are. Returns an annotated deep copy.
        """
        # Copy the pages.
        pages = deepcopy(pages)
        # Add emphasis metadata to the text spans.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        span["md"]["bold"] = span["font"]["weight"] > 500  # noqa: PLR2004
                        span["md"]["italic"] = "ital" in (span["font"]["name"] or "").lower()
                    # Whitespace-only spans don't get a vote in the line's emphasis.
                    visible_spans = [span for span in line["spans"] if span["text"].strip()]
                    line["md"]["bold"] = all(span["md"]["bold"] for span in visible_spans)
                    line["md"]["italic"] = all(span["md"]["italic"] for span in visible_spans)
        return pages

    def strip_page_numbers(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Strip page numbers from a PDF parsed with pdftext.

        Returns a deep copy of the pages without the lines whose text is only a number,
        optionally prefixed with '#' or '0' characters.
        """
        # Copy the pages.
        pages = deepcopy(pages)
        # Remove lines that only contain a page number.
        for page in pages:
            for block in page["blocks"]:
                block["lines"] = [
                    line
                    for line in block["lines"]
                    if not re.match(
                        r"^\s*[#0]*\d+\s*$", "".join(span["text"] for span in line["spans"])
                    )
                ]
        return pages

    def convert_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0912
        """Convert a list of pages to Markdown.

        Expects the heading level and emphasis metadata to be present on every line and span.
        Lines are separated by a newline, blocks by a blank line.
        """

        def emphasize(text: str, marker: str) -> str:
            """Wrap the text in an emphasis marker, keeping surrounding whitespace outside.

            Markdown only recognises emphasis when the markers touch the text, so '**Bold **'
            would be rendered literally. Whitespace-only text is returned unchanged.
            """
            core = text.strip()
            if not core:
                return text
            start = len(text) - len(text.lstrip())
            end = start + len(core)
            return f"{text[:start]}{marker}{core}{marker}{text[end:]}"

        pages_md = []
        for page in pages:
            blocks_md = []
            for block in page["blocks"]:
                lines_md = []
                for line in block["lines"]:
                    line_bold = line["md"]["bold"]
                    line_italic = line["md"]["italic"]
                    # Build the line text and style the spans. A span is only styled with the
                    # emphasis that the line as a whole doesn't already have.
                    styled_spans = []
                    for span in line["spans"]:
                        span_bold = span["md"]["bold"]
                        span_italic = span["md"]["italic"]
                        if not line_bold and not line_italic and span_bold and span_italic:
                            styled_spans.append(emphasize(span["text"], "***"))
                        elif not line_bold and span_bold:
                            styled_spans.append(emphasize(span["text"], "**"))
                        elif not line_italic and span_italic:
                            styled_spans.append(emphasize(span["text"], "*"))
                        else:
                            styled_spans.append(span["text"])
                    line_text = "".join(styled_spans).rstrip()
                    line_is_whitespace = not line_text.strip()
                    line_is_heading = line["md"]["heading_level"] <= 6  # noqa: PLR2004
                    # Add emphasis to the line (if it's not a heading or whitespace).
                    if not line_is_heading and not line_is_whitespace:
                        if line_bold and line_italic:
                            line_text = emphasize(line_text, "***")
                        elif line_bold:
                            line_text = emphasize(line_text, "**")
                        elif line_italic:
                            line_text = emphasize(line_text, "*")
                    # Set the heading level.
                    if line_is_heading and not line_is_whitespace:
                        line_text = f"{'#' * line['md']['heading_level']} {line_text}"
                    lines_md.append(line_text + "\n")
                blocks_md.append("".join(lines_md).rstrip() + "\n\n")
            pages_md.append("".join(blocks_md).strip())
        return pages_md

    def merge_split_headings(pages: list[str]) -> list[str]:
        """Merge headings that are split across lines.

        Consecutive ATX headings of the same level, optionally separated by blank lines, are
        joined into a single heading of that level followed by a blank line.
        """

        def _merge_split_headings(match: re.Match[str]) -> str:
            # Only remove the leading '#' marker so that text such as "C#" stays intact, and
            # skip the blank lines between the headings so that no double spaces are joined in.
            atx_headings = [line.lstrip("#").strip() for line in match.group().splitlines()]
            heading_text = " ".join(heading for heading in atx_headings if heading)
            return f"{match.group(1)} {heading_text}\n\n"

        pages_md = [
            re.sub(
                r"^(#+)[ \t]+[^\n]+\n+(?:^\1[ \t]+[^\n]+\n+)+",
                _merge_split_headings,
                page,
                flags=re.MULTILINE,
            )
            for page in pages
        ]
        return pages_md

    # Add heading level metadata.
    pages = add_heading_level_metadata(pages)
    # Add emphasis metadata.
    pages = add_emphasis_metadata(pages)
    # Strip page numbers.
    pages = strip_page_numbers(pages)
    # Convert the pages to Markdown.
    pages_md = convert_to_markdown(pages)
    # Merge headings that are split across lines.
    pages_md = merge_split_headings(pages_md)
    return pages_md
def document_to_markdown(doc_path: Path) -> str:
    """Convert any document to GitHub Flavored Markdown.

    PDF files are parsed with pdftext and converted with `parsed_pdf_to_markdown`. All other
    files are converted with pandoc, falling back to the file's raw text when pandoc raises a
    `RuntimeError`. The result is normalised with mdformat.

    Parameters
    ----------
    doc_path
        The path to the document to convert.

    Returns
    -------
    str
        The document as Markdown.

    Raises
    ------
    ImportError
        If a non-PDF document is given and `pypandoc` can't be imported.
    """
    # Convert the file's content to GitHub Flavored Markdown.
    # Compare the suffix case-insensitively so that e.g. "scan.PDF" is also treated as a PDF
    # instead of being sent to pandoc and then read as plain text.
    if doc_path.suffix.lower() == ".pdf":
        # Parse the PDF with pdftext and convert it to Markdown.
        pages = dictionary_output(doc_path, sort=True, keep_chars=False)
        doc = "\n\n".join(parsed_pdf_to_markdown(pages))
    else:
        # Use pandoc for everything else.
        try:
            import pypandoc
        except ImportError as error:
            error_message = (
                "To convert files to Markdown with pandoc, please install the `pandoc` extra."
            )
            raise ImportError(error_message) from error
        try:
            doc = pypandoc.convert_file(doc_path, to="gfm")
        except RuntimeError:
            # File format not supported, fall back to reading the text.
            doc = doc_path.read_text()
    # Improve Markdown quality.
    doc = mdformat.text(doc)
    return doc