Spaces:

rishabhsetiya
/

CAIAssignmentGradio

Sleeping

File size: 5,642 Bytes

import os
import re
import json
import pickle
from typing import List, Dict

import numpy as np
import faiss
import pandas as pd
import tabula
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

# ---------------- Config ----------------
# Sentence-transformers model name used to embed every chunk.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Input PDF and the directory that receives every generated artifact.
PDF_PATH = "MakeMyTrip_Financial_Statements.pdf"
OUT_DIR = "data/index_merged"

# JSON dumps of the chunk lists: one per chunk size, plus the merged list.
CHUNKS_100_PATH = os.path.join(OUT_DIR, "chunks_100.json")
CHUNKS_400_PATH = os.path.join(OUT_DIR, "chunks_400.json")
CHUNKS_MERGED_PATH = os.path.join(OUT_DIR, "chunks_merged.json")

# Serialized retrieval indexes and the metadata saved alongside them.
FAISS_PATH = os.path.join(OUT_DIR, "faiss_merged.index")
BM25_PATH = os.path.join(OUT_DIR, "bm25_merged.pkl")
META_PATH = os.path.join(OUT_DIR, "meta_merged.pkl")

# ---------------- Utils ----------------
# Runs of ASCII letters/digits; matching is case-insensitive.
_tok_pat = re.compile(r"[a-z0-9]+", re.I)


def simple_tokenize(text: str):
    """Return the lowercase alphanumeric tokens found in *text*.

    Falsy input (``None`` or an empty string) yields an empty list.
    """
    if not text:
        return []
    lowered = text.lower()
    return _tok_pat.findall(lowered)

def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
    """Greedily pack consecutive texts into chunks of roughly ``max_tokens`` words.

    Each text is kept whole. Texts are appended to the current chunk, joined
    by single spaces, until adding the next one would push the chunk's word
    count (``\\w+`` matches) above ``max_tokens``; the chunk is then closed and
    a new one is started with that text.

    Args:
        texts: Texts to pack, in order.
        max_tokens: Word budget per chunk. A single text that is larger than
            the budget still becomes a chunk of its own.

    Returns:
        The list of chunk strings; empty when ``texts`` is empty.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_tokens = 0
    for text in texts:
        n_tokens = len(re.findall(r"\w+", text))
        # Flush only when something is buffered: otherwise a text that is
        # oversized on its own would first emit a spurious empty chunk.
        if current_chunk and current_tokens + n_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_tokens = [], 0
        current_chunk.append(text)
        current_tokens += n_tokens
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
    """Read the tables of a PDF with tabula and flatten them into row records.

    For every non-empty table, the first row is promoted to column headers
    when any of its cells starts with a ``20xx`` year. Each remaining row then
    yields one record, provided its first cell (the metric label) is not blank
    or "note" and at least one year-named column holds a usable value.

    Args:
        pdf_path: Path of the PDF to read.
        pages: Page selection forwarded to ``tabula.read_pdf``.

    Returns:
        A list of dicts with the keys ``id``, ``metric``, ``years``,
        ``values``, ``content`` and ``source``.
    """
    year_like = re.compile(r"20\d{2}")
    placeholders = ("-", "—", "nan")

    frames = tabula.read_pdf(
        pdf_path,
        pages=pages,
        multiple_tables=True,
        pandas_options={'dtype': str},
    )

    records = []
    next_id = 0

    for frame in frames:
        if frame.empty:
            continue

        # Flatten embedded line breaks and turn missing cells into "".
        frame = frame.replace(r'\n', ' ', regex=True).fillna("")

        # Promote the first row to headers when it looks like a row of years.
        first_row = list(frame.iloc[0])
        if any(year_like.match(str(cell)) for cell in first_row):
            frame.columns = [cell.strip() for cell in first_row]
            frame = frame.drop(0).reset_index(drop=True)

        for _, record in frame.iterrows():
            label = str(record.iloc[0]).strip()
            if label.lower() in ("note", ""):
                continue

            # Keep only year-named columns that carry a real value.
            by_year = {}
            for column, cell in record.items():
                key = str(column)
                if not year_like.match(key):
                    continue
                cleaned = str(cell).replace(",", "").strip()
                if cleaned and cleaned not in placeholders:
                    by_year[key] = cleaned

            if not by_year:
                continue

            records.append({
                "id": f"table-{next_id}",
                "metric": label,
                "years": list(by_year.keys()),
                "values": by_year,
                "content": f"{label} values: {json.dumps(by_year)}",
                "source": "table"
            })
            next_id += 1

    print(f"Extracted {len(records)} rows from PDF tables")
    return records

def build_dense_faiss(texts: List[str], out_path: str):
    """Embed *texts* and write a flat inner-product FAISS index to *out_path*.

    The texts are encoded with the ``EMBED_MODEL`` sentence-transformers model
    and L2-normalized before being added to an ``IndexFlatIP``, so the inner
    product between stored vectors equals their cosine similarity.

    Args:
        texts: Documents to embed; must be non-empty.
        out_path: Destination file for the serialized index.

    Raises:
        ValueError: If ``texts`` is empty. Without any embedding there is no
            dimension to build the index with.
    """
    # Fail early with a clear message instead of an obscure shape error
    # after the model has already been loaded.
    if not texts:
        raise ValueError("build_dense_faiss: no texts to index")

    print(f"Embedding {len(texts)} docs with {EMBED_MODEL} ...")
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(texts, convert_to_numpy=True, batch_size=64, show_progress_bar=True)
    # FAISS works on C-contiguous float32 matrices; make that explicit so the
    # in-place normalization below always acts on the array that is indexed.
    emb = np.ascontiguousarray(emb, dtype=np.float32)
    faiss.normalize_L2(emb)
    dim = emb.shape[1]

    index = faiss.IndexFlatIP(dim)
    index.add(emb)
    faiss.write_index(index, out_path)
    print(f"FAISS index built & saved -> {out_path}")

def build_bm25(texts: List[str], out_path: str):
    """Build a BM25Okapi index over *texts* and pickle it to *out_path*.

    The pickle holds a dict with the fitted model under ``"bm25"`` and the
    tokenized documents under ``"tokenized_corpus"``.

    Args:
        texts: Documents to index; must be non-empty.
        out_path: Destination file for the pickle.

    Raises:
        ValueError: If ``texts`` is empty. BM25 needs at least one document
            to compute its average document length.
    """
    # An empty corpus would otherwise surface as a division-by-zero inside
    # the BM25 constructor; report the real problem instead.
    if not texts:
        raise ValueError("build_bm25: no texts to index")

    tokenized = [simple_tokenize(t) for t in texts]
    bm25 = BM25Okapi(tokenized)
    with open(out_path, "wb") as f:
        pickle.dump({"bm25": bm25, "tokenized_corpus": tokenized}, f)
    print(f"BM25 index built & saved -> {out_path}")

# ---------------- Main ----------------
def main():
    """Run the whole pipeline: extract, chunk, save, and index.

    Table rows are extracted from ``PDF_PATH``, packed into chunks of 100 and
    of 400 words, written out as JSON, merged into one tagged list, and that
    merged list is indexed with both FAISS and BM25. All artifacts land in
    ``OUT_DIR``.
    """

    def _write_json(payload, path):
        # Pretty-printed UTF-8 JSON, leaving non-ASCII characters unescaped.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    os.makedirs(OUT_DIR, exist_ok=True)

    # 1) Table rows -> one text per row
    all_texts = [row["content"] for row in extract_tables_from_pdf(PDF_PATH, pages="all")]

    # 2) Two chunkings of the same texts
    chunks_100 = create_chunks(all_texts, 100)
    chunks_400 = create_chunks(all_texts, 400)

    # 3) Persist each chunking on its own
    _write_json(chunks_100, CHUNKS_100_PATH)
    _write_json(chunks_400, CHUNKS_400_PATH)
    print(f"Saved {len(chunks_100)} chunks_100 -> {CHUNKS_100_PATH}")
    print(f"Saved {len(chunks_400)} chunks_400 -> {CHUNKS_400_PATH}")

    # 4) Tag every chunk with its size and a size-prefixed id, 100s first
    merged = [
        {"id": f"100-{i}", "chunk_size": 100, "content": ch}
        for i, ch in enumerate(chunks_100)
    ]
    merged.extend(
        {"id": f"400-{i}", "chunk_size": 400, "content": ch}
        for i, ch in enumerate(chunks_400)
    )

    # 5) Persist the merged list
    _write_json(merged, CHUNKS_MERGED_PATH)
    print(f"Saved {len(merged)} merged chunks -> {CHUNKS_MERGED_PATH}")

    # 6) Dense and sparse indexes over the merged contents
    texts = [entry["content"] for entry in merged]
    build_dense_faiss(texts, FAISS_PATH)
    build_bm25(texts, BM25_PATH)

    # 7) Metadata, in the same order as the indexed texts
    with open(META_PATH, "wb") as f:
        pickle.dump(merged, f)
    print(f"Saved metadata -> {META_PATH}")

    print("\n✅ Done. Created 100 + 400 chunks separately and merged them for unified FAISS & BM25 indexes!")


if __name__ == "__main__":
    main()