|
|
|
|
|
""" |
|
|
core/utils.py |
|
|
Utility functions and shared data structures for the ESG ABSA framework. |
|
|
Includes text parsing, language detection, and safe plotting helpers. |
|
|
""" |
|
|
|
|
|
import regex as re |
|
|
import os |
|
|
from dataclasses import dataclass |
|
|
from typing import List |
|
|
import matplotlib |
|
|
matplotlib.use("Agg") |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Sentence:
    """A single sentence extracted from a report, with its provenance.

    Attributes:
        text: The sentence text, stripped of surrounding whitespace.
        idx: Running index of the sentence within the parsed document.
        section: Header of the section the sentence was found under
            ("General" when no header applies).
        section_type: Category label looked up for the section header,
            e.g. "Economic" or "Social".
        lang: Language code assigned to the sentence ('en' or 'id').
    """

    text: str
    idx: int
    section: str
    section_type: str
    lang: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Lookup from an upper-cased Indonesian section header to the category label
# attached to every sentence in that section. Headers that are not listed here
# are treated as "General" by the caller.
SECTION_MAP = {
    # "Challenges and responses to sustainability issues"
    "TANTANGAN DAN RESPONS TERHADAP ISU KEBERLANJUTAN": "General",
    # "Economic performance"
    "KINERJA EKONOMI": "Economic",
    # "Environmental performance"
    "KINERJA LINGKUNGAN": "Environmental",
    # "Social performance"
    "KINERJA SOSIAL": "Social",
    # "Governance"
    "TATA KELOLA": "Governance",
    # "Sustainability strategy"
    "STRATEGI KEBERLANJUTAN": "Strategy",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Function-word cues for each language. Compiled once at import time because
# detect_lang runs once per sentence when a document is parsed.
_ENGLISH_CUES = re.compile(r"\b(we|our|the|and|of|to|in|a|on)\b")
_INDONESIAN_CUES = re.compile(r"\b(kami|yang|untuk|dan|dengan|pada|di|ke)\b")


def detect_lang(s: str) -> str:
    """Guess whether a sentence is Indonesian ('id') or English ('en').

    The decision is a simple lexical heuristic on whole-word matches of a few
    common function words, compared case-insensitively:

    - any Indonesian cue present        -> 'id' (even if English cues also occur)
    - only English cues present         -> 'en'
    - no cue of either language present -> 'id' (the default)

    Args:
        s: The sentence text to classify.

    Returns:
        The language code 'id' or 'en'.
    """
    # Normalise once; both cue patterns are written in lower case.
    lowered = s.lower()

    # Indonesian cues take precedence over English ones in mixed sentences.
    if _INDONESIAN_CUES.search(lowered):
        return "id"
    if _ENGLISH_CUES.search(lowered):
        return "en"
    return "id"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_document(raw: str) -> List[Sentence]:
    """Parse raw ESG report text into a flat list of Sentence objects.

    The text is first cut into sections at lines starting with '## '. Text
    that is not under such a header is filed under the section "General".
    Each section body is then cut into sentence-like fragments, and every
    fragment of at least 4 characters becomes a Sentence tagged with:

    - the section header ("General" if there is none),
    - the section type looked up in SECTION_MAP by upper-cased header
      ("General" if the header is not listed),
    - the language reported by detect_lang ('en' or 'id').

    Fragments are delimited by:

    - whitespace that follows '.', '?' or '!' and precedes an upper-case
      ASCII letter or a character in the range À-ÿ,
    - a newline, a bullet '•' or a semicolon,
    - a hyphen that is NOT between two word characters (list dashes such as
      "- item" or "a - b"). Hyphens inside words ("long-term", "anak-anak")
      are kept so that hyphenated and reduplicated words are not torn apart.

    Args:
        raw: The full report text. None or an empty string yields no sentences.

    Returns:
        Sentences in document order, with idx numbered consecutively from 0.
    """
    if not raw:
        return []

    # Built once, outside the per-section loop.
    fragment_splitter = re.compile(
        r"(?<=[.?!])\s+(?=[A-ZÀ-ÿ])"  # sentence boundary
        r"|[\n•;]"                    # line break, bullet, semicolon
        r"|(?<!\w)-|-(?!\w)"          # dash used as a separator, not intra-word
    )

    # Each block starts at a '## ' header line, except possibly the first one
    # (any text that precedes the first header).
    blocks = re.split(r"\n(?=## )", raw.strip()) if "## " in raw else [raw]

    sentences = []
    sid = 0

    for block in blocks:
        # Default for header-less blocks; overridden when the block opens
        # with a '## <title>' line.
        header, body = "General", block
        m = re.match(r"## ([^\n]+)\n?(.*)", block, flags=re.S)
        if m:
            header = m.group(1).strip()
            body = m.group(2).strip()

        section_type = SECTION_MAP.get(header.upper(), "General")

        for part in fragment_splitter.split(body):
            text = part.strip()
            # Skip empty pieces and stray fragments left over from splitting.
            if len(text) < 4:
                continue
            sentences.append(
                Sentence(text, sid, header, section_type, detect_lang(text))
            )
            sid += 1

    return sentences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def safe_plot(plot_fn, title: str):
    """Build a matplotlib figure, falling back to an error placeholder.

    A 6x4 inch figure with a single axis is created and handed to plot_fn.
    If anything raises while the figure is being built, the partially drawn
    figure is closed (so it does not stay registered with pyplot) and a small
    4x2 inch placeholder figure showing the error message is returned instead.

    Args:
        plot_fn (callable): A function that takes one argument (the matplotlib
            axis) and draws on it.
        title (str): The title set on the axis passed to plot_fn.

    Returns:
        matplotlib.figure.Figure: The finished figure, or the placeholder
        figure carrying "Plot error: <exception>" on failure.
    """
    fig = None
    try:
        fig = plt.figure(figsize=(6, 4))
        ax = fig.add_subplot(111)
        plot_fn(ax)
        # Work on the explicit objects rather than pyplot's "current"
        # figure/axes, which plot_fn may have changed.
        ax.set_title(title)
        fig.tight_layout()
        return fig
    except Exception as e:
        # Deliberate catch-all: this helper must always return a figure.
        if fig is not None:
            # Release the half-built figure; otherwise pyplot keeps it alive.
            plt.close(fig)
        err_fig = plt.figure(figsize=(4, 2))
        err_ax = err_fig.add_subplot(111)
        err_ax.text(
            0.5,
            0.5,
            f"Plot error: {e}",
            ha="center",
            va="center",
            transform=err_ax.transAxes,  # centre regardless of data limits
        )
        err_ax.axis("off")
        return err_fig