"""OpenAlex search tool - citation-aware scholarly search."""
import re
from typing import Any
import httpx
from tenacity import retry, stop_after_attempt, wait_exponential
from src.utils.exceptions import SearchError
from src.utils.models import Citation, Evidence


class OpenAlexTool:
    """
    Search OpenAlex for scholarly works with citation metrics.

    OpenAlex indexes 209M+ works and provides:
    - Citation counts (prioritize influential papers)
    - Concept tagging (hierarchical classification)
    - Open access links (direct PDF URLs)
    - Related works (ML-powered similarity)

    API Docs: https://docs.openalex.org
    Rate Limits: polite pool with mailto = 100k requests/day
    """

    BASE_URL = "https://api.openalex.org/works"
    POLITE_EMAIL = "deepboner-research@proton.me"
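
    # Illustrative request shape (a sketch of what search() below sends; the
    # placeholder values are not captured traffic):
    #   GET https://api.openalex.org/works?search=<query>
    #       &filter=type:article,has_abstract:true
    #       &sort=cited_by_count:desc&per-page=<n>&mailto=deepboner-research@proton.me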

    @property
    def name(self) -> str:
        return "openalex"

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        reraise=True,
    )
    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
        """
        Search OpenAlex, sorted by citation count.

        Args:
            query: Search terms
            max_results: Maximum results to return

        Returns:
            List of Evidence objects with citation metadata
        """
        params: dict[str, str | int] = {
            "search": query,
            "filter": "type:article,has_abstract:true",  # only articles with abstracts
            "sort": "cited_by_count:desc",  # most cited first
            "per-page": min(max_results, 100),  # OpenAlex caps page size at 100
            "mailto": self.POLITE_EMAIL,  # joins the polite pool
        }
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                response = await client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()
                works = data.get("results", [])
                return [self._to_evidence(work) for work in works[:max_results]]
            except httpx.HTTPStatusError as e:
                raise SearchError(f"OpenAlex API error: {e}") from e
            except httpx.RequestError as e:
                raise SearchError(f"OpenAlex connection failed: {e}") from e

    def _to_evidence(self, work: dict[str, Any]) -> Evidence:
        """Convert an OpenAlex work to Evidence with rich metadata."""
        # Extract basic fields
        title = work.get("display_name", "Untitled")
        doi = work.get("doi", "")
        year = work.get("publication_year", "Unknown")
        cited_by_count = work.get("cited_by_count", 0)

        # Reconstruct abstract from the inverted index
        abstract = self._reconstruct_abstract(work.get("abstract_inverted_index"))
        if not abstract:
            # Should be excluded by filter=has_abstract:true, but handle it defensively
            abstract = f"[No abstract available. Cited by {cited_by_count} works.]"

        # Extract authors (limit to 5)
        authors = self._extract_authors(work.get("authorships", []))

        # Extract concepts (top 5 by score)
        concepts = self._extract_concepts(work.get("concepts", []))

        # Open access info
        oa_info = work.get("open_access", {})
        is_oa = oa_info.get("is_oa", False)

        # Get PDF URL (prefer best_oa_location; the field may be null)
        best_oa = work.get("best_oa_location", {})
        pdf_url = best_oa.get("pdf_url") if best_oa else None

        # Build URL (OpenAlex DOIs are usually already full https://doi.org/... URLs)
        if doi:
            url = doi if doi.startswith("http") else f"https://doi.org/{doi}"
        else:
            openalex_id = work.get("id", "")
            url = openalex_id if openalex_id else "https://openalex.org"

        # Extract PMID from the ids object for deduplication
        ids_obj = work.get("ids", {})
        pmid_url = ids_obj.get("pmid")  # e.g. "https://pubmed.ncbi.nlm.nih.gov/29456894"
        pmid = None
        if pmid_url and isinstance(pmid_url, str) and "pubmed.ncbi.nlm.nih.gov" in pmid_url:
            # Extract the numeric PMID from the URL
            pmid_match = re.search(r"/(\d+)/?$", pmid_url)
            if pmid_match:
                pmid = pmid_match.group(1)
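        # e.g. ".../29456894" or ".../29456894/" both yield pmid == "29456894"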

        # Prepend a citation badge to the content
        citation_badge = f"[Cited by {cited_by_count}] " if cited_by_count > 0 else ""
        content = f"{citation_badge}{abstract[:1900]}"

        # Relevance: normalized citation count, capped at 1.0 for 100+ citations;
        # 100 citations is a very strong signal in most fields.
        relevance = min(1.0, cited_by_count / 100.0)
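        # Worked example: 30 citations -> 0.3; 250 citations -> capped at 1.0.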

        return Evidence(
            content=content[:2000],
            citation=Citation(
                source="openalex",
                title=title[:500],
                url=url,
                date=str(year),
                authors=authors,
            ),
            relevance=relevance,
            metadata={
                "cited_by_count": cited_by_count,
                "concepts": concepts,
                "is_open_access": is_oa,
                "pdf_url": pdf_url,
                "pmid": pmid,  # stored for evidence deduplication
            },
        )

    def _reconstruct_abstract(self, inverted_index: dict[str, list[int]] | None) -> str:
        """Rebuild the abstract from OpenAlex's {"word": [positions]} inverted index."""
        if not inverted_index:
            return ""
        # Invert the mapping: position -> word
        position_word: dict[int, str] = {}
        for word, positions in inverted_index.items():
            for pos in positions:
                position_word[pos] = word
        if not position_word:
            return ""
        # Emit words in positional order; gaps become empty strings
        max_pos = max(position_word.keys())
        return " ".join(position_word.get(i, "") for i in range(max_pos + 1))

    def _extract_authors(self, authorships: list[dict[str, Any]]) -> list[str]:
        """Extract author names from the authorships array."""
        authors: list[str] = []
        for authorship in authorships[:5]:
            author = authorship.get("author", {})
            name = author.get("display_name")
            if name:
                authors.append(name)
        return authors

    def _extract_concepts(self, concepts: list[dict[str, Any]]) -> list[str]:
        """Extract concept names, sorted by score."""
        sorted_concepts = sorted(concepts, key=lambda c: c.get("score", 0), reverse=True)
        return [c.get("display_name", "") for c in sorted_concepts[:5] if c.get("display_name")]
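

# A minimal smoke-test sketch (not part of the original module): exercises the
# tool against the live OpenAlex API. The query string is an arbitrary example,
# and attribute access on Evidence/Citation assumes the src.utils.models types
# expose the fields constructed above.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        tool = OpenAlexTool()
        results = await tool.search("glucagon-like peptide-1 receptor agonists", max_results=3)
        for ev in results:
            print(f"{ev.citation.title} ({ev.citation.date}) relevance={ev.relevance:.2f}")

    asyncio.run(_demo())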