# narrative-summarizer/summarizer.py
from transformers import pipeline  # Hugging Face summarization pipeline
import chardet  # encoding detection for robust file reading

class NarrativeSummarizer:
    def __init__(self, model_name="facebook/bart-large-cnn", chunk_size=200):
        # chunk_size is the approximate word budget per chunk. The default of
        # 200 keeps chunks well under BART's 1024-token input limit.
        self.model_name = model_name
        self.chunk_size = chunk_size
        self.summarizer = pipeline("summarization", model=self.model_name)

    def chunk_text_token_based(self, text):
        # Approximate token-based chunking via a whitespace split; a real
        # tokenizer gives more accurate counts (see the sketch below).
        words = text.split()
        chunks = []
        current_chunk = []
        for word in words:
            current_chunk.append(word)
            if len(current_chunk) >= self.chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        return chunks
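
    # A minimal sketch of true token-based chunking, not part of the original
    # file: it counts real subword tokens with the model's own tokenizer, so
    # chunk sizes respect the model's 1024-token input limit more reliably
    # than word counting. In practice the tokenizer would be cached in
    # __init__ rather than reloaded on every call.
    def chunk_text_with_tokenizer(self, text, max_tokens=200):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Encode once without special tokens, then slice into fixed windows.
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        chunks = []
        for start in range(0, len(token_ids), max_tokens):
            window = token_ids[start:start + max_tokens]
            chunks.append(tokenizer.decode(window, skip_special_tokens=True))
        return chunks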
    def apply_custom_prompt(self, chunk, prompt_type):
        if prompt_type == "Bread Only":
            prompt = f"Transform the provided fictional narrative into a maximally compressed yet losslessly decompressible format optimized for LLM reconstruction. {chunk}"
        elif prompt_type == "Butter Only":
            prompt = f"Solid foundation, but let's refine the granularity. Your 4-subpoint structure creates artificial symmetry where organic complexity should flourish. {chunk}"
        elif prompt_type == "Bread and Butter":
            prompt = f"Transform the provided fictional narrative into a maximally compressed format. Then refine granularity for organic complexity. {chunk}"
        else:
            prompt = chunk
        return prompt
    def summarize_chunk(self, chunk, prompt_type):
        prompt = self.apply_custom_prompt(chunk, prompt_type)
        # max_length/min_length bound the generated summary, not the input.
        summary = self.summarizer(prompt, max_length=150, min_length=50, do_sample=False)
        return summary[0]['summary_text']
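
    # Optional variant, not in the original file: scale the summary-length
    # bounds to the input so short trailing chunks don't request a summary
    # longer than the text they compress (which triggers pipeline warnings).
    # The half-length target and clamps are one possible policy, a sketch only.
    def summarize_chunk_scaled(self, chunk, prompt_type):
        prompt = self.apply_custom_prompt(chunk, prompt_type)
        n_words = len(prompt.split())
        # Aim for roughly half the input length, clamped to sane bounds.
        max_len = max(20, min(150, n_words // 2))
        min_len = min(50, max(5, max_len // 3))
        summary = self.summarizer(prompt, max_length=max_len, min_length=min_len, do_sample=False)
        return summary[0]['summary_text']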
    def process_file(self, file_path, prompt_type, iterations=1):
        # Read the file robustly, detecting its encoding with chardet.
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read()
            detected = chardet.detect(raw_data)
            encoding = detected['encoding'] or 'utf-8'
            text = raw_data.decode(encoding, errors='replace')
        except Exception as e:
            raise RuntimeError(f"Unable to read the file: {e}") from e

        # Chunk the text token-wise and condense each chunk.
        chunks = self.chunk_text_token_based(text)
        condensed_chunks = []
        for chunk in chunks:
            temp_chunk = chunk
            for _ in range(iterations):
                # summarize_chunk already applies the custom prompt, so
                # applying it again here would duplicate the instruction text.
                temp_chunk = self.summarize_chunk(temp_chunk, prompt_type)
            condensed_chunks.append(temp_chunk)

        # Second-pass summarization for global compression: only needed when
        # more than one chunk summary has to be merged.
        combined = " ".join(condensed_chunks)
        if len(condensed_chunks) > 1:
            final_summary = self.summarize_chunk(combined, prompt_type)
        else:
            final_summary = combined
        return final_summary
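

# A minimal usage sketch, not part of the original file; "story.txt" is a
# hypothetical placeholder path and the prompt type is one of the options
# defined in apply_custom_prompt above.
if __name__ == "__main__":
    summarizer = NarrativeSummarizer()
    result = summarizer.process_file("story.txt", prompt_type="Bread and Butter", iterations=1)
    print(result)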