Spaces:
Sleeping
Sleeping
File size: 3,206 Bytes
16ea1b0 22ddfaa 16ea1b0 22ddfaa 16ea1b0 22ddfaa 16ea1b0 22ddfaa 16ea1b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
from transformers import pipeline
import chardet
class NarrativeSummarizer:
def __init__(self, model_name="facebook/bart-large-cnn", chunk_size=1000):
self.model_name = model_name
self.chunk_size = chunk_size
self.summarizer = pipeline("summarization", model=self.model_name)
def chunk_text_token_based(self, text):
# Token-based chunking approximation based on whitespace split (could be improved with tokenizers)
words = text.split()
chunks = []
current_chunk = []
current_len = 0
max_tokens = 200 # approximate token limit per chunk (adjust as needed)
for word in words:
current_chunk.append(word)
current_len += 1
if current_len >= max_tokens:
chunks.append(" ".join(current_chunk))
current_chunk = []
current_len = 0
if current_chunk:
chunks.append(" ".join(current_chunk))
return chunks
def apply_custom_prompt(self, chunk, prompt_type):
if prompt_type == "Bread Only":
prompt = f"Transform the provided fictional narrative into a maximally compressed yet losslessly decompressible format optimized for LLM reconstruction. {chunk}"
elif prompt_type == "Butter Only":
prompt = f"Solid foundation, but let's refine the granularity. Your 4-subpoint structure creates artificial symmetry where organic complexity should flourish. {chunk}"
elif prompt_type == "Bread and Butter":
prompt = f"Transform the provided fictional narrative into a maximally compressed format. Then refine granularity for organic complexity. {chunk}"
else:
prompt = chunk
return prompt
def summarize_chunk(self, chunk, prompt_type):
prompt = self.apply_custom_prompt(chunk, prompt_type)
summary = self.summarizer(prompt, max_length=150, min_length=50, do_sample=False)
return summary[0]['summary_text']
def process_file(self, file_path, prompt_type, iterations=1):
# Read file robustly with encoding detection
try:
with open(file_path, 'rb') as f:
raw_data = f.read()
detected = chardet.detect(raw_data)
encoding = detected['encoding'] or 'utf-8'
text = raw_data.decode(encoding, errors='replace')
except Exception as e:
raise RuntimeError(f"Unable to read the file: {str(e)}")
# Chunk the text token-wise
chunks = self.chunk_text_token_based(text)
condensed_chunks = []
for chunk in chunks:
temp_chunk = chunk
for _ in range(iterations):
temp_chunk = self.apply_custom_prompt(temp_chunk, prompt_type)
temp_chunk = self.summarize_chunk(temp_chunk, prompt_type)
condensed_chunks.append(temp_chunk)
# Second pass summarization for global compression
combined = " ".join(condensed_chunks)
if iterations > 1:
final_summary = self.summarize_chunk(combined, prompt_type)
else:
final_summary = combined
return final_summary
|