Commit
·
93d4e58
1
Parent(s):
5ee5de2
updated
Browse files- app.py +24 -23
- text-paraphraser-expander.ipynb +1 -0
app.py
CHANGED
|
@@ -116,8 +116,8 @@ def highlight_differences(original: str, generated: str) -> Tuple[str, str, Dict
|
|
| 116 |
|
| 117 |
elif tag == 'replace':
|
| 118 |
# Changed text
|
| 119 |
-
highlighted_original.append(f'<span style="
|
| 120 |
-
highlighted_generated.append(f'<span style="
|
| 121 |
changes_count += max(i2 - i1, j2 - j1)
|
| 122 |
|
| 123 |
# Track word substitutions (limit to single word changes for clarity)
|
|
@@ -126,12 +126,12 @@ def highlight_differences(original: str, generated: str) -> Tuple[str, str, Dict
|
|
| 126 |
|
| 127 |
elif tag == 'delete':
|
| 128 |
# Text removed in generated
|
| 129 |
-
highlighted_original.append(f'<span style="
|
| 130 |
deletions_count += (i2 - i1)
|
| 131 |
|
| 132 |
elif tag == 'insert':
|
| 133 |
# Text added in generated
|
| 134 |
-
highlighted_generated.append(f'<span style="
|
| 135 |
additions_count += (j2 - j1)
|
| 136 |
|
| 137 |
# Join with spaces
|
|
@@ -160,49 +160,49 @@ def highlight_differences(original: str, generated: str) -> Tuple[str, str, Dict
|
|
| 160 |
return final_original, final_generated, statistics
|
| 161 |
|
| 162 |
def format_statistics(stats: Dict) -> str:
|
| 163 |
-
"""Format statistics into a readable HTML string"""
|
| 164 |
html = f"""
|
| 165 |
-
<div style="padding: 15px; background:
|
| 166 |
<h3 style="margin-top: 0; color: white;">π Change Analysis</h3>
|
| 167 |
|
| 168 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px; margin: 15px 0;">
|
| 169 |
-
<div style="background: rgba(255,255,255,0.
|
| 170 |
<div style="font-size: 24px; font-weight: bold;">{stats['total_original']}</div>
|
| 171 |
-
<div style="font-size: 12px; opacity: 0.
|
| 172 |
</div>
|
| 173 |
-
<div style="background: rgba(255,255,255,0.
|
| 174 |
<div style="font-size: 24px; font-weight: bold;">{stats['total_generated']}</div>
|
| 175 |
-
<div style="font-size: 12px; opacity: 0.
|
| 176 |
</div>
|
| 177 |
-
<div style="background: rgba(255,255,255,0.
|
| 178 |
<div style="font-size: 24px; font-weight: bold; color: #90EE90;">{stats['unchanged']}</div>
|
| 179 |
-
<div style="font-size: 12px; opacity: 0.
|
| 180 |
</div>
|
| 181 |
-
<div style="background: rgba(255,255,255,0.
|
| 182 |
-
<div style="font-size: 24px; font-weight: bold; color: #
|
| 183 |
-
<div style="font-size: 12px; opacity: 0.
|
| 184 |
</div>
|
| 185 |
</div>
|
| 186 |
|
| 187 |
-
<div style="margin: 15px 0; padding: 10px; background: rgba(255,255,255,0.
|
| 188 |
<div style="margin-bottom: 8px;">
|
| 189 |
<strong>Modification Rate:</strong> {stats['percentage_changed']:.1f}% modified, {stats['percentage_unchanged']:.1f}% preserved
|
| 190 |
</div>
|
| 191 |
<div style="margin-bottom: 8px;">
|
| 192 |
<span style="color: #90EE90;">β Added: {stats['added']} words</span> |
|
| 193 |
-
<span style="color: #
|
| 194 |
</div>
|
| 195 |
</div>
|
| 196 |
"""
|
| 197 |
|
| 198 |
if stats['substitutions']:
|
| 199 |
html += """
|
| 200 |
-
<div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.
|
| 201 |
<strong>π Sample Word Substitutions:</strong>
|
| 202 |
<div style="margin-top: 8px; font-size: 13px; line-height: 1.6;">
|
| 203 |
"""
|
| 204 |
for orig, new in stats['substitutions']:
|
| 205 |
-
html += f'<div style="margin: 4px 0;"><span style="color: #
|
| 206 |
html += """
|
| 207 |
</div>
|
| 208 |
</div>
|
|
@@ -211,8 +211,8 @@ def format_statistics(stats: Dict) -> str:
|
|
| 211 |
html += """
|
| 212 |
<div style="margin-top: 15px; padding: 8px; background: rgba(255,255,255,0.05); border-radius: 6px; font-size: 12px;">
|
| 213 |
<strong>Legend:</strong>
|
| 214 |
-
<span style="background-color: #
|
| 215 |
-
<span style="background-color: #
|
| 216 |
</div>
|
| 217 |
</div>
|
| 218 |
"""
|
|
@@ -386,6 +386,7 @@ def process_text(
|
|
| 386 |
print(error_msg)
|
| 387 |
return error_msg, "Error occurred", 0.0, "", "", ""
|
| 388 |
|
|
|
|
| 389 |
# Create Gradio interface
|
| 390 |
with gr.Blocks(title="Text Paraphraser & Expander", theme=gr.themes.Soft()) as demo:
|
| 391 |
gr.Markdown(
|
|
@@ -499,8 +500,6 @@ with gr.Blocks(title="Text Paraphraser & Expander", theme=gr.themes.Soft()) as d
|
|
| 499 |
gr.Markdown("---")
|
| 500 |
gr.Markdown("## π Visual Comparison - See What Changed")
|
| 501 |
|
| 502 |
-
change_stats = gr.HTML(label="Change Statistics")
|
| 503 |
-
|
| 504 |
with gr.Row():
|
| 505 |
with gr.Column(scale=1):
|
| 506 |
gr.Markdown("### π Original Text (with changes highlighted)")
|
|
@@ -515,6 +514,8 @@ with gr.Blocks(title="Text Paraphraser & Expander", theme=gr.themes.Soft()) as d
|
|
| 515 |
label="Generated with Changes",
|
| 516 |
show_label=False
|
| 517 |
)
|
|
|
|
|
|
|
| 518 |
|
| 519 |
# Event handlers
|
| 520 |
mode.change(
|
|
|
|
| 116 |
|
| 117 |
elif tag == 'replace':
|
| 118 |
# Changed text
|
| 119 |
+
highlighted_original.append(f'<span style="color: #ffcccc; padding: 2px 4px; border-radius: 3px; text-decoration: line-through;">{original_chunk}</span>')
|
| 120 |
+
highlighted_generated.append(f'<span style="color: #ccffcc; padding: 2px 4px; border-radius: 3px; font-weight: 500;">{generated_chunk}</span>')
|
| 121 |
changes_count += max(i2 - i1, j2 - j1)
|
| 122 |
|
| 123 |
# Track word substitutions (limit to single word changes for clarity)
|
|
|
|
| 126 |
|
| 127 |
elif tag == 'delete':
|
| 128 |
# Text removed in generated
|
| 129 |
+
highlighted_original.append(f'<span style="color: #ffcccc; padding: 2px 4px; border-radius: 3px; text-decoration: line-through;">{original_chunk}</span>')
|
| 130 |
deletions_count += (i2 - i1)
|
| 131 |
|
| 132 |
elif tag == 'insert':
|
| 133 |
# Text added in generated
|
| 134 |
+
highlighted_generated.append(f'<span style="color: #ccffcc; padding: 2px 4px; border-radius: 3px; font-weight: 500;">{generated_chunk}</span>')
|
| 135 |
additions_count += (j2 - j1)
|
| 136 |
|
| 137 |
# Join with spaces
|
|
|
|
| 160 |
return final_original, final_generated, statistics
|
| 161 |
|
| 162 |
def format_statistics(stats: Dict) -> str:
|
| 163 |
+
"""Format statistics into a readable HTML string with dark theme"""
|
| 164 |
html = f"""
|
| 165 |
+
<div style="padding: 15px; background: #000000; border-radius: 10px; color: white; margin: 10px 0;">
|
| 166 |
<h3 style="margin-top: 0; color: white;">📊 Change Analysis</h3>
|
| 167 |
|
| 168 |
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px; margin: 15px 0;">
|
| 169 |
+
<div style="background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;">
|
| 170 |
<div style="font-size: 24px; font-weight: bold;">{stats['total_original']}</div>
|
| 171 |
+
<div style="font-size: 12px; opacity: 0.8;">Original Words</div>
|
| 172 |
</div>
|
| 173 |
+
<div style="background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;">
|
| 174 |
<div style="font-size: 24px; font-weight: bold;">{stats['total_generated']}</div>
|
| 175 |
+
<div style="font-size: 12px; opacity: 0.8;">Generated Words</div>
|
| 176 |
</div>
|
| 177 |
+
<div style="background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;">
|
| 178 |
<div style="font-size: 24px; font-weight: bold; color: #90EE90;">{stats['unchanged']}</div>
|
| 179 |
+
<div style="font-size: 12px; opacity: 0.8;">Unchanged</div>
|
| 180 |
</div>
|
| 181 |
+
<div style="background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;">
|
| 182 |
+
<div style="font-size: 24px; font-weight: bold; color: #FF4C4C;">{stats['changed']}</div>
|
| 183 |
+
<div style="font-size: 12px; opacity: 0.8;">Changed</div>
|
| 184 |
</div>
|
| 185 |
</div>
|
| 186 |
|
| 187 |
+
<div style="margin: 15px 0; padding: 10px; background: rgba(255,255,255,0.05); border-radius: 8px;">
|
| 188 |
<div style="margin-bottom: 8px;">
|
| 189 |
<strong>Modification Rate:</strong> {stats['percentage_changed']:.1f}% modified, {stats['percentage_unchanged']:.1f}% preserved
|
| 190 |
</div>
|
| 191 |
<div style="margin-bottom: 8px;">
|
| 192 |
<span style="color: #90EE90;">β Added: {stats['added']} words</span> |
|
| 193 |
+
<span style="color: #FF4C4C;">β Removed: {stats['deleted']} words</span>
|
| 194 |
</div>
|
| 195 |
</div>
|
| 196 |
"""
|
| 197 |
|
| 198 |
if stats['substitutions']:
|
| 199 |
html += """
|
| 200 |
+
<div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.05); border-radius: 8px;">
|
| 201 |
<strong>π Sample Word Substitutions:</strong>
|
| 202 |
<div style="margin-top: 8px; font-size: 13px; line-height: 1.6;">
|
| 203 |
"""
|
| 204 |
for orig, new in stats['substitutions']:
|
| 205 |
+
html += f'<div style="margin: 4px 0;"><span style="color: #FF4C4C;">{orig}</span> → <span style="color: #90EE90;">{new}</span></div>'
|
| 206 |
html += """
|
| 207 |
</div>
|
| 208 |
</div>
|
|
|
|
| 211 |
html += """
|
| 212 |
<div style="margin-top: 15px; padding: 8px; background: rgba(255,255,255,0.05); border-radius: 6px; font-size: 12px;">
|
| 213 |
<strong>Legend:</strong>
|
| 214 |
+
<span style="background-color: #FF4C4C; padding: 2px 6px; border-radius: 3px; margin: 0 4px;">Removed/Changed</span>
|
| 215 |
+
<span style="background-color: #90EE90; padding: 2px 6px; border-radius: 3px; margin: 0 4px;">Added/New</span>
|
| 216 |
</div>
|
| 217 |
</div>
|
| 218 |
"""
|
|
|
|
| 386 |
print(error_msg)
|
| 387 |
return error_msg, "Error occurred", 0.0, "", "", ""
|
| 388 |
|
| 389 |
+
# Create Gradio interface
|
| 390 |
# Create Gradio interface
|
| 391 |
with gr.Blocks(title="Text Paraphraser & Expander", theme=gr.themes.Soft()) as demo:
|
| 392 |
gr.Markdown(
|
|
|
|
| 500 |
gr.Markdown("---")
|
| 501 |
gr.Markdown("## 🔍 Visual Comparison - See What Changed")
|
| 502 |
|
|
|
|
|
|
|
| 503 |
with gr.Row():
|
| 504 |
with gr.Column(scale=1):
|
| 505 |
gr.Markdown("### π Original Text (with changes highlighted)")
|
|
|
|
| 514 |
label="Generated with Changes",
|
| 515 |
show_label=False
|
| 516 |
)
|
| 517 |
+
|
| 518 |
+
change_stats = gr.HTML(label="Change Statistics")
|
| 519 |
|
| 520 |
# Event handlers
|
| 521 |
mode.change(
|
text-paraphraser-expander.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":31153,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"pip install gradio transformers torch sentence-transformers","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import gradio as gr\nimport torch\nfrom transformers import (\n AutoTokenizer, \n AutoModelForSeq2SeqLM,\n T5ForConditionalGeneration,\n T5Tokenizer\n)\nfrom sentence_transformers import SentenceTransformer, util\nimport numpy as np\nfrom typing import List, Tuple, Dict\nimport re\nimport numpy as np\nimport difflib","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Initialize similarity model\nsimilarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"PARAPHRASE_MODELS = {\n \"T5-Base\": \"Vamsi/T5_Paraphrase_Paws\",\n \"PEGASUS-Paraphrase\": \"tuner007/pegasus_paraphrase\",\n \"Parrot-Paraphraser\": \"prithivida/parrot_paraphraser_on_T5\",\n \"BART-Paraphrase\": \"eugenesiow/bart-paraphrase\",\n \"ChatGPT-Style-T5\": \"humarin/chatgpt_paraphraser_on_T5_base\",\n}","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"EXPANSION_MODELS = {\n \"Flan-T5-Base\": \"google/flan-t5-base\",\n \"Flan-T5-Large\": 
\"google/flan-t5-large\",\n}","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Cache for loaded models\nmodel_cache = {}","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def load_model(model_name: str, model_path: str):\n \"\"\"Load model and tokenizer with caching\"\"\"\n if model_name in model_cache:\n return model_cache[model_name]\n \n print(f\"Loading {model_name}...\")\n tokenizer = AutoTokenizer.from_pretrained(model_path)\n model = AutoModelForSeq2SeqLM.from_pretrained(model_path)\n \n # Move to GPU if available\n device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n model = model.to(device)\n \n model_cache[model_name] = (model, tokenizer, device)\n return model, tokenizer, device","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def chunk_text(text: str, max_sentences: int = 4) -> List[str]:\n \"\"\"Split text into chunks based on number of sentences\"\"\"\n import re\n sentences = re.split(r'(?<=[.!?]) +', text.strip())\n chunks = [' '.join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]\n return [chunk for chunk in chunks if chunk.strip()]\n\ndef estimate_tokens(text: str) -> int:\n \"\"\"Estimate number of tokens in text (approximate: 1 token β 0.75 words)\"\"\"\n word_count = len(text.split())\n return int(word_count / 0.75)\n\ndef calculate_max_length(input_text: str, mode: str, base_max_length: int) -> int:\n \"\"\"Calculate appropriate max_length based on input tokens\"\"\"\n input_tokens = estimate_tokens(input_text)\n \n if mode == \"Paraphrase\":\n # For paraphrasing: output should be 1.2-1.5x input tokens\n calculated_max = int(input_tokens * 1.5) + 50\n else:\n # For expansion: output should be 2-3x input tokens\n calculated_max = int(input_tokens * 3) + 100\n \n # Use the larger of calculated or user-specified max_length\n final_max_length = max(calculated_max, 
base_max_length)\n \n # Cap at reasonable maximum to avoid memory issues\n return min(final_max_length, 1024)\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# def chunk_text(text: str, max_tokens: int = 300) -> List[str]:\n# \"\"\"Split text into chunks based on sentences to avoid exceeding token limits\"\"\"\n# sentences = re.split(r'(?<=[.!?])\\s+', text)\n# chunks = []\n# current_chunk = []\n# current_length = 0\n \n# for sentence in sentences:\n# sentence_length = len(sentence.split())\n# if current_length + sentence_length > max_tokens and current_chunk:\n# chunks.append(' '.join(current_chunk))\n# current_chunk = [sentence]\n# current_length = sentence_length\n# else:\n# current_chunk.append(sentence)\n# current_length += sentence_length\n \n# if current_chunk:\n# chunks.append(' '.join(current_chunk))\n \n# return chunks if chunks else [text]","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def calculate_similarity(text1: str, text2: str) -> float:\n \"\"\"Calculate cosine similarity between two texts\"\"\"\n embeddings = similarity_model.encode([text1, text2], convert_to_tensor=True)\n similarity = util.cos_sim(embeddings[0], embeddings[1])\n return similarity.item()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def highlight_differences(original: str, generated: str) -> Tuple[str, str, Dict]:\n \"\"\"\n Create highlighted HTML versions of both texts showing differences\n Returns: (highlighted_original, highlighted_generated, statistics)\n \"\"\"\n # Split into words for comparison\n original_words = original.split()\n generated_words = generated.split()\n \n # Use difflib to find differences\n diff = difflib.SequenceMatcher(None, original_words, generated_words)\n \n highlighted_original = []\n highlighted_generated = []\n \n changes_count = 0\n additions_count = 0\n deletions_count = 0\n unchanged_count = 0\n 
word_substitutions = []\n \n for tag, i1, i2, j1, j2 in diff.get_opcodes():\n original_chunk = ' '.join(original_words[i1:i2])\n generated_chunk = ' '.join(generated_words[j1:j2])\n \n if tag == 'equal':\n # Unchanged text\n highlighted_original.append(original_chunk)\n highlighted_generated.append(generated_chunk)\n unchanged_count += (i2 - i1)\n \n elif tag == 'replace':\n # Changed text\n highlighted_original.append(f'<span style=\"color: #ffcccc; padding: 2px 4px; border-radius: 3px; text-decoration: line-through;\">{original_chunk}</span>')\n highlighted_generated.append(f'<span style=\"color: #ccffcc; padding: 2px 4px; border-radius: 3px; font-weight: 500;\">{generated_chunk}</span>')\n changes_count += max(i2 - i1, j2 - j1)\n \n # Track word substitutions (limit to single word changes for clarity)\n if i2 - i1 == 1 and j2 - j1 == 1:\n word_substitutions.append((original_chunk, generated_chunk))\n \n elif tag == 'delete':\n # Text removed in generated\n highlighted_original.append(f'<span style=\"color: #ffcccc; padding: 2px 4px; border-radius: 3px; text-decoration: line-through;\">{original_chunk}</span>')\n deletions_count += (i2 - i1)\n \n elif tag == 'insert':\n # Text added in generated\n highlighted_generated.append(f'<span style=\"color: #ccffcc; padding: 2px 4px; border-radius: 3px; font-weight: 500;\">{generated_chunk}</span>')\n additions_count += (j2 - j1)\n \n # Join with spaces\n final_original = ' '.join(highlighted_original)\n final_generated = ' '.join(highlighted_generated)\n \n # Calculate statistics\n total_original_words = len(original_words)\n total_generated_words = len(generated_words)\n \n percentage_changed = (changes_count + deletions_count + additions_count) / max(total_original_words, 1) * 100\n percentage_unchanged = (unchanged_count / max(total_original_words, 1)) * 100\n \n statistics = {\n 'total_original': total_original_words,\n 'total_generated': total_generated_words,\n 'unchanged': unchanged_count,\n 'changed': 
changes_count,\n 'added': additions_count,\n 'deleted': deletions_count,\n 'percentage_changed': percentage_changed,\n 'percentage_unchanged': percentage_unchanged,\n 'substitutions': word_substitutions[:10] # Limit to first 10\n }\n \n return final_original, final_generated, statistics\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def format_statistics(stats: Dict) -> str:\n \"\"\"Format statistics into a readable HTML string with dark theme\"\"\"\n html = f\"\"\"\n <div style=\"padding: 15px; background: #000000; border-radius: 10px; color: white; margin: 10px 0;\">\n <h3 style=\"margin-top: 0; color: white;\">π Change Analysis</h3>\n \n <div style=\"display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px; margin: 15px 0;\">\n <div style=\"background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;\">\n <div style=\"font-size: 24px; font-weight: bold;\">{stats['total_original']}</div>\n <div style=\"font-size: 12px; opacity: 0.8;\">Original Words</div>\n </div>\n <div style=\"background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;\">\n <div style=\"font-size: 24px; font-weight: bold;\">{stats['total_generated']}</div>\n <div style=\"font-size: 12px; opacity: 0.8;\">Generated Words</div>\n </div>\n <div style=\"background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;\">\n <div style=\"font-size: 24px; font-weight: bold; color: #90EE90;\">{stats['unchanged']}</div>\n <div style=\"font-size: 12px; opacity: 0.8;\">Unchanged</div>\n </div>\n <div style=\"background: rgba(255,255,255,0.05); padding: 10px; border-radius: 8px; text-align: center;\">\n <div style=\"font-size: 24px; font-weight: bold; color: #FF4C4C;\">{stats['changed']}</div>\n <div style=\"font-size: 12px; opacity: 0.8;\">Changed</div>\n </div>\n </div>\n \n <div style=\"margin: 15px 0; padding: 10px; background: 
rgba(255,255,255,0.05); border-radius: 8px;\">\n <div style=\"margin-bottom: 8px;\">\n <strong>Modification Rate:</strong> {stats['percentage_changed']:.1f}% modified, {stats['percentage_unchanged']:.1f}% preserved\n </div>\n <div style=\"margin-bottom: 8px;\">\n <span style=\"color: #90EE90;\">β Added: {stats['added']} words</span> | \n <span style=\"color: #FF4C4C;\">β Removed: {stats['deleted']} words</span>\n </div>\n </div>\n \"\"\"\n \n if stats['substitutions']:\n html += \"\"\"\n <div style=\"margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.05); border-radius: 8px;\">\n <strong>π Sample Word Substitutions:</strong>\n <div style=\"margin-top: 8px; font-size: 13px; line-height: 1.6;\">\n \"\"\"\n for orig, new in stats['substitutions']:\n html += f'<div style=\"margin: 4px 0;\"><span style=\"color: #FF4C4C;\">{orig}</span> β <span style=\"color: #90EE90;\">{new}</span></div>'\n html += \"\"\"\n </div>\n </div>\n \"\"\"\n \n html += \"\"\"\n <div style=\"margin-top: 15px; padding: 8px; background: rgba(255,255,255,0.05); border-radius: 6px; font-size: 12px;\">\n <strong>Legend:</strong> \n <span style=\"background-color: #FF4C4C; padding: 2px 6px; border-radius: 3px; margin: 0 4px;\">Removed/Changed</span>\n <span style=\"background-color: #90EE90; padding: 2px 6px; border-radius: 3px; margin: 0 4px;\">Added/New</span>\n </div>\n </div>\n \"\"\"\n \n return html\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def paraphrase_text(\n text: str,\n model_name: str,\n temperature: float,\n top_p: float,\n max_length: int,\n num_beams: int,\n max_sentences: int,\n target_words: int = None,\n mode: str = \"Paraphrase\"\n) -> Tuple[str, float]:\n \"\"\"Paraphrase or expand text based on mode\"\"\"\n \n if not text.strip():\n return \"Please enter some text to process.\", 0.0\n \n # Select appropriate model based on mode\n if mode == \"Paraphrase\":\n models_dict = PARAPHRASE_MODELS\n if model_name not in 
models_dict:\n model_name = list(models_dict.keys())[0]\n model_path = models_dict[model_name]\n prefix = \"paraphrase: \" if \"T5\" in model_name else \"\"\n else: # Expand mode\n models_dict = EXPANSION_MODELS\n if model_name not in models_dict:\n model_name = list(models_dict.keys())[0]\n model_path = models_dict[model_name]\n target_words = target_words or 300\n prefix = f\"Expand the following text to approximately {target_words} words, adding more details and context: \"\n \n # Load model\n model, tokenizer, device = load_model(model_name, model_path)\n \n # Chunk text based on sentences\n chunks = chunk_text(text, max_sentences=max_sentences)\n \n processed_chunks = []\n \n print(f\"\\n{'='*60}\")\n print(f\"Processing {len(chunks)} chunk(s) with {max_sentences} sentences per chunk\")\n print(f\"{'='*60}\")\n \n for i, chunk in enumerate(chunks):\n # Calculate dynamic max_length for this chunk\n chunk_max_length = calculate_max_length(chunk, mode, max_length)\n input_tokens = estimate_tokens(chunk)\n \n # Prepare input\n input_text = prefix + chunk + \" </s>\" if mode == \"Paraphrase\" else prefix + chunk\n inputs = tokenizer.encode(\n input_text, \n return_tensors=\"pt\", \n max_length=512, \n truncation=True\n )\n inputs = inputs.to(device)\n \n # Calculate min_length to ensure output isn't too short\n if mode == \"Paraphrase\":\n min_length_calc = int(input_tokens * 0.8)\n else:\n min_length_calc = int(input_tokens * 1.5)\n \n # Generate\n with torch.no_grad():\n outputs = model.generate(\n inputs,\n max_length=chunk_max_length,\n min_length=min(min_length_calc, chunk_max_length - 10),\n num_beams=num_beams,\n temperature=temperature if temperature > 0 else 1.0,\n top_p=top_p,\n top_k=120 if mode == \"Paraphrase\" else 50,\n do_sample=temperature > 0,\n early_stopping=True,\n no_repeat_ngram_size=3 if mode == \"Expand\" else 2,\n length_penalty=1.0 if mode == \"Paraphrase\" else 1.5,\n repetition_penalty=1.2,\n )\n \n # Decode output\n processed_text = 
tokenizer.decode(outputs[0], skip_special_tokens=True)\n processed_chunks.append(processed_text.strip())\n \n output_tokens = estimate_tokens(processed_text)\n print(f\"Chunk {i+1}/{len(chunks)}:\")\n print(f\" Input: {len(chunk.split())} words (~{input_tokens} tokens)\")\n print(f\" Output: {len(processed_text.split())} words (~{output_tokens} tokens)\")\n print(f\" Max length used: {chunk_max_length}\")\n print(f\"-\" * 60)\n \n # Combine chunks with double newline\n final_text = \"\\n\\n\".join(processed_chunks)\n \n # Calculate similarity\n similarity_score = calculate_similarity(text, final_text)\n \n print(f\"{'='*60}\")\n print(f\"Total: {len(text.split())} β {len(final_text.split())} words\")\n print(f\"Similarity: {similarity_score:.4f}\")\n print(f\"{'='*60}\\n\")\n \n return final_text, similarity_score","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def update_model_choices(mode: str):\n \"\"\"Update model dropdown based on selected mode\"\"\"\n if mode == \"Paraphrase\":\n choices = list(PARAPHRASE_MODELS.keys())\n else:\n choices = list(EXPANSION_MODELS.keys())\n return gr.Dropdown(choices=choices, value=choices[0])\n\ndef update_parameters_visibility(mode: str):\n \"\"\"Show/hide target words parameter based on mode\"\"\"\n if mode == \"Expand\":\n return gr.Number(visible=True)\n else:\n return gr.Number(visible=False)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def process_text(\n input_text: str,\n mode: str,\n model_name: str,\n temperature: float,\n top_p: float,\n max_length: int,\n num_beams: int,\n max_sentences: int,\n target_words: int\n):\n \"\"\"Main processing function\"\"\"\n try:\n output_text, similarity = paraphrase_text(\n input_text,\n model_name,\n temperature,\n top_p,\n max_length,\n num_beams,\n max_sentences,\n target_words,\n mode\n )\n \n word_count_original = len(input_text.split())\n word_count_output = 
len(output_text.split())\n \n # Generate highlighted comparison\n highlighted_original, highlighted_generated, statistics = highlight_differences(\n input_text, \n output_text\n )\n \n # Format statistics\n stats_html = format_statistics(statistics)\n \n # Basic stats line\n basic_stats = f\"**Original:** {word_count_original} words | **Generated:** {word_count_output} words | **Similarity:** {similarity:.4f}\"\n \n return output_text, basic_stats, similarity, highlighted_original, highlighted_generated, stats_html\n except Exception as e:\n import traceback\n error_msg = f\"Error: {str(e)}\\n\\n{traceback.format_exc()}\"\n print(error_msg)\n return error_msg, \"Error occurred\", 0.0, \"\", \"\", \"\"\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Create Gradio interface\nwith gr.Blocks(title=\"Text Paraphraser & Expander\", theme=gr.themes.Soft()) as demo:\n gr.Markdown(\n \"\"\"\n # π Text Paraphraser & Expander\n Transform your text with AI-powered paraphrasing and expansion capabilities.\n \"\"\"\n )\n \n with gr.Row():\n with gr.Column(scale=1):\n mode = gr.Radio(\n choices=[\"Paraphrase\", \"Expand\"],\n value=\"Paraphrase\",\n label=\"Mode\",\n info=\"Choose to paraphrase or expand your text\"\n )\n \n model_dropdown = gr.Dropdown(\n choices=list(PARAPHRASE_MODELS.keys()),\n value=list(PARAPHRASE_MODELS.keys())[0],\n label=\"Model Selection\",\n info=\"Choose the model for processing\"\n )\n \n gr.Markdown(\"### βοΈ Parameters\")\n \n temperature = gr.Slider(\n minimum=0.0,\n maximum=2.0,\n value=0.7,\n step=0.1,\n label=\"Temperature\",\n info=\"Higher = more creative, Lower = more focused\"\n )\n \n top_p = gr.Slider(\n minimum=0.1,\n maximum=1.0,\n value=0.9,\n step=0.05,\n label=\"Top-p (Nucleus Sampling)\",\n info=\"Probability threshold for token selection\"\n )\n \n max_length = gr.Slider(\n minimum=128,\n maximum=1024,\n value=512,\n step=32,\n label=\"Max Length (tokens)\",\n info=\"Maximum length 
of generated text per chunk\"\n )\n \n num_beams = gr.Slider(\n minimum=1,\n maximum=10,\n value=4,\n step=1,\n label=\"Number of Beams\",\n info=\"Higher = better quality but slower\"\n )\n \n max_sentences = gr.Slider(\n minimum=1,\n maximum=10,\n value=4,\n step=1,\n label=\"Sentences per Chunk\",\n info=\"Number of sentences to process together\"\n )\n \n target_words = gr.Number(\n value=300,\n label=\"Target Word Count (Expand mode)\",\n info=\"Approximate number of words for expansion\",\n visible=False\n )\n \n with gr.Row():\n with gr.Column(scale=1):\n gr.Markdown(\"### π₯ Input Text\")\n input_text = gr.Textbox(\n lines=10,\n placeholder=\"Enter your text here...\",\n label=\"Original Text\",\n show_copy_button=True\n )\n \n with gr.Column(scale=1):\n gr.Markdown(\"### π€ Generated Text\")\n output_text = gr.Textbox(\n lines=10,\n label=\"Processed Text\",\n show_copy_button=True\n )\n \n with gr.Row():\n process_btn = gr.Button(\"π Generate\", variant=\"primary\", size=\"lg\")\n clear_btn = gr.ClearButton([input_text, output_text], value=\"ποΈ Clear\")\n \n stats_display = gr.Markdown()\n \n similarity_display = gr.Number(\n label=\"Cosine Similarity Score\",\n precision=4,\n interactive=False\n )\n \n # Highlighted comparison section\n gr.Markdown(\"---\")\n gr.Markdown(\"## π Visual Comparison - See What Changed\")\n \n with gr.Row():\n with gr.Column(scale=1):\n gr.Markdown(\"### π Original Text (with changes highlighted)\")\n highlighted_original = gr.HTML(\n label=\"Original with Changes\",\n show_label=False\n )\n \n with gr.Column(scale=1):\n gr.Markdown(\"### β¨ Generated Text (with changes highlighted)\")\n highlighted_generated = gr.HTML(\n label=\"Generated with Changes\",\n show_label=False\n )\n\n change_stats = gr.HTML(label=\"Change Statistics\")\n \n # Event handlers\n mode.change(\n fn=update_model_choices,\n inputs=[mode],\n outputs=[model_dropdown]\n )\n \n mode.change(\n fn=update_parameters_visibility,\n inputs=[mode],\n 
outputs=[target_words]\n )\n \n process_btn.click(\n fn=process_text,\n inputs=[\n input_text,\n mode,\n model_dropdown,\n temperature,\n top_p,\n max_length,\n num_beams,\n max_sentences,\n target_words\n ],\n outputs=[\n output_text, \n stats_display, \n similarity_display,\n highlighted_original,\n highlighted_generated,\n change_stats\n ]\n )\n \n gr.Markdown(\n \"\"\"\n ---\n ### π‘ Tips:\n - **Paraphrase Mode**: Rewrites text while preserving meaning\n - **Expand Mode**: Adds details and elaboration to make text longer\n - **Sentences per Chunk**: Controls how many sentences are processed together (4 recommended)\n - Adjust temperature for creativity (0.7-1.0 for paraphrase, 1.0-1.5 for expansion)\n - Higher beam count = better quality but slower processing\n - Max length is automatically calculated based on input, but can be overridden\n - Output chunks are separated by double newlines for readability\n \"\"\"\n )","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"if __name__ == \"__main__\":\n demo.launch(share=True)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}
|