HuuHuy227 committed · Commit d6d5bda
Parent(s): 7b9f840
new-modified

Files changed:
- Dockerfile +1 -4
- app.py +177 -227
- requirements.txt +5 -5
- utils.py +0 -133
Dockerfile
CHANGED
@@ -6,15 +6,12 @@ WORKDIR /app

 # Install system dependencies for cairosvg
 RUN apt-get update && apt-get install -y \
+    graphviz \
     build-essential \
     python3-dev \
     python3-pip \
     python3-setuptools \
-    libcairo2-dev \
     pkg-config \
-    libcairo2 \
-    libcairo-gobject2 \
-    python3-cairo \
     libpango1.0-dev \
     shared-mime-info \
     mime-support \
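
The cairo libraries go away because the rewritten app.py renders syntax trees with Graphviz (shelling out to the `dot` binary) instead of converting displaCy SVGs to PNG. A minimal sketch, not part of the commit, for checking that an image built from this Dockerfile exposes `dot` to the Python binding (graphviz raises ExecutableNotFound otherwise):

import graphviz

g = graphviz.Digraph(comment='smoke test')
g.node('a', 'dot')
g.node('b', 'works')
g.edge('a', 'b')
print(g.pipe(format='svg')[:40])  # fails fast if the 'dot' executable is missing
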
app.py
CHANGED
@@ -1,246 +1,196 @@
 import streamlit as st
 import spacy
-
 import pandas as pd
-from collections import Counter
-import plotly.express as px
-from utils import analyze_text
-from utils import svg_to_png
 import base64

-#
-

-
-
-
-

-
-
-
-
-
-
-
-
-
-"
-
-
-
-
-
-
-
-
-
-1. Enter your text in the input box
-2. Click "Analyze Text" to see:
-   - Sentence structure visualization
-   - Detailed token analysis
-   - Additional analysis in expandable sections
-3. Use mouse wheel or buttons to zoom the visualization
-4. Click and drag to pan around
-""")
-
-if analyze_button:
-    if text_input:
-        tokens, entities, noun_chunks, stats, doc = analyze_text(nlp, text_input)
-
-        # 1. Dependency Parse with improved visualization
-        st.header("Sentence Structure Analysis")
-
-        # Generate sentence visualizations
-        sentences = list(doc.sents)
-        sentence_htmls = []
-        for sent in sentences:
-            sent_html = displacy.render(sent, style="dep", options={
-                "distance": 120,
-                "arrow_stroke": 2,
-                "arrow_width": 8,
-                "font": "Arial",
-                "bg": "#ffffff",
             })
-
-
-
-
-
-

-#
-
-
-
-        png_b64 = base64.b64encode(png_bytes).decode()

-
-
-
-
-
-
-
-
-
-
-
-
-            transform-origin: 0 0;
-            transition: transform 0.1s;
-        }
-        .download-btn {
-            position: absolute;
-            right: 10px;
-            top: 10px;
-            background: rgba(255, 255, 255, 0.8);
-            border: 1px solid #ddd;
-            border-radius: 4px;
-            padding: 5px 10px;
-            cursor: pointer;
-        }
-        .download-btn:hover {
-            background: white;
-        }
-        </style>
-        """, unsafe_allow_html=True)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            // Zoom functionality
-            container.addEventListener('wheel', (e) => {{
-                e.preventDefault();
-                const rect = container.getBoundingClientRect();
-                const mouseX = e.clientX - rect.left;
-                const mouseY = e.clientY - rect.top;
-
-                const delta = e.deltaY * -0.01;
-                const newScale = Math.max(1, Math.min(scale + delta, 4));
-                const scaleChange = newScale / scale;
-
-                translateX = mouseX - (mouseX - translateX) * scaleChange;
-                translateY = mouseY - (mouseY - translateY) * scaleChange;
-
-                scale = newScale;
-                updateTransform();
-            }});
-
-            // Pan functionality
-            container.addEventListener('mousedown', (e) => {{
-                isPanning = true;
-                startX = e.clientX - translateX;
-                startY = e.clientY - translateY;
-                container.style.cursor = 'grabbing';
-            }});
-
-            container.addEventListener('mousemove', (e) => {{
-                if (!isPanning) return;
-                translateX = e.clientX - startX;
-                translateY = e.clientY - startY;
-                updateTransform();
-            }});
-
-            container.addEventListener('mouseup', () => {{
-                isPanning = false;
-                container.style.cursor = 'grab';
-            }});
-
-            container.addEventListener('mouseleave', () => {{
-                isPanning = false;
-                container.style.cursor = 'grab';
-            }});
-
-            function updateTransform() {{
-                img.style.transform = `translate(${{translateX}}px, ${{translateY}}px) scale(${{scale}})`;
-            }}
-
-            // Initialize
-            container.style.cursor = 'grab';
-            container.style.height = '500px';
-        </script>
-        """

-
-
-
-
-
-

-
-

-#
-

-
-
-
-
-
-
-
-
-

-
-        st.
-
-
-
-

-#
-
-
-
-
-            labels={'x': 'Entity Type', 'y': 'Count'}
-        )
-        st.plotly_chart(fig)

-
-
-        st.
-
-
-
-
-
-            st.info("No noun chunks found in the text.")
-
-        with st.expander("Text Statistics"):
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Word Count", stats['Word Count'])
-            with col2:
-                st.metric("Sentence Count", stats['Sentence Count'])
-            with col3:
-                st.metric("Unique Words", stats['Unique Words'])

-
-
 import streamlit as st
 import spacy
+import graphviz
 import pandas as pd
 import base64
+import shutil
+import subprocess

+# Load English language model for spaCy
+nlp = spacy.load('en_core_web_md')

+def check_graphviz_installation():
+    """
+    Check if Graphviz is installed and accessible
+    """
+    if shutil.which('dot') is None:
+        return False
+    try:
+        subprocess.run(['dot', '-V'], capture_output=True, check=True)
+        return True
+    except (subprocess.SubprocessError, OSError):
+        return False

+def identify_clauses(doc):
+    """
+    Identify clauses in the sentence using spaCy, correctly separating dependent and independent clauses
+    """
+    clauses = []
+
+    # First identify all subordinate clauses and their spans
+    subordinate_spans = []
+    for token in doc:
+        if token.dep_ in ["ccomp", "xcomp", "advcl", "relcl"]:
+            span = doc[token.left_edge.i:token.right_edge.i + 1]
+            subordinate_spans.append({
+                "span": span,
+                "type": {
+                    "ccomp": "Complement Clause",
+                    "xcomp": "Open Complement Clause",
+                    "advcl": "Adverbial Clause",
+                    "relcl": "Adjective Clause"
+                }[token.dep_]
             })
+
+    # Find the root and construct the main clause by excluding subordinate spans
+    root = None
+    for token in doc:
+        if token.dep_ == "ROOT":
+            root = token
+            break
+
+    if root:
+        # Get all tokens in the root's subtree
+        main_clause_tokens = set(token for token in root.subtree)

+        # Remove tokens that are part of subordinate clauses
+        for sub_clause in subordinate_spans:
+            for token in sub_clause["span"]:
+                if token in main_clause_tokens:
+                    main_clause_tokens.remove(token)

+        # Construct the main clause text from remaining tokens
+        main_clause_text = " ".join(sorted([token.text for token in main_clause_tokens],
+                                           key=lambda x: [t.i for t in doc if t.text == x][0]))
+        main_clause_text = main_clause_text.strip().replace(",","").replace(".","")
+        clauses.append({"Type": "Independent Clause", "Text": main_clause_text})
+
+        # Add the subordinate clauses
+        for sub_clause in subordinate_spans:
+            clauses.append({
+                "Type": sub_clause["type"],
+                "Text": sub_clause["span"].text
+            })

+    return clauses
+
+def analyze_clause_functions(doc):
+    """
+    Analyze the function of each clause
+    """
+    functions = []
+
+    for token in doc:
+        if token.dep_ == "ROOT":
+            functions.append({"Type": "Independent Clause", "Function": "Express the primary action or state"})
+        elif token.dep_ == "ccomp":
+            functions.append({"Type": "Complement Clause", "Function": "Acts as object of the main verb"})
+        elif token.dep_ == "xcomp":
+            functions.append({"Type": "Open Complement Clause", "Function": "Predicate complement without its own subject"})
+        elif token.dep_ == "advcl":
+            functions.append({"Type": "Adverbial Clause", "Function": "Modifies the verb like an adverb"})
+        elif token.dep_ == "relcl":
+            functions.append({"Type": "Adjective Clause", "Function": "Modifies a noun like an adjective"})

+    return functions
+
+def create_dependency_graph(doc):
+    """
+    Create a graphviz visualization of the dependency tree
+    """
+    if not check_graphviz_installation():
+        return None
+
+    dot = graphviz.Digraph(comment='Dependency Tree')
+
+    # Add nodes
+    for token in doc:
+        dot.node(str(token.i), f"{token.text}\n({token.pos_})")
+
+    # Add edges
+    for token in doc:
+        if token.head is not token:  # Skip root
+            dot.edge(str(token.head.i), str(token.i), token.dep_)
+
+    return dot

+def get_graph_download_link(dot):
+    """
+    Generate a download link for the graph image
+    """
+    try:
+        # Create PDF in memory
+        pdf = dot.pipe(format='pdf')

+        # Encode to base64
+        b64 = base64.b64encode(pdf).decode()

+        href = f'<a href="data:application/pdf;base64,{b64}" download="syntax_tree.pdf">Download Syntax Tree (PDF)</a>'
+        return href
+    except Exception as e:
+        return f"Error generating download link: {str(e)}"
+
+def main():
+    # Set page to wide mode for better visualization
+    st.set_page_config(layout="wide")
+    st.markdown("<h1 style='text-align: center; color: white;'>English Clause Analyzer</h1>", unsafe_allow_html=True)
+    st.write("Enter an English sentence to analyze its clauses, their functions, and syntax tree.")
+
+    # Input text
+    text = st.text_area("Enter your sentence:", "When I arrived at the station, the train had already left.", height=100)
+
+    if st.button("Analyze"):
+        if text:
+            # Process the text
+            doc = nlp(text)

+            # Create two columns for layout
+            col1, col2 = st.columns(2)
+
+            with col1:
+                # Identify clauses
+                clauses = identify_clauses(doc)
+                st.subheader(f"Clauses Analysis")

+                # Convert clauses to DataFrame for better presentation
+                df_clauses = pd.DataFrame(clauses)
+                st.table(df_clauses.style.set_properties(**{
+                    'background-color': 'rgba(0,0,0,0.1)',
+                    'color': 'white'
+                }))

+                # Display clause functions
+                functions = analyze_clause_functions(doc)
+                st.subheader("Clause Functions")
+                df_functions = pd.DataFrame(functions)
+                st.table(df_functions.style.set_properties(**{
+                    'background-color': 'rgba(0,0,0,0.1)',
+                    'color': 'white'
+                }))

+            with col2:
+                # Display dependency visualization
+                st.subheader("Syntax Tree Visualization")
+                if not check_graphviz_installation():
+                    st.error("Graphviz is not installed. Please install it using:")
+                    st.code("sudo apt-get install graphviz")
+                    st.markdown("After installation, restart the application.")
+                else:
+                    dot = create_dependency_graph(doc)
+                    st.graphviz_chart(dot)
+
+                    # Add download button for the graph
+                    st.markdown(get_graph_download_link(dot), unsafe_allow_html=True)
+
+            # Display part-of-speech tags in a table
+            st.subheader("Part-of-Speech Analysis")
+            pos_data = [{"Word": token.text, "Part of Speech": token.pos_,
+                         "Description": spacy.explain(token.pos_)} for token in doc]
+            df_pos = pd.DataFrame(pos_data)
+            st.table(df_pos.style.set_properties(**{
+                'background-color': 'rgba(0,0,0,0.1)',
+                'color': 'white'
+            }))
+
+if __name__ == "__main__":
+    main()
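
The new clause logic can be exercised without launching Streamlit, since main() sits behind the __main__ guard and only the en_core_web_md load runs at import time. A quick sketch (hypothetical usage, assuming the model is downloaded and app.py is on the import path):

from app import nlp, identify_clauses, analyze_clause_functions

doc = nlp("When I arrived at the station, the train had already left.")
for clause in identify_clauses(doc):
    print(clause["Type"], "->", clause["Text"])
for fn in analyze_clause_functions(doc):
    print(fn["Type"], "->", fn["Function"])

# With the default example sentence this should yield an Independent Clause
# ("the train had already left") plus an Adverbial Clause ("When I arrived
# at the station"), though exact spans depend on the parser's output.
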
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
-streamlit
-
-
-
-
+streamlit
+nltk
+spacy
+matplotlib
+graphviz
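
Note that en_core_web_md, which app.py now loads at import time, is a spaCy model distributed separately from the spacy package, so this list alone is not enough to boot the app (nltk and matplotlib are listed but unused in the new code, while pandas is imported yet unlisted and typically arrives only as a dependency of streamlit). A hedged fallback sketch, not part of the commit:

import spacy

try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # Equivalent to: python -m spacy download en_core_web_md
    from spacy.cli import download
    download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")
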
utils.py
DELETED
@@ -1,133 +0,0 @@
-import io
-from cairosvg import svg2png
-from PIL import Image
-# import base64
-
-def get_entity_explanation(label):
-    """Return explanation for named entity labels"""
-    explanations = {
-        'PERSON': 'People, including fictional',
-        'NORP': 'Nationalities, religious or political groups',
-        'FAC': 'Buildings, airports, highways, bridges, etc.',
-        'ORG': 'Companies, agencies, institutions, etc.',
-        'GPE': 'Countries, cities, states',
-        'LOC': 'Non-GPE locations, mountain ranges, water bodies',
-        'PRODUCT': 'Objects, vehicles, foods, etc.',
-        'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
-        'WORK_OF_ART': 'Titles of books, songs, etc.',
-        'DATE': 'Absolute or relative dates or periods',
-        'TIME': 'Times smaller than a day',
-        'MONEY': 'Monetary values, including unit',
-        'QUANTITY': 'Measurements, as of weight or distance'
-    }
-    return explanations.get(label, 'Other type of entity')
-
-def analyze_text(nlp, text):
-    doc = nlp(text)
-
-    # Basic tokenization and POS analysis
-    tokens = [{
-        'Text': token.text,
-        'Lemma': token.lemma_,
-        'POS': token.pos_,
-        'Tag': token.tag_,
-        'Dependency': token.dep_,
-        'Shape': token.shape_,
-        'Is Alpha': token.is_alpha,
-        'Is Stop': token.is_stop
-    } for token in doc]
-
-    # Named Entity Recognition
-    entities = [{
-        'Text': ent.text,
-        'Label': ent.label_,
-        'Explanation': get_entity_explanation(ent.label_),
-        'Start': ent.start_char,
-        'End': ent.end_char
-    } for ent in doc.ents]
-
-    # Noun Chunks (phrases)
-    noun_chunks = [{
-        'Text': chunk.text,
-        'Root Text': chunk.root.text,
-        'Root Dep': chunk.root.dep_,
-        'Root Head Text': chunk.root.head.text
-    } for chunk in doc.noun_chunks]
-
-    # Text Statistics
-    stats = {
-        'Word Count': len([token for token in doc if not token.is_punct]),
-        'Sentence Count': len(list(doc.sents)),
-        'Average Words per Sentence': round(len([token for token in doc if not token.is_punct]) / len(list(doc.sents)), 2),
-        'Unique Words': len(set([token.text.lower() for token in doc if token.is_alpha])),
-        'Stop Words %': round(len([token for token in doc if token.is_stop]) / len(doc) * 100, 2)
-    }
-
-    return tokens, entities, noun_chunks, stats, doc
-
-def svg_to_png(svg_content, background_color='white'):
-    """Convert SVG to PNG with specified background color"""
-    # Split multiple SVGs if present
-    svg_parts = svg_content.split('<br><br>')
-    images = []
-
-    for svg in svg_parts:
-        # Add SVG namespace if missing
-        if not 'xmlns="http://www.w3.org/2000/svg"' in svg:
-            svg = svg.replace('<svg', '<svg xmlns="http://www.w3.org/2000/svg"')
-
-        try:
-            # Convert SVG to PNG bytes
-            png_bytes = svg2png(bytestring=svg.encode('utf-8'),
-                                background_color=background_color,
-                                scale=1)
-
-            # Create PIL Image from PNG bytes
-            img = Image.open(io.BytesIO(png_bytes))
-
-            # Convert RGBA to RGB with white background
-            if img.mode == 'RGBA':
-                background = Image.new('RGB', img.size, background_color)
-                background.paste(img, mask=img.split()[3])  # Use alpha channel as mask
-                img = background
-
-            # Add some padding
-            padding = 20  # pixels
-            img_with_padding = Image.new('RGB',
-                                         (img.width, img.height + padding * 2),
-                                         background_color)
-            img_with_padding.paste(img, (0, padding))
-            images.append(img_with_padding)
-
-        except Exception as e:
-            st.error(f"Error converting SVG to PNG: {str(e)}")
-            continue
-
-    if not images:
-        return None
-
-    # Combine images vertically if there are multiple
-    if len(images) > 1:
-        # Calculate total height and max width
-        total_height = sum(img.height for img in images)
-        max_width = max(img.width for img in images)
-
-        # Create new image to hold all sentences
-        combined = Image.new('RGB', (max_width, total_height), background_color)
-
-        # Paste each image
-        y_offset = 0
-        for img in images:
-            # Center image horizontally
-            x_offset = (max_width - img.width) // 2
-            combined.paste(img, (x_offset, y_offset))
-            y_offset += img.height
-    else:
-        combined = images[0]
-
-    # Convert to bytes for Streamlit
-    img_byte_arr = io.BytesIO()
-    combined.save(img_byte_arr, format='PNG')
-    img_byte_arr.seek(0)
-
-    return img_byte_arr.getvalue()
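
This deletion removes the displaCy-to-PNG pipeline entirely; note that svg_to_png referenced st.error without importing streamlit, so its except branch would have raised a NameError if it ever fired. For reference, the core conversion the helper wrapped was cairosvg's svg2png; a minimal standalone sketch (assumes cairosvg and Pillow, neither of which survives this commit's requirements.txt):

import io
from cairosvg import svg2png
from PIL import Image

svg = '<svg xmlns="http://www.w3.org/2000/svg" width="40" height="40"><circle cx="20" cy="20" r="15"/></svg>'
png_bytes = svg2png(bytestring=svg.encode('utf-8'), background_color='white', scale=1)
img = Image.open(io.BytesIO(png_bytes))
print(img.size, img.mode)  # expect (40, 40); mode depends on the source SVG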