SakibAhmed commited on
Commit
7ea8f5a
·
verified ·
1 Parent(s): a590847

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +20 -0
  2. README.md +12 -11
  3. app.py +549 -0
  4. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dockerfile
FROM python:3.11

WORKDIR /app

# Install dependencies before copying the sources so this layer is reused
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY . .

# Run the application as an unprivileged user.
RUN useradd -m -u 1000 user
USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

EXPOSE 7860

# Start exactly one server process. The previous shell-form command launched
# `flask run` in the background and then `python app.py`, both binding port
# 7860, so the two servers competed for the same socket.
CMD ["python", "-m", "flask", "--app", "app", "run", "--host=0.0.0.0", "--port=7860"]
README.md CHANGED
@@ -1,11 +1,12 @@
1
- ---
2
- title: Website Scraper And HTML Cleaner API
3
- emoji: 🐢
4
- colorFrom: green
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- short_description: Website-Scraper-and-HTML-Cleaner-API
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ ---
2
+ title: Website Scraper And HTML Cleaner API
3
+ emoji: 🐠
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: Website-Scraper-and-HTML-Cleaner-API
9
+ ---
10
+ # Website-Scraper-and-HTML-Cleaner-API
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, render_template_string
2
+ from flask_cors import CORS
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+
6
+ app = Flask(__name__)
7
+ CORS(app)
8
+
9
+ HTML_TEMPLATE = """
10
+ <!DOCTYPE html>
11
+ <html lang="en">
12
+ <head>
13
+ <meta charset="UTF-8">
14
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
15
+ <title>HTML Cleaner</title>
16
+ <style>
17
+ * {
18
+ margin: 0;
19
+ padding: 0;
20
+ box-sizing: border-box;
21
+ }
22
+
23
+ body {
24
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
25
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
26
+ min-height: 100vh;
27
+ padding: 20px;
28
+ }
29
+
30
+ .container {
31
+ max-width: 1400px;
32
+ margin: 0 auto;
33
+ }
34
+
35
+ .header {
36
+ text-align: center;
37
+ color: white;
38
+ margin-bottom: 30px;
39
+ }
40
+
41
+ .header h1 {
42
+ font-size: 2.5em;
43
+ margin-bottom: 10px;
44
+ text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
45
+ }
46
+
47
+ .header p {
48
+ font-size: 1.1em;
49
+ opacity: 0.9;
50
+ }
51
+
52
+ .main-content {
53
+ display: grid;
54
+ grid-template-columns: 1fr 1fr;
55
+ gap: 20px;
56
+ margin-bottom: 20px;
57
+ }
58
+
59
+ .panel {
60
+ background: white;
61
+ border-radius: 12px;
62
+ padding: 25px;
63
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
64
+ }
65
+
66
+ .panel h2 {
67
+ color: #667eea;
68
+ margin-bottom: 15px;
69
+ font-size: 1.5em;
70
+ display: flex;
71
+ align-items: center;
72
+ gap: 10px;
73
+ }
74
+
75
+ .icon {
76
+ font-size: 1.2em;
77
+ }
78
+
79
+ textarea {
80
+ width: 100%;
81
+ height: 400px;
82
+ padding: 15px;
83
+ border: 2px solid #e0e0e0;
84
+ border-radius: 8px;
85
+ font-family: 'Courier New', monospace;
86
+ font-size: 14px;
87
+ resize: vertical;
88
+ transition: border-color 0.3s;
89
+ }
90
+
91
+ textarea:focus {
92
+ outline: none;
93
+ border-color: #667eea;
94
+ }
95
+
96
+ .button-group {
97
+ display: flex;
98
+ gap: 10px;
99
+ margin-top: 15px;
100
+ flex-wrap: wrap;
101
+ }
102
+
103
+ button {
104
+ padding: 12px 24px;
105
+ border: none;
106
+ border-radius: 8px;
107
+ font-size: 16px;
108
+ font-weight: 600;
109
+ cursor: pointer;
110
+ transition: all 0.3s;
111
+ display: flex;
112
+ align-items: center;
113
+ gap: 8px;
114
+ }
115
+
116
+ .btn-primary {
117
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
118
+ color: white;
119
+ flex: 1;
120
+ }
121
+
122
+ .btn-primary:hover {
123
+ transform: translateY(-2px);
124
+ box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
125
+ }
126
+
127
+ .btn-secondary {
128
+ background: #f5f5f5;
129
+ color: #333;
130
+ }
131
+
132
+ .btn-secondary:hover {
133
+ background: #e0e0e0;
134
+ }
135
+
136
+ .btn-success {
137
+ background: #10b981;
138
+ color: white;
139
+ }
140
+
141
+ .btn-success:hover {
142
+ background: #059669;
143
+ }
144
+
145
+ .stats {
146
+ display: grid;
147
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
148
+ gap: 15px;
149
+ margin-top: 15px;
150
+ }
151
+
152
+ .stat-card {
153
+ background: linear-gradient(135deg, #667eea15 0%, #764ba215 100%);
154
+ padding: 15px;
155
+ border-radius: 8px;
156
+ border-left: 4px solid #667eea;
157
+ }
158
+
159
+ .stat-label {
160
+ font-size: 12px;
161
+ color: #666;
162
+ text-transform: uppercase;
163
+ letter-spacing: 1px;
164
+ margin-bottom: 5px;
165
+ }
166
+
167
+ .stat-value {
168
+ font-size: 24px;
169
+ font-weight: bold;
170
+ color: #333;
171
+ }
172
+
173
+ .status {
174
+ padding: 10px 15px;
175
+ border-radius: 8px;
176
+ margin-top: 15px;
177
+ display: none;
178
+ animation: slideIn 0.3s;
179
+ }
180
+
181
+ @keyframes slideIn {
182
+ from {
183
+ opacity: 0;
184
+ transform: translateY(-10px);
185
+ }
186
+ to {
187
+ opacity: 1;
188
+ transform: translateY(0);
189
+ }
190
+ }
191
+
192
+ .status.success {
193
+ background: #d1fae5;
194
+ color: #065f46;
195
+ border-left: 4px solid #10b981;
196
+ }
197
+
198
+ .status.error {
199
+ background: #fee2e2;
200
+ color: #991b1b;
201
+ border-left: 4px solid #ef4444;
202
+ }
203
+
204
+ .loading {
205
+ display: inline-block;
206
+ width: 16px;
207
+ height: 16px;
208
+ border: 3px solid #ffffff;
209
+ border-radius: 50%;
210
+ border-top-color: transparent;
211
+ animation: spin 1s linear infinite;
212
+ }
213
+
214
+ @keyframes spin {
215
+ to { transform: rotate(360deg); }
216
+ }
217
+
218
+ .example-section {
219
+ background: white;
220
+ border-radius: 12px;
221
+ padding: 25px;
222
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
223
+ margin-top: 20px;
224
+ }
225
+
226
+ .example-section h3 {
227
+ color: #667eea;
228
+ margin-bottom: 15px;
229
+ }
230
+
231
+ .example-buttons {
232
+ display: flex;
233
+ gap: 10px;
234
+ flex-wrap: wrap;
235
+ }
236
+
237
+ .example-btn {
238
+ padding: 8px 16px;
239
+ background: #f0f0f0;
240
+ border: 2px solid #667eea;
241
+ color: #667eea;
242
+ border-radius: 6px;
243
+ cursor: pointer;
244
+ transition: all 0.3s;
245
+ }
246
+
247
+ .example-btn:hover {
248
+ background: #667eea;
249
+ color: white;
250
+ }
251
+
252
+ @media (max-width: 768px) {
253
+ .main-content {
254
+ grid-template-columns: 1fr;
255
+ }
256
+
257
+ .header h1 {
258
+ font-size: 1.8em;
259
+ }
260
+ }
261
+ </style>
262
+ </head>
263
+ <body>
264
+ <div class="container">
265
+ <div class="header">
266
+ <h1>🧹 HTML Cleaner</h1>
267
+ <p>Remove HTML tags while preserving visible text and links</p>
268
+ </div>
269
+
270
+ <div class="main-content">
271
+ <div class="panel">
272
+ <h2><span class="icon">📝</span> Input HTML</h2>
273
+ <textarea id="inputHtml" placeholder="Paste your HTML code here..."></textarea>
274
+ <div class="button-group">
275
+ <button class="btn-primary" onclick="cleanHtml()">
276
+ <span id="cleanBtnText">🧹 Clean HTML</span>
277
+ <span id="cleanBtnLoader" class="loading" style="display: none;"></span>
278
+ </button>
279
+ <button class="btn-secondary" onclick="clearInput()">🗑️ Clear</button>
280
+ </div>
281
+ <div class="stats">
282
+ <div class="stat-card">
283
+ <div class="stat-label">Characters</div>
284
+ <div class="stat-value" id="inputChars">0</div>
285
+ </div>
286
+ <div class="stat-card">
287
+ <div class="stat-label">Words</div>
288
+ <div class="stat-value" id="inputWords">0</div>
289
+ </div>
290
+ </div>
291
+ </div>
292
+
293
+ <div class="panel">
294
+ <h2><span class="icon">✨</span> Cleaned Output</h2>
295
+ <textarea id="outputText" placeholder="Cleaned text will appear here..." readonly></textarea>
296
+ <div class="button-group">
297
+ <button class="btn-success" onclick="copyOutput()">📋 Copy to Clipboard</button>
298
+ <button class="btn-secondary" onclick="downloadOutput()">💾 Download</button>
299
+ </div>
300
+ <div class="stats">
301
+ <div class="stat-card">
302
+ <div class="stat-label">Characters</div>
303
+ <div class="stat-value" id="outputChars">0</div>
304
+ </div>
305
+ <div class="stat-card">
306
+ <div class="stat-label">Words</div>
307
+ <div class="stat-value" id="outputWords">0</div>
308
+ </div>
309
+ </div>
310
+ <div id="status" class="status"></div>
311
+ </div>
312
+ </div>
313
+
314
+ <div class="example-section">
315
+ <h3>📚 Quick Examples</h3>
316
+ <div class="example-buttons">
317
+ <button class="example-btn" onclick="loadExample(1)">Simple Paragraph</button>
318
+ <button class="example-btn" onclick="loadExample(2)">With Links</button>
319
+ <button class="example-btn" onclick="loadExample(3)">Complex HTML</button>
320
+ <button class="example-btn" onclick="loadExample(4)">Article with Images</button>
321
+ </div>
322
+ </div>
323
+ </div>
324
+
325
+ <script>
326
+ const examples = {
327
+ 1: '<div><h1>Welcome</h1><p>This is a <strong>simple</strong> example.</p></div>',
328
+ 2: '<p>Visit <a href="https://example.com">our website</a> for more info. Contact us at <a href="mailto:info@example.com">info@example.com</a></p>',
329
+ 3: '<div class="container"><header><nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav></header><main><article><h2>Article Title</h2><p>Some content here with <span style="color:red;">styled text</span>.</p></article></main></div>',
330
+ 4: '<article><h1>Travel Guide</h1><img src="photo.jpg" alt="Beach photo"><p>Check out this <a href="https://beach.com">amazing beach</a>!</p><img src="sunset.jpg" alt="Sunset view"><p>Beautiful sunsets every evening.</p></article>'
331
+ };
332
+
333
+ const inputHtml = document.getElementById('inputHtml');
334
+ const outputText = document.getElementById('outputText');
335
+
336
+ inputHtml.addEventListener('input', updateInputStats);
337
+
338
+ function updateInputStats() {
339
+ const text = inputHtml.value;
340
+ document.getElementById('inputChars').textContent = text.length;
341
+ document.getElementById('inputWords').textContent = text.trim() ? text.trim().split(/\s+/).length : 0;
342
+ }
343
+
344
+ function updateOutputStats() {
345
+ const text = outputText.value;
346
+ document.getElementById('outputChars').textContent = text.length;
347
+ document.getElementById('outputWords').textContent = text.trim() ? text.trim().split(/\s+/).length : 0;
348
+ }
349
+
350
+ async function cleanHtml() {
351
+ const html = inputHtml.value.trim();
352
+
353
+ if (!html) {
354
+ showStatus('Please enter some HTML to clean', 'error');
355
+ return;
356
+ }
357
+
358
+ const cleanBtn = document.querySelector('.btn-primary');
359
+ const btnText = document.getElementById('cleanBtnText');
360
+ const btnLoader = document.getElementById('cleanBtnLoader');
361
+
362
+ btnText.style.display = 'none';
363
+ btnLoader.style.display = 'inline-block';
364
+ cleanBtn.disabled = true;
365
+
366
+ try {
367
+ const response = await fetch('/api/clean', {
368
+ method: 'POST',
369
+ headers: {
370
+ 'Content-Type': 'application/json',
371
+ },
372
+ body: JSON.stringify({ html: html })
373
+ });
374
+
375
+ const data = await response.json();
376
+
377
+ if (data.success) {
378
+ outputText.value = data.cleaned_text;
379
+ updateOutputStats();
380
+ showStatus('HTML cleaned successfully!', 'success');
381
+ } else {
382
+ showStatus('Error: ' + data.error, 'error');
383
+ }
384
+ } catch (error) {
385
+ showStatus('Error: ' + error.message, 'error');
386
+ } finally {
387
+ btnText.style.display = 'inline';
388
+ btnLoader.style.display = 'none';
389
+ cleanBtn.disabled = false;
390
+ }
391
+ }
392
+
393
+ function clearInput() {
394
+ inputHtml.value = '';
395
+ outputText.value = '';
396
+ updateInputStats();
397
+ updateOutputStats();
398
+ hideStatus();
399
+ }
400
+
401
+ function copyOutput() {
402
+ if (!outputText.value) {
403
+ showStatus('Nothing to copy', 'error');
404
+ return;
405
+ }
406
+
407
+ outputText.select();
408
+ document.execCommand('copy');
409
+ showStatus('Copied to clipboard!', 'success');
410
+ }
411
+
412
+ function downloadOutput() {
413
+ if (!outputText.value) {
414
+ showStatus('Nothing to download', 'error');
415
+ return;
416
+ }
417
+
418
+ const blob = new Blob([outputText.value], { type: 'text/plain' });
419
+ const url = window.URL.createObjectURL(blob);
420
+ const a = document.createElement('a');
421
+ a.href = url;
422
+ a.download = 'cleaned_text.txt';
423
+ a.click();
424
+ window.URL.revokeObjectURL(url);
425
+ showStatus('Downloaded successfully!', 'success');
426
+ }
427
+
428
+ function loadExample(num) {
429
+ inputHtml.value = examples[num];
430
+ updateInputStats();
431
+ outputText.value = '';
432
+ updateOutputStats();
433
+ hideStatus();
434
+ }
435
+
436
// Handle of the pending auto-hide timer, kept so a newer message can cancel it.
let statusHideTimer = null;

/**
 * Show a status message in the #status element and hide it again after 5 s.
 *
 * @param {string} message - Text to display.
 * @param {string} type - CSS modifier class appended to "status"
 *                        (callers pass 'success' or 'error').
 */
function showStatus(message, type) {
    const status = document.getElementById('status');
    status.textContent = message;
    status.className = 'status ' + type;
    status.style.display = 'block';

    // Cancel the timer of any earlier message; otherwise it would fire
    // while this newer message is showing and hide it too early.
    if (statusHideTimer !== null) {
        clearTimeout(statusHideTimer);
    }

    statusHideTimer = setTimeout(() => {
        statusHideTimer = null;
        hideStatus();
    }, 5000);
}
446
+
447
+ function hideStatus() {
448
+ const status = document.getElementById('status');
449
+ status.style.display = 'none';
450
+ }
451
+ </script>
452
+ </body>
453
+ </html>
454
+ """
455
+
456
def clean_html(html_content):
    """
    Convert HTML into plain text, keeping visible text, links and image hints.

    Links are rendered as ``text (href)``, images as ``[Image: alt]`` (or
    ``[Image: src]`` when there is no alt text), ``<br>`` as a newline, and
    block-level elements (p, div, h1-h6, li) are placed on their own lines.
    ``script``, ``style``, ``meta``, ``link`` and ``head`` elements are
    removed, and comments, doctypes, declarations and processing
    instructions are skipped because a browser never displays them.

    Args:
        html_content: HTML markup to clean; handed to BeautifulSoup.

    Returns:
        The cleaned text, with runs of spaces collapsed, spaces at line
        edges trimmed, and consecutive blank lines reduced to one.

    Raises:
        Exception: If parsing or extraction fails. The message is prefixed
            with "Error cleaning HTML: " and the original error is chained.
    """
    # Imported locally so this function does not depend on any change to the
    # module-level import list.
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    # String-like nodes whose ``name`` is None but which are not visible text.
    non_visible = (Comment, Declaration, Doctype, ProcessingInstruction)
    block_tags = ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li')

    def extract_text(element):
        """Return the list of text fragments found under *element*."""
        result = []

        for content in element.children:
            if content.name is None:  # String-like node
                if isinstance(content, non_visible):
                    # e.g. "<!-- note -->" or "<!DOCTYPE html>": not page text
                    continue
                text = str(content).strip()
                if text:
                    result.append(text)
            elif content.name == 'a':  # Link element
                link_text = content.get_text().strip()
                href = content.get('href', '')
                if link_text and href:
                    result.append(f"{link_text} ({href})")
                elif link_text:
                    result.append(link_text)
            elif content.name == 'br':  # Line break
                result.append('\n')
            elif content.name in block_tags:
                # Block elements - surround their text with newlines
                inner = extract_text(content)
                if inner:
                    result.append('\n' + ' '.join(inner) + '\n')
            elif content.name == 'img':  # Image: prefer alt text over src
                alt = content.get('alt', '')
                src = content.get('src', '')
                if alt:
                    result.append(f"[Image: {alt}]")
                elif src:
                    result.append(f"[Image: {src}]")
            else:
                # Inline/unknown elements: recurse and splice fragments in
                result.extend(extract_text(content))

        return result

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove elements that never contribute visible text
        for hidden in soup(["script", "style", "meta", "link", "head"]):
            hidden.decompose()

        cleaned_text = ' '.join(extract_text(soup))

        # Joining fragments with spaces leaves " \n " sequences; trim the
        # spaces so lines do not start or end with stray whitespace.
        cleaned_text = re.sub(r'[ \t]*\n[ \t]*', '\n', cleaned_text)
        cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)  # Remove extra newlines
        cleaned_text = re.sub(r' +', ' ', cleaned_text)  # Remove extra spaces

        return cleaned_text.strip()

    except Exception as e:
        # Same exception type and message as before; chaining keeps the
        # original traceback available for debugging.
        raise Exception(f"Error cleaning HTML: {str(e)}") from e
516
+
517
@app.route('/')
def index():
    """Serve the HTML Cleaner page rendered from HTML_TEMPLATE."""
    page = render_template_string(HTML_TEMPLATE)
    return page
520
+
521
@app.route('/api/clean', methods=['POST'])
def clean_html_api():
    """
    Clean the HTML sent in a JSON request body.

    Expects a JSON object with an ``html`` string, e.g. ``{"html": "<p>x</p>"}``.

    Returns:
        200 with ``success``, ``cleaned_text``, ``original_length`` and
        ``cleaned_length`` on success;
        400 with ``success: False`` and ``error`` when the body is not a JSON
        object, lacks ``html``, or ``html`` is not a string;
        500 with ``success: False`` and ``error`` when cleaning itself fails.
    """
    # silent=True makes a missing/incorrect content type or malformed JSON
    # return None instead of raising, so client mistakes are answered with
    # 400 below rather than being reported as a 500 server error.
    data = request.get_json(silent=True)

    # isinstance guards against JSON arrays/strings, where "'html' in data"
    # could be true but data['html'] would raise.
    if not isinstance(data, dict) or 'html' not in data:
        return jsonify({
            'success': False,
            'error': 'No HTML content provided'
        }), 400

    html_content = data['html']
    if not isinstance(html_content, str):
        return jsonify({
            'success': False,
            'error': 'HTML content must be a string'
        }), 400

    try:
        cleaned_text = clean_html(html_content)
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

    return jsonify({
        'success': True,
        'cleaned_text': cleaned_text,
        'original_length': len(html_content),
        'cleaned_length': len(cleaned_text)
    })
547
+
548
if __name__ == '__main__':
    import os

    # The Werkzeug debugger allows arbitrary code execution from the browser,
    # so it must not be enabled unconditionally on a server bound to 0.0.0.0.
    # Opt in explicitly with FLASK_DEBUG=1 (or "true") for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true')
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ beautifulsoup4==4.14.2
2
+ Flask==3.1.2
3
+ Flask_Cors==5.0.0