Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
| 3 |
import os
|
|
|
|
| 4 |
import markdownify
|
| 5 |
import fitz # PyMuPDF
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
@@ -32,6 +33,15 @@ def markdown_to_text(md_path):
|
|
| 32 |
with open(md_path, 'r') as file:
|
| 33 |
return file.read()
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# Function to split text into chunks
|
| 36 |
def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
|
| 37 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
|
@@ -91,9 +101,10 @@ def process_files(uploaded_files, use_example, chunk_size, chunk_overlap, min_ch
|
|
| 91 |
text = markdown_to_text(file_path)
|
| 92 |
else:
|
| 93 |
text = ""
|
| 94 |
-
|
| 95 |
markdown_text = markdownify.markdownify(text)
|
| 96 |
file_id = os.path.splitext(os.path.basename(file_path))[0]
|
|
|
|
| 97 |
markdown_path = os.path.join(output_dir, f"{file_id}.md")
|
| 98 |
with open(markdown_path, 'w') as file:
|
| 99 |
file.write(markdown_text)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import requests
|
| 3 |
import os
|
| 4 |
+
import re
|
| 5 |
import markdownify
|
| 6 |
import fitz # PyMuPDF
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 33 |
with open(md_path, 'r') as file:
|
| 34 |
return file.read()
|
| 35 |
|
| 36 |
+
def sanitize_key(filename):
    """Turn a file name into a short key of ASCII letters, digits and underscores.

    Spaces become underscores, every other character outside
    ``[a-zA-Z0-9_]`` is dropped, and the result is capped at 100 characters.

    If nothing survives the cleanup (empty input, or a name made up entirely
    of non-ASCII or punctuation characters), a deterministic ``file_<hash>``
    key derived from the original name is returned instead. This keeps the
    caller from building a path such as ``.md`` and stops unrelated names
    from all collapsing to the same empty key.

    Args:
        filename: The name to sanitize (e.g. a base name without extension).

    Returns:
        A non-empty string of at most 100 characters matching
        ``[a-zA-Z0-9_]+``.
    """
    import hashlib  # local import: only needed for the fallback key below

    max_length = 100

    # Replace spaces with underscores
    key = filename.replace(" ", "_")
    # Remove special characters except for underscores
    key = re.sub(r'[^a-zA-Z0-9_]', '', key)
    # Ensure the key is not too long
    key = key[:max_length]

    if not key:
        # Hash the *original* name so different inputs get different keys;
        # backslashreplace keeps encoding from failing on lone surrogates.
        digest = hashlib.sha1(
            filename.encode("utf-8", "backslashreplace")
        ).hexdigest()
        key = f"file_{digest[:12]}"
    return key
|
| 44 |
+
|
| 45 |
# Function to split text into chunks
|
| 46 |
def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
|
| 47 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
|
|
|
| 101 |
text = markdown_to_text(file_path)
|
| 102 |
else:
|
| 103 |
text = ""
|
| 104 |
+
|
| 105 |
markdown_text = markdownify.markdownify(text)
|
| 106 |
file_id = os.path.splitext(os.path.basename(file_path))[0]
|
| 107 |
+
file_id = sanitize_key(file_id)
|
| 108 |
markdown_path = os.path.join(output_dir, f"{file_id}.md")
|
| 109 |
with open(markdown_path, 'w') as file:
|
| 110 |
file.write(markdown_text)
|