Spaces:

gretelai
/

docs_to_synthetic_qa

Running

mvansegbroeck committed on Jun 7, 2024

Commit

1420dd0

verified ·

1 Parent(s): c2923d2

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import requests
 import os
 import markdownify
 import fitz  # PyMuPDF
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -32,6 +33,15 @@ def markdown_to_text(md_path):
     with open(md_path, 'r') as file:
         return file.read()
 # Function to split text into chunks
 def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
@@ -91,9 +101,10 @@ def process_files(uploaded_files, use_example, chunk_size, chunk_overlap, min_ch
             text = markdown_to_text(file_path)
         else:
             text = ""
         markdown_text = markdownify.markdownify(text)
         file_id = os.path.splitext(os.path.basename(file_path))[0]
         markdown_path = os.path.join(output_dir, f"{file_id}.md")
         with open(markdown_path, 'w') as file:
             file.write(markdown_text)

 import gradio as gr
 import requests
 import os
+import re
 import markdownify
 import fitz  # PyMuPDF
 from langchain.text_splitter import RecursiveCharacterTextSplitter
     with open(md_path, 'r') as file:
         return file.read()
def sanitize_key(filename):
    """Turn a file name into a key of ASCII letters, digits and underscores.

    Spaces become underscores, every other character outside
    ``[a-zA-Z0-9_]`` is dropped, and the result is truncated to at most
    100 characters.

    Args:
        filename: The name to sanitize.

    Returns:
        The sanitized key. When no character survives sanitization (an
        empty name, or one made only of punctuation / non-ASCII
        characters) the fallback ``"untitled"`` is returned, so the key is
        never an empty string.
    """
    # Replace spaces with underscores so word boundaries stay visible
    key = filename.replace(" ", "_")
    # Remove special characters except for underscores
    key = re.sub(r'[^a-zA-Z0-9_]', '', key)
    # Ensure the key is not too long
    key = key[:100]  # Truncate to 100 characters if necessary
    # An empty key would produce a path like "<dir>/.md"; use a placeholder
    return key or "untitled"
 # Function to split text into chunks
 def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             text = markdown_to_text(file_path)
         else:
             text = ""
         markdown_text = markdownify.markdownify(text)
         file_id = os.path.splitext(os.path.basename(file_path))[0]
+        file_id = sanitize_key(file_id)
         markdown_path = os.path.join(output_dir, f"{file_id}.md")
         with open(markdown_path, 'w') as file:
             file.write(markdown_text)