Update app.py
app.py CHANGED
@@ -2,10 +2,18 @@ import gradio as gr
 from datasets import load_from_disk
 from pyserini.search.lucene import LuceneSearcher
 from pyserini.analysis import JWhiteSpaceAnalyzer
+from itertools import chain
+from nltk.util import everygrams
 
 searcher = LuceneSearcher("index")
 searcher.set_analyzer(JWhiteSpaceAnalyzer())
 
+def tokenize_word(word, min_len=2, max_len=4):
+    return [''.join(ngram) for ngram in list(everygrams(word, min_len=min_len, max_len=max_len))]
+
+def tokenize_sentence(sentence, min_len=2, max_len=4):
+    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))
+
 ds = load_from_disk("data")
 NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS.
 RESULTS_PER_PAGE = 5
@@ -23,6 +31,7 @@ def format_results(results):
     return "\n".join([result_html(result, meta) for result,meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])])
 
 def page_0(query):
+    query = tokenize_sentence(query)
     hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
     ix = [int(hit.docid) for hit in hits]
     results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True) # no need to shard. split ix in batches instead. (would make sense if results was cacheable)
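What the new helpers produce, as a minimal standalone sketch (it only assumes NLTK is installed; the ordering of the n-grams can differ between NLTK versions, so the printed strings are illustrative):

from itertools import chain
from nltk.util import everygrams

def tokenize_word(word, min_len=2, max_len=4):
    # character n-grams of length min_len..max_len, e.g. "fox" -> "fo", "ox", "fox"
    return [''.join(ngram) for ngram in everygrams(word, min_len=min_len, max_len=max_len)]

def tokenize_sentence(sentence, min_len=2, max_len=4):
    # whitespace-split the sentence, n-gram each word, and re-join with spaces
    # so the whitespace analyzer sees one n-gram per query token
    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))

print(sorted(tokenize_word("fox")))    # ['fo', 'fox', 'ox']
print(tokenize_sentence("the fox"))    # e.g. "th he the fo ox fox" (ordering varies by NLTK version)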