Spaces:

clr
/

spjall

Running

App Files Files Community

clr committed on May 14

Commit

b663aa2

•

1 Parent(s): 43243d3

offline text search

Browse files

Files changed (1) hide show

searcher.py +115 -0

searcher.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import glob,json,unicodedata,re
+# basic search helper:
+# download transcript data, run searcher.py on it,
+# open output html file in a browser,
+# it contains quick links to listen to the results.
+# notes:
+# linked segment times are very approximate.
+# audio loading can be delayed, click segments again until they play.
+# transcripts aren't tagged or parsed, DIY morphosyntax by regex.
def get_segs(tx_path):
    """Load one transcript file and convert each of its segments.

    Args:
        tx_path: path to a transcript JSON file whose top-level
            'segments' entry is a list of segment dicts.

    Returns:
        A list with one get_sent() result per segment, in file order.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's
    # default encoding, which can garble or reject non-ASCII text.
    with open(tx_path, 'r', encoding='utf-8') as handle:
        tx = json.load(handle)
    return [get_sent(sent) for sent in tx['segments']]
def get_sent(sent):
    """Convert one transcript segment to a (start_ms, end_ms, text) tuple.

    Args:
        sent: segment dict with keys 'startTime', 'endTime', 'words'
            and 'speakerId'. Times are strings of seconds with an 's'
            suffix. 'words' is a list of dicts, each with a 'word'
            string and its own 'startTime'/'endTime', which are only
            read as a fallback.

    Returns:
        (start_ms, end_ms, text): start and end in integer
        milliseconds, and the segment's words concatenated with no
        separator added.

    Raises:
        ValueError: a segment time is missing or None and the segment
            has no words to take it from.
    """
    def ms(t):  # time string -> integer milliseconds
        # round() instead of int(): float('1.001') * 1000 evaluates to
        # 1000.9999999999999, which truncation would report as 1000.
        return round(float(t.replace('s', '')) * 1000)

    words = sent['words']
    text = ''.join(wd['word'] for wd in words)
    # .get() so that an absent key is treated like an explicit None.
    st = sent.get('startTime')
    et = sent.get('endTime')
    if st is None or et is None:
        if not words:
            raise ValueError('segment has neither time stamps nor words')
        # Fall back to the outermost word-level time stamps.
        if st is None:
            st = words[0]['startTime']
        if et is None:
            et = words[-1]['endTime']
    return (ms(st), ms(et), text)
def html_line(match_line, url):
    """Render one search hit as an HTML paragraph linking to its turn.

    Args:
        match_line: (text, speaker, turn_index) tuple.
        url: page URL; '#<turn_index>' is appended as the link fragment.

    Returns:
        A '<p>...</p>' string showing the speaker and turn index,
        followed by the text as a link.
    """
    # Imported here so the block does not need a new module-level import.
    from html import escape

    w, sk, ix = match_line
    # Escape everything that is interpolated so that characters such as
    # '<', '&' or '"' in the data cannot break the generated markup.
    href = escape(f'{url}#{ix}')
    return (
        f'<p>({escape(str(sk))}) [{escape(str(ix))}] '
        f'<a href="{href}">{escape(str(w))}</a></p>'
    )
def snorm(s):
    """Normalise a string for matching.

    Drops every character whose Unicode category is a punctuation class
    (category starting with 'P'), lower-cases the remaining characters
    one by one, and squeezes each run of spaces down to a single space.
    Other whitespace, and leading/trailing spaces, are left in place.
    """
    kept = []
    for ch in s:
        if unicodedata.category(ch)[0] != 'P':
            kept.append(ch.lower())
    # Only the space character is collapsed, not tabs or newlines.
    return re.sub(' {2,}', ' ', ''.join(kept))
+# the search function must operate on the conversation
+# and return the results in expected format
def search_convos(corpus_dir, base_url, output_path, search_func, search_string=None):
    """Search every conversation in a corpus and write the hits as HTML.

    Each sub-directory <convo> of corpus_dir is read as one conversation
    from the files speaker_a_convo_<convo>_transcript.json and
    speaker_b_convo_<convo>_transcript.json. The two speakers' segments
    are merged in start-time order and numbered from 0 before being
    handed to search_func.

    Args:
        corpus_dir: directory holding one sub-directory per
            conversation; a trailing slash is optional.
        base_url: link prefix; hits link to f'{base_url}{convo}.html'.
        output_path: HTML file to write (overwritten if present).
        search_func: callable(segs, search_string), where segs is a
            list of (text, speaker, turn_index) tuples with speaker
            'a' or 'b'. It returns the matching tuples in that same
            shape, or something falsy when there are none.
        search_string: passed through unchanged to search_func.
    """
    # Imported here so the block does not need a new module-level import.
    import os

    # A pattern ending in a path separator makes glob match directories
    # only; the conversation name is the last path component.
    convo_dirs = glob.glob(os.path.join(corpus_dir, '*', ''))
    convos = sorted(os.path.basename(os.path.dirname(d)) for d in convo_dirs)

    # Declare the charset so a browser opening the local file decodes
    # non-ASCII transcript text the same way it is written below.
    parts = ['<meta charset="utf-8">\n']
    for convo in convos:
        convo_url = f'{base_url}{convo}.html'
        txa = os.path.join(corpus_dir, convo, f'speaker_a_convo_{convo}_transcript.json')
        txb = os.path.join(corpus_dir, convo, f'speaker_b_convo_{convo}_transcript.json')
        segs = [(s, e, w, 'a') for s, e, w in get_segs(txa)]
        segs += [(s, e, w, 'b') for s, e, w in get_segs(txb)]
        # Stable sort: on equal start times speaker a stays ahead of b.
        segs.sort(key=lambda seg: seg[0])
        # discard timestamps but add turn number
        turns = [(w, sk, n) for n, (_s, _e, w, sk) in enumerate(segs)]
        matches = search_func(turns, search_string)
        if matches:
            parts.append(f'<h4>{convo}</h4>')
            parts.append('\n'.join(html_line(m, convo_url) for m in matches))
            parts.append('<hr />')
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(''.join(parts))
def simple_search1(convo, search_string):
    """Substring search over a conversation, ignoring case and punctuation.

    Both the query and each segment's text are passed through snorm()
    before the containment test.

    Args:
        convo: list of (text, speaker, turn_index) tuples.
        search_string: plain text to look for.

    Returns:
        The matching (text, speaker, turn_index) tuples in input order.
        The text in each returned tuple is the normalised form, not the
        original segment text.
    """
    needle = snorm(search_string)
    hits = []
    for text, speaker, turn in convo:
        cleaned = snorm(text)
        if needle in cleaned:
            hits.append((cleaned, speaker, turn))
    return hits
def regex_search1(convo, search_rx):
    """Regex search over a conversation's normalised segment text.

    Args:
        convo: list of (text, speaker, turn_index) tuples.
        search_rx: regular expression (string or compiled pattern). It
            is matched against snorm(text), i.e. lower-cased text with
            punctuation removed.

    Returns:
        The (text, speaker, turn_index) tuples whose normalised text
        contains a match, in input order, with the original text.
    """
    # Compile once up front; search() stops at the first match, which
    # is all the filter needs.
    pattern = re.compile(search_rx)
    return [(w, sk, ln) for w, sk, ln in convo if pattern.search(snorm(w))]
if __name__ == "__main__":
    # Where the transcripts live, where result links point, and where
    # the result page is written.
    corpus_dir = './full_conversations/'
    base_url = 'https://clr-spjall.static.hf.space/pages/'
    output_path = './tmp.html'

    # Search function: regex_search1 here; simple_search1 is the
    # plain-substring alternative.
    #search_func = simple_search1
    search_func = regex_search1

    # Query. Other queries that have been tried:
    #search_string = 'kannski'
    #search_string = 'eða'
    #search_string = r'\Wá \w+ eða \w+'
    #search_string = r'\Wí \w+ eða \w+'
    #search_string = r'nei\S? \w+ \w+ (ekki|aldrei|ekkert)'#|enga|engu|eng\w\w)'
    #search_string = r'hvor\S* .* eða'
    #search_string = r'\Wef .* þá'
    search_string = r'^\w+ sem'

    search_convos(
        corpus_dir,
        base_url,
        output_path,
        search_func,
        search_string,
    )