Spaces:
Running
Running
import glob,json,unicodedata,re | |
# Basic search helper.
# Usage: download the transcript data, run searcher.py on it, then open the
# output HTML file in a browser; it contains quick links for listening to
# the results.
# Notes:
# - Linked segment times are very approximate.
# - Audio loading can be delayed; click a segment again until it plays.
# - Transcripts aren't tagged or parsed: do your own morphosyntax with regexes.
def get_segs(tx_path):
    """Load one transcript JSON file and return its segments as tuples.

    Args:
        tx_path: path to a transcript file whose top-level JSON object has a
            'segments' list.

    Returns:
        A list with one ``get_sent`` result per entry of 'segments', in file
        order.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # default, so non-ASCII transcript text loads the same on every system
    # (the example queries in this file use non-ASCII Icelandic words).
    with open(tx_path, 'r', encoding='utf-8') as handle:
        tx = json.load(handle)
    return [get_sent(sent) for sent in tx['segments']]
def get_sent(sent):
    """Convert one transcript segment into ``(start_ms, end_ms, text)``.

    Args:
        sent: segment dict; its keys are 'startTime', 'endTime', 'words' and
            'speakerId'. Times are strings of seconds (any 's' character is
            stripped before parsing, e.g. '1.5s'). Each item of 'words' has
            a 'word' string and, when used as a fallback, its own
            'startTime' / 'endTime'.

    Returns:
        Tuple of start time (int, ms), end time (int, ms) and the segment
        text (the words concatenated with no separator added).

    Raises:
        IndexError: if a segment-level time is unavailable and 'words' is
            empty, so there is no word-level time to fall back on.
    """
    def ms(t):  # time in ms
        # round() rather than int(): float('1.001') * 1000 evaluates to
        # 1000.9999999999999, which plain truncation would report as 1000.
        return round(float(t.replace('s', '')) * 1000)

    words = sent['words']
    text = ''.join(wd['word'] for wd in words)

    # Use the word-level times when the segment-level time is None or the
    # key is absent altogether.
    st = sent.get('startTime')
    if st is None:
        st = words[0]['startTime']
    et = sent.get('endTime')
    if et is None:
        et = words[-1]['endTime']
    return (ms(st), ms(et), text)
def html_line(match_line, url):
    """Render one search hit as an HTML paragraph linking to its segment.

    Args:
        match_line: ``(text, speaker, index)`` tuple describing the hit.
        url: address of the conversation page; ``#<index>`` is appended to
            it as the link fragment.

    Returns:
        A ``<p>...</p>`` string showing the speaker in parentheses, the index
        in brackets, and the text as the link label.
    """
    from html import escape  # local import: only this function needs it

    w, sk, ix = match_line
    # The text, speaker and URL are data, not markup: escape them so that
    # characters such as '<', '&' or '"' cannot break the generated page.
    href = escape(f'{url}#{ix}', quote=True)
    speaker = escape(str(sk), quote=False)
    index = escape(str(ix), quote=False)
    label = escape(str(w), quote=False)
    return f'<p>({speaker}) [{index}] <a href="{href}">{label}</a></p>'
def snorm(s):
    """Normalise text for searching.

    Drops every character whose Unicode category starts with 'P'
    (punctuation), lowercases the remaining characters one by one, and
    collapses runs of spaces into a single space. Leading and trailing
    spaces are not stripped.

    Args:
        s: the text to normalise.

    Returns:
        The normalised string.
    """
    kept = ''.join(
        c.lower() for c in s if not unicodedata.category(c).startswith('P')
    )
    # Removing punctuation can leave runs of spaces ("a , b" -> "a  b").
    # The two-space string is built explicitly: if it degrades to a single
    # space (easy to lose in copy/paste), the loop condition stays true
    # forever for any text containing a space.
    double_space = ' ' * 2
    while double_space in kept:
        kept = kept.replace(double_space, ' ')
    return kept
# A search function receives one conversation as a list of
# (text, speaker, turn_index) tuples plus the search string, and must
# return the matching entries as a list of tuples of that same shape.
def search_convos(corpus_dir, base_url, output_path, search_func, search_string=None):
    """Search every conversation of a corpus and write the hits to an HTML file.

    Every sub-directory ``<convo>`` of ``corpus_dir`` is read as one
    conversation from the two files
    ``speaker_a_convo_<convo>_transcript.json`` and
    ``speaker_b_convo_<convo>_transcript.json``. The segments of both
    speakers are merged, ordered by start time and numbered, then handed to
    ``search_func``.

    Args:
        corpus_dir: directory holding one sub-directory per conversation
            (with or without a trailing path separator).
        base_url: prefix of the conversation pages; ``<convo>.html`` is
            appended to form each link target.
        output_path: path of the HTML file to write (overwritten).
        search_func: called as ``search_func(segs, search_string)`` where
            ``segs`` is a list of ``(text, speaker, turn_index)`` tuples;
            each item it returns is rendered with ``html_line``.
        search_string: passed through unchanged to ``search_func``.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    import os  # local import: path handling is only needed here

    # Let os.path build and take apart the paths instead of splitting on '/',
    # so a missing trailing slash or a different separator does not matter.
    convo_dirs = glob.glob(os.path.join(corpus_dir, '*', ''))
    convos = sorted(os.path.basename(os.path.normpath(d)) for d in convo_dirs)

    # The file is written as UTF-8 below; declare that so browsers do not
    # have to guess the encoding of non-ASCII transcript text.
    parts = ['<meta charset="utf-8">\n']
    for convo in convos:
        convo_url = f'{base_url}{convo}.html'
        segs = []
        for speaker in ('a', 'b'):
            tx_path = os.path.join(
                corpus_dir, convo, f'speaker_{speaker}_convo_{convo}_transcript.json'
            )
            segs.extend((s, e, w, speaker) for s, e, w in get_segs(tx_path))
        # Stable sort: on equal start times speaker a stays ahead of speaker b.
        segs.sort(key=lambda seg: seg[0])
        # discard timestamps but add turn number
        segs = [(w, speaker, turn) for turn, (_s, _e, w, speaker) in enumerate(segs)]
        matches = search_func(segs, search_string)
        if matches:
            parts.append(f'<h4>{convo}</h4>')
            parts.append('\n'.join(html_line(m, convo_url) for m in matches))
            parts.append('<hr />')

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(''.join(parts))
def simple_search1(convo, search_string):
    """Plain substring search over normalised segment text.

    Both the search string and every segment text are passed through
    ``snorm`` before comparison.

    Args:
        convo: list of ``(text, speaker, turn_index)`` tuples.
        search_string: the text to look for.

    Returns:
        List of ``(normalised_text, speaker, turn_index)`` tuples for the
        segments whose normalised text contains the normalised search
        string. Note that the returned text is the normalised form, not the
        original.
    """
    needle = snorm(search_string)
    hits = []
    for text, speaker, turn in convo:
        normalised = snorm(text)
        if needle in normalised:
            hits.append((normalised, speaker, turn))
    return hits
def regex_search1(convo, search_rx):
    """Regular-expression search over normalised segment text.

    The pattern is applied to the ``snorm``-normalised text of each segment.

    Args:
        convo: list of ``(text, speaker, turn_index)`` tuples.
        search_rx: regular expression to search for.

    Returns:
        List of the original ``(text, speaker, turn_index)`` tuples (text not
        normalised) for the segments where the pattern matches at least once.
    """
    hits = []
    for text, speaker, turn in convo:
        found = re.findall(search_rx, snorm(text))
        if found:
            hits.append((text, speaker, turn))
    return hits
if __name__ == "__main__":
    # Input corpus, output page, and the site that the result links point to.
    corpus_dir = './full_conversations/'
    output_path = './tmp.html'
    base_url = 'https://clr-spjall.static.hf.space/pages/'

    # Choose the search function; uncomment the alternative to switch.
    #search_func = simple_search1
    search_func = regex_search1

    # Example queries; the last uncommented assignment is the one that runs.
    #search_string = 'kannski'
    #search_string = 'eða'
    #search_string = r'\Wá \w+ eða \w+'
    #search_string = r'\Wí \w+ eða \w+'
    #search_string = r'nei\S? \w+ \w+ (ekki|aldrei|ekkert)'#|enga|engu|eng\w\w)'
    #search_string = r'hvor\S* .* eða'
    #search_string = r'\Wef .* þá'
    search_string = r'^\w+ sem'

    search_convos(
        corpus_dir=corpus_dir,
        base_url=base_url,
        output_path=output_path,
        search_func=search_func,
        search_string=search_string,
    )