|
import gradio as gr |
|
import pandas as pd |
|
import re |
|
|
|
|
|
def load_data():
    """Download the RAVNlex lexicon and group its rows by lemma.

    The file is read as tab-separated text in ISO-8859-10 with every cell
    kept as a string. A row whose first cell is ``---`` closes the current
    group; the first row after such a separator (or at the start of the
    file) supplies the lemma under which that row and all following rows
    are stored.

    Returns:
        dict mapping each lemma to a list of per-row dicts with the keys
        ``word``, ``PPOS``, ``PHON1``, ``PHON2``, ``COMM`` and
        ``pronunciations`` (the cells after the fifth column).

    NOTE(review): ``read_csv`` is called without ``header=None``, so the
    first line of the file is consumed as column names - confirm the file
    really starts with a header row.
    NOTE(review): a lemma that opens more than one group gets a fresh list
    each time, so only the forms of its last group are kept.
    """
    source_url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv"
    frame = pd.read_csv(source_url, delimiter='\t', encoding='iso-8859-10', dtype=str)

    lemmas = {}
    current_lemma = None

    for row in frame.itertuples(index=False, name=None):
        if len(row) < 5:
            print(f"Skipping problematic line: {row}")
            continue

        # Anything that is not a string (i.e. a missing cell) becomes "".
        cells = [cell if isinstance(cell, str) else "" for cell in row]
        orto, ppos, phon1, phon2, comm = cells[:5]
        pronunciations = cells[5:]

        if orto == '---':
            # Separator row: the next data row starts a new lemma group.
            current_lemma = None
            continue

        word = orto.replace("ORTO:", "")
        if current_lemma is None:
            current_lemma = word
            lemmas[current_lemma] = []

        lemmas[current_lemma].append({
            'word': word,
            'PPOS': ppos.replace("PPOS:", ""),
            'PHON1': phon1.replace("PHON:", ""),
            'PHON2': phon2.replace("PHON:", ""),
            'COMM': comm,
            'pronunciations': pronunciations,
        })

    print("Loaded lemmas:", lemmas)
    return lemmas
|
|
|
# Lexicon shared by the request handler: search_lemma looks lemmas up in this
# dict. Built once at import time, which triggers the download in load_data.
lemmas = load_data()
|
|
|
def expand_ppos(ppos):
    """Expand bracketed alternatives in a PPOS tag into concrete tags.

    A ``[...]`` group lists single-character alternatives for one position,
    so ``"nc[mf]sn"`` becomes ``["ncmsn", "ncfsn"]``. Every group in the tag
    is expanded, giving all combinations with the left-most group varying
    slowest. Previously only the first group was expanded, which left later
    brackets in the result.

    Args:
        ppos: tag string, possibly containing ``[...]`` groups.

    Returns:
        list of expanded tag strings; ``[ppos]`` when there are no groups.
    """
    # With a capturing group, re.split alternates literal text (even
    # indices) and bracket contents (odd indices).
    parts = re.split(r'\[([^\]]+)\]', ppos)

    expanded = [""]
    for index, part in enumerate(parts):
        # Bracket contents contribute one choice per character; literal
        # text contributes itself as the only choice.
        choices = part if index % 2 else (part,)
        expanded = [prefix + choice for prefix in expanded for choice in choices]
    return expanded
|
|
|
def create_noun_table(lemma, forms):
    """Render the forms of a noun as an HTML declension table.

    Recognised tags have the shape ``nc<gender><number><case>==<article><tail>``
    with gender ``m``/``f``/``n``, number ``s``/``p``, case ``n``/``a``/``d``/``g``
    and article ``i``/``d``. Tags with case ``g`` end in ``ou``, all others
    in ``uu``. Number ``s`` fills the "Eintal" columns and ``p`` the
    "Fleirtal" columns; article ``i`` fills "Óbundið" and ``d`` "Bundið";
    the four rows follow the case order n, a, d, g.

    The key set is generated systematically. The earlier hand-written set
    spelled the feminine/neuter plural keys ``ncfnn``/``ncnnn``/``ncfna``/
    ``ncnna``, had no feminine/neuter plural dative keys, and made the
    dative plural cells fall back to the nominative plural keys.
    NOTE(review): the ``p`` number code for feminine/neuter is inferred from
    the masculine keys (``ncmpn`` ...) - confirm against the dataset's tags.

    Args:
        lemma: headword, used only in the diagnostic output.
        forms: iterable of dicts with at least ``'PPOS'`` and ``'word'``.

    Returns:
        HTML string containing the table; cells without a form are empty.
    """
    import html  # local import: escape lexicon text before embedding it in HTML

    genders = "mfn"
    numbers = "sp"
    articles = "id"
    # (case letter, tag tail after the article letter)
    cases = (("n", "uu"), ("a", "uu"), ("d", "uu"), ("g", "ou"))

    table_data = {
        f"nc{gender}{number}{case}=={article}{tail}": ''
        for case, tail in cases
        for number in numbers
        for gender in genders
        for article in articles
    }

    for form in forms:
        ppos = form['PPOS'].lower()
        word = form['word']
        print(f"Processing: word={word}, ppos={ppos}, key={ppos}")
        for key in expand_ppos(ppos):
            if key in table_data:
                table_data[key] = word
            else:
                print(f"Unmatched key: {key} for word: {word} with PPOS: {ppos}")

    print(f"Final table data for {lemma}: {table_data}")

    def cell(number, case, article, tail):
        # The first gender (m, then f, then n) with a recorded form wins.
        for gender in genders:
            word = table_data[f"nc{gender}{number}{case}=={article}{tail}"]
            if word:
                return html.escape(word)
        return ''

    rows = []
    for case, tail in cases:
        cells = "\n".join(
            f"      <td>{cell(number, case, article, tail)}</td>"
            for number in numbers
            for article in articles
        )
        rows.append(f"    <tr>\n{cells}\n    </tr>")
    body = "\n".join(rows)

    return f"""
<table border="1">
  <thead>
    <tr>
      <th colspan="2">Eintal</th>
      <th colspan="2">Fleirtal</th>
    </tr>
    <tr>
      <th>Óbundið</th>
      <th>Bundið</th>
      <th>Óbundið</th>
      <th>Bundið</th>
    </tr>
  </thead>
  <tbody>
{body}
  </tbody>
</table>
"""
|
|
|
def search_lemma(lemma):
    """Look up a lemma and return HTML describing its declension.

    Args:
        lemma: text entered by the user.

    Returns:
        The noun table from ``create_noun_table`` when the first stored form
        carries a noun tag, a fixed notice for any other word class, or a
        "No results found" message when the lemma is unknown.
    """
    import html  # local import: the query is echoed into HTML output

    results = lemmas.get(lemma)
    if not results and isinstance(lemma, str):
        # Tolerate stray whitespace around the typed query.
        results = lemmas.get(lemma.strip())

    if not results:
        # The query is user input rendered as HTML, so it must be escaped.
        return f"No results found for {html.escape(str(lemma))}"

    # create_noun_table only recognises tags beginning with "nc"; testing for
    # the prefix (instead of an "n" anywhere in the tag) keeps other word
    # classes from being rendered as an empty noun table.
    first_tag = results[0]['PPOS'].strip().lower()
    if first_tag.startswith('n'):
        return create_noun_table(lemma, results)

    return "Only noun tables are currently supported."
|
|
|
# Gradio UI: one text input passed to search_lemma, whose returned string is
# rendered as HTML.
# NOTE(review): the description mentions pronunciations, but search_lemma
# currently returns only the declension table.
iface = gr.Interface(
    fn=search_lemma,
    inputs="text",
    outputs="html",
    title="Lemma Search",
    description="Enter a lemma to search for its declensions and pronunciations."
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|
|