import gradio as gr import pandas as pd import re # Load and parse the CSV file from Hugging Face def load_data(): url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv" df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str) lemmas = {} current_lemma = None for row in df.itertuples(index=False, name=None): if len(row) < 5: print(f"Skipping problematic line: {row}") continue orto, ppos, phon1, phon2, comm, *pronunciations = map(lambda x: x if isinstance(x, str) else "", row) if orto == '---': current_lemma = None elif current_lemma is None: current_lemma = orto.replace("ORTO:", "") lemmas[current_lemma] = [] lemma_data = { 'word': current_lemma, 'PPOS': ppos.replace("PPOS:", "") if ppos else "", 'PHON1': phon1.replace("PHON:", "") if phon1 else "", 'PHON2': phon2.replace("PHON:", "") if phon2 else "", 'COMM': comm if comm else "", 'pronunciations': pronunciations } lemmas[current_lemma].append(lemma_data) else: lemma_data = { 'word': orto.replace("ORTO:", "") if orto else "", 'PPOS': ppos.replace("PPOS:", "") if ppos else "", 'PHON1': phon1.replace("PHON:", "") if phon1 else "", 'PHON2': phon2.replace("PHON:", "") if phon2 else "", 'COMM': comm if comm else "", 'pronunciations': pronunciations } lemmas[current_lemma].append(lemma_data) print("Loaded lemmas:", lemmas) # Debugging output return lemmas lemmas = load_data() def expand_ppos(ppos): matches = re.findall(r'\[([^\]]+)\]', ppos) if matches: expanded = [] for match in matches[0]: expanded.append(ppos.replace(f'[{matches[0]}]', match)) return expanded else: return [ppos] def create_noun_table(lemma, forms): table_data = { 'ncmsn==iuu': '', 'ncmsn==duu': '', 'ncfsn==iuu': '', 'ncfsn==duu': '', 'ncnsn==iuu': '', 'ncnsn==duu': '', 'ncmsa==iuu': '', 'ncmsa==duu': '', 'ncfsa==iuu': '', 'ncfsa==duu': '', 'ncnsa==iuu': '', 'ncnsa==duu': '', 'ncmsd==iuu': '', 'ncmsd==duu': '', 'ncfsd==iuu': '', 'ncfsd==duu': '', 'ncnsd==iuu': '', 'ncnsd==duu': '', 'ncmsg==iou': '', 'ncmsg==dou': '', 'ncfsg==iou': '', 'ncfsg==dou': '', 'ncnsg==iou': '', 'ncnsg==dou': '', 'ncmpn==iuu': '', 'ncmpn==duu': '', 'ncfnn==iuu': '', 'ncfnn==duu': '', 'ncnnn==iuu': '', 'ncnnn==duu': '', 'ncmpa==iuu': '', 'ncmpa==duu': '', 'ncfna==iuu': '', 'ncfna==duu': '', 'ncnna==iuu': '', 'ncnna==duu': '', 'ncmpd==iuu': '', 'ncmpd==duu': '', 'ncmpg==iou': '', 'ncmpg==dou': '', 'ncfnn==iou': '', 'ncfnn==dou': '', 'ncnnn==iou': '', 'ncnnn==dou': '' } for form in forms: ppos = form['PPOS'].lower() # Normalize to lowercase word = form['word'] print(f"Processing: word={word}, ppos={ppos}, key={ppos}") expanded_ppos_list = expand_ppos(ppos) for expanded_ppos in expanded_ppos_list: key = expanded_ppos if key in table_data: table_data[key] = word else: print(f"Unmatched key: {key} for word: {word} with PPOS: {ppos}") print(f"Final table data for {lemma}: {table_data}") # Debugging output table = f"""
Eintal | Fleirtal | ||
---|---|---|---|
Óbundið | Bundið | Óbundið | Bundið |
{table_data['ncmsn==iuu'] or table_data['ncfsn==iuu'] or table_data['ncnsn==iuu']} | {table_data['ncmsn==duu'] or table_data['ncfsn==duu'] or table_data['ncnsn==duu']} | {table_data['ncmpn==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']} | {table_data['ncmpn==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']} |
{table_data['ncmsa==iuu'] or table_data['ncfsa==iuu'] or table_data['ncnsa==iuu']} | {table_data['ncmsa==duu'] or table_data['ncfsa==duu'] or table_data['ncnsa==duu']} | {table_data['ncmpa==iuu'] or table_data['ncfna==iuu'] or table_data['ncnna==iuu']} | {table_data['ncmpa==duu'] or table_data['ncfna==duu'] or table_data['ncnna==duu']} |
{table_data['ncmsd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnsd==iuu']} | {table_data['ncmsd==duu'] or table_data['ncfsd==duu'] or table_data['ncnsd==duu']} | {table_data['ncmpd==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']} | {table_data['ncmpd==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']} |
{table_data['ncmsg==iou'] or table_data['ncfsg==iou'] or table_data['ncnsg==iou']} | {table_data['ncmsg==dou'] or table_data['ncfsg==dou'] or table_data['ncnsg==dou']} | {table_data['ncmpg==iou'] or table_data['ncfnn==iou'] or table_data['ncnnn==iou']} | {table_data['ncmpg==dou'] or table_data['ncfnn==dou'] or table_data['ncnnn==dou']} |