|
import gradio as gr |
|
import pandas as pd |
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
# English glosses for the non-English sample sentences, keyed by the exact
# sentence text so they can be looked up after ranking.
translations = {
    "Давайте посетим море": "Let's visit the seaside",
    "Я пытаюсь купить дачу": "I'm trying to buy a summer home",
    "J'aimerais vous louer un grand appartement": "I would like to rent a large apartment to you",
    "これは素晴らしい投資機会です": "This is a great investment opportunity",
    "野球はあなたが思うよりも面白いことがあります": "Baseball can be more interesting than you think",
}

# Default corpus for the demo: short sentences in English, Russian, French
# and Japanese, some of which are paraphrases of each other across languages.
sentences = [
    "Molly ate a fish",
    "Давайте посетим море",
    "I would like to sell you a house",
    "Я пытаюсь купить дачу",
    "J'aimerais vous louer un grand appartement",
    "This is a wonderful investment opportunity",
    "これは素晴らしい投資機会です",
    "野球はあなたが思うよりも面白いことがあります",
]

# The corpus flattened to one sentence per line (newline-separated text).
samples = "\n".join(sentences)
|
|
|
# Identifiers of the two checkpoints being compared.
_MULTILINGUAL_MODEL_ID = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
_ENGLISH_MODEL_ID = "sentence-transformers/all-MiniLM-L12-v2"

# Both models are loaded once, at import time, and shared by every request.
model = SentenceTransformer(_MULTILINGUAL_MODEL_ID)  # produces the "score_multi" column
model2 = SentenceTransformer(_ENGLISH_MODEL_ID)  # produces the "score_en" column
|
|
|
|
|
def _rank_sentences(encoder, query, corpus):
    """Return the semantic-search hits of *query* against every sentence in *corpus*.

    ``top_k`` is set to the corpus size because ``util.semantic_search`` keeps
    only the 10 best hits by default, which would silently drop sentences
    from larger inputs.
    """
    query_embedding = encoder.encode([query])
    corpus_embeddings = encoder.encode(corpus)
    # One query was encoded, so the result holds exactly one list of hits.
    return util.semantic_search(query_embedding, corpus_embeddings, top_k=len(corpus))[0]


def do_action(query: str, documents: str):
    """Score each input sentence against *query* with both embedding models.

    Args:
        query: Search text to compare the sentences with.
        documents: Newline-separated sentences. Surrounding whitespace is
            stripped from each line and blank lines are ignored.

    Returns:
        A ``(title, table)`` pair. ``title`` is a Markdown heading that quotes
        the query. ``table`` has one row per sentence with the columns
        ``text`` (the sentence, plus its English gloss when one is listed in
        ``translations``), ``score_multi`` (score from ``model``) and
        ``score_en`` (score from ``model2``). Rows are sorted by
        ``score_multi`` in descending order and returned as a pandas Styler
        with two-decimal formatting and a ``YlGnBu`` background gradient.
        When there is no non-blank sentence, a plain empty DataFrame with the
        same three columns is returned instead.
    """
    title = f'## Matches for "{query}"'

    # Blank lines (e.g. a trailing newline in the textbox) are not sentences.
    corpus = [line.strip() for line in (documents or "").split("\n") if line.strip()]
    if not corpus:
        return title, pd.DataFrame(columns=["text", "score_multi", "score_en"])

    multi_hits = _rank_sentences(model, query, corpus)
    # Each hit list is ordered by its own model's score, so the two lists
    # cannot be paired by position. Index the second model's scores by
    # corpus_id so every row reports both scores for the same sentence.
    en_score_by_id = {
        hit["corpus_id"]: hit["score"]
        for hit in _rank_sentences(model2, query, corpus)
    }

    rows = []
    for hit in multi_hits:
        corpus_id = hit["corpus_id"]
        text = corpus[corpus_id]
        if text in translations:
            text = f"{text} [english: {translations[text]}]"
        rows.append(
            {
                "text": text,
                "score_multi": hit["score"],
                "score_en": en_score_by_id[corpus_id],
            }
        )

    table = pd.DataFrame(rows).sort_values(by="score_multi", ascending=False)
    return title, table.style.format(precision=2).background_gradient("YlGnBu")
|
|
|
# Page copy rendered by the interface.
title = "Multilingual Semantic Search/Similarity Comparison"
desc = """
A small demo to compare the [all-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) and [paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) embedding models. Notice how the multilingual model scores similar sentences higher, even if they aren't in the same language!
"""

# Inputs: the query, and the newline-separated sentences to rank
# (pre-filled with the sample corpus).
query_input = gr.Textbox(value="Get rich quick by flipping cheap houses")
docs_input = gr.Textbox(label="Sentences", value=samples)

# Output table holding the per-sentence scores returned by do_action.
output_scores = gr.DataFrame(label="", wrap=True, scale=2)

# do_action returns (markdown heading, score table), matching the two outputs.
demo = gr.Interface(
    fn=do_action,
    inputs=[query_input, docs_input],
    outputs=["markdown", output_scores],
    title=title,
    description=desc,
)

demo.launch()