File size: 2,534 Bytes
050cd2b
6b2f9a9
 
 
 
 
 
 
 
 
050cd2b
 
 
 
 
985d3f5
 
 
2a2a619
 
985d3f5
 
 
050cd2b
985d3f5
 
 
 
be2864a
6b2f9a9
 
 
 
985d3f5
be2864a
 
 
 
 
985d3f5
6b2f9a9
be2864a
 
 
985d3f5
6b2f9a9
985d3f5
76d621c
985d3f5
 
 
 
 
 
 
 
050cd2b
985d3f5
 
 
6b2f9a9
985d3f5
 
6b2f9a9
985d3f5
 
2a2a619
050cd2b
2a2a619
 
 
050cd2b
2a2a619
6b2f9a9
 
2a2a619
050cd2b
2a2a619
6b2f9a9
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# let's import the libraries
from sentence_transformers import util
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
import sentence_transformers
import time
import sys
import os
import torch
import en_core_web_sm
from email import header
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import io
import netrc
from tqdm import tqdm
tqdm.pandas()

# let's load the english stsb dataset
# Semantic Textual Similarity Benchmark via the 'stsb_multi_mt' mirror,
# English configuration; splits are materialized as pandas DataFrames.
# NOTE(review): stsb_train / stsb_test are not referenced anywhere else in
# this chunk — possibly leftovers from an evaluation notebook; confirm before
# removing.
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])

# let's create helper functions
# Load the small English spaCy pipeline through its package entry point
# (equivalent to the commented-out spacy.load call below, but fails at import
# time rather than at load time if the model package is missing).
nlp = en_core_web_sm.load()

#nlp = spacy.load("en_core_web_sm")


def text_processing(sentence):
    """Tokenize *sentence*, keeping lowercased lemmas of alphabetic,
    non-stopword tokens.

    Returns a list of processed token strings (the parameter name is kept
    even though the return value is a list, to preserve the interface).
    """
    doc = nlp(sentence)
    lemmas = []
    for tok in doc:
        if not tok.is_alpha:
            continue
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas


def cos_sim(sentence1_emb, sentence2_emb):
    """Row-wise cosine similarity between two batches of embeddings.

    Parameters
    ----------
    sentence1_emb, sentence2_emb : array-like of shape (n, d)
        Two embedding matrices with the same number of rows; row i of each
        is one sentence pair.

    Returns
    -------
    numpy.ndarray of shape (n,)
        cosine(sentence1_emb[i], sentence2_emb[i]) for every row i.
    """
    a = np.asarray(sentence1_emb, dtype=np.float64)
    b = np.asarray(sentence2_emb, dtype=np.float64)
    # The original computed the full n x n cosine_similarity matrix and then
    # took np.diag — O(n^2 * d) time and O(n^2) memory for an O(n * d) result.
    # Compute only the paired similarities instead.
    dots = np.einsum('ij,ij->i', a, b)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    return dots / norms


# let's read the csv file
# Drop the CSV's serialized pandas index column ('Unnamed: 0') on load.
data = (pd.read_csv("SBERT_data.csv")).drop(['Unnamed: 0'], axis=1)

# A fixed placeholder query is used as sentence1 for every row here; the real
# query is taken from the user in the Streamlit section further down.
prompt = "charles"
data['prompt'] = prompt
data.rename(columns={'target_text': 'sentence2',
            'prompt': 'sentence1'}, inplace=True)
# Coerce both columns to str so the tokenizer never receives NaN/float cells.
data['sentence2'] = data['sentence2'].astype('str')
data['sentence1'] = data['sentence1'].astype('str')

# loop through the data
# Pre-trained STS-B cross-encoder: scores a (sentence1, sentence2) pair
# jointly in one forward pass (downloads the model on first run).
XpathFinder = CrossEncoder("cross-encoder/stsb-roberta-base")
sentence_pairs = []
for sentence1, sentence2 in zip(data['sentence1'], data['sentence2']):
    sentence_pairs.append([sentence1, sentence2])

# Batch-score every pair: one similarity score per DataFrame row.
data['SBERT CrossEncoder_Score'] = XpathFinder.predict(
    sentence_pairs, show_progress_bar=True)

# Alias used by the Streamlit UI section below.
loaded_model = XpathFinder

# let's create containers
# Streamlit page layout: one container for the static header, one for the
# interactive model UI.
header_container = st.container()
mod_container = st.container()

# let's create the header
with header_container:
    st.title("SBERT CrossEncoder")
    st.markdown("This is a demo of the SBERT CrossEncoder model")

# let's create the model container
with mod_container:
    # let's get input from the user
    prompt = st.text_input("Enter a description below...")

    if prompt:
        # CrossEncoder.predict expects a list of (sentence1, sentence2)
        # pairs, one per candidate row. The original passed the bare string
        # [prompt], which is not a pair and yields a single score that
        # cannot be assigned to a length-N column.
        user_pairs = [[prompt, sent] for sent in data['sentence2']]
        data['SBERT CrossEncoder_Score'] = loaded_model.predict(user_pairs)
        # sort the values (descending) so the best-matching rows come first —
        # the original took data.head(5) without sorting, so the "most
        # accurate" sentence was just the first CSV row.
        most_acc = data.sort_values(
            'SBERT CrossEncoder_Score', ascending=False).head(5)
        st.write(most_acc)
        st.write("The most accurate sentence is: ",
                 most_acc['sentence2'].iloc[0])