File size: 3,289 Bytes
985d3f5
9d22f91
 
985d3f5
 
 
2a2a619
 
 
 
 
 
 
 
985d3f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a2a619
985d3f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a2a619
 
 
 
 
 
 
 
 
 
 
 
 
 
985d3f5
2a2a619
 
 
 
985d3f5
 
2a2a619
985d3f5
 
 
2a2a619
985d3f5
2a2a619
 
 
 
9d22f91
985d3f5
2a2a619
b6b6380
2a2a619
985d3f5
 
 
2a2a619
b6b6380
2a2a619
 
 
 
985d3f5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# let's import the libraries we need
#from sentence_transformers import SentenceTransformer
#from sentence_transformers import CrossEncoder
import io
import netrc
import pickle
import sys

import numpy as np
import pandas as pd
import spacy
import streamlit as st
import torch
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

tqdm.pandas()

# Load the English STSB dataset (semantic textual similarity benchmark).
# NOTE(review): stsb_train / stsb_test are never referenced later in this
# script -- presumably left over from experimentation; confirm before removing.
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])

# let's create helper functions
# spaCy English pipeline used by text_processing() for lemmatization/stopwords.
nlp = spacy.load("en_core_web_sm")


def text_processing(sentence):
    """Tokenize *sentence* with spaCy and keep only meaningful words.

    Each alphabetic, non-stopword token is reduced to its lowercased
    lemma; everything else (punctuation, numbers, stopwords) is dropped.

    Returns a list of lowercased lemma strings.
    """
    lemmas = []
    for token in nlp(sentence):
        if not token.is_alpha:
            continue
        if token.is_stop:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas


def cos_sim(sentence1_emb, sentence2_emb):
    """Row-wise cosine similarity between two paired embedding matrices.

    Parameters
    ----------
    sentence1_emb, sentence2_emb : array-like of shape (n, dim)
        Paired embeddings; row i of the first is compared with row i of
        the second.

    Returns
    -------
    numpy.ndarray of shape (n,)
        Cosine similarity of each corresponding row pair.

    Notes
    -----
    The original computed the full n x n similarity matrix via sklearn's
    ``cosine_similarity`` and then kept only its diagonal -- O(n^2) time
    and memory for an O(n) result.  This computes only the paired
    similarities directly with numpy.
    """
    a = np.asarray(sentence1_emb, dtype=float)
    b = np.asarray(sentence2_emb, dtype=float)
    dots = np.einsum('ij,ij->i', a, b)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    # Match sklearn's zero-vector handling: a zero embedding yields
    # similarity 0 rather than NaN (dot product is already 0 there).
    return dots / np.where(norms == 0.0, 1.0, norms)


# let's read the csv file
# NOTE(review): this path ("/SBERT_data.csv") differs from the
# "/content/SBERT_data.csv" used inside the app below -- confirm which
# location is correct for the deployment environment.
data = pd.read_csv("/SBERT_data.csv").drop(['Unnamed: 0'], axis=1)

# Pair a fixed placeholder prompt against every candidate target_text.
prompt = "charles"
data['prompt'] = prompt
data.rename(columns={'target_text': 'sentence2',
            'prompt': 'sentence1'}, inplace=True)
data['sentence2'] = data['sentence2'].astype('str')
data['sentence1'] = data['sentence1'].astype('str')

# Cross-encoder scores each (sentence1, sentence2) pair jointly.
# (CrossEncoder is imported at the top of the file.)
XpathFinder = CrossEncoder("cross-encoder/stsb-roberta-base")
sentence_pairs = [[s1, s2]
                  for s1, s2 in zip(data['sentence1'], data['sentence2'])]

data['SBERT CrossEncoder_Score'] = XpathFinder.predict(
    sentence_pairs, show_progress_bar=True)

# BUG FIX: sort_values returns a new DataFrame; the original discarded
# the result, leaving `data` unsorted.
data = data.sort_values(by=['SBERT CrossEncoder_Score'], ascending=False)

# Alias reused by the Streamlit UI below.
loaded_model = XpathFinder

# Containers
# Streamlit layout containers: one for the page header, one for the model UI.
header_container = st.container()
mod_container = st.container()

# Header
with header_container:

    # different levels of text you can include in your app
    st.title("Xpath Finder App")


# model container
with mod_container:

    # collecting input from user
    prompt = st.text_input("Enter your description below ...")

    # Loading the data
    data = (pd.read_csv("/content/SBERT_data.csv")
            ).drop(['Unnamed: 0'], axis=1)

    data['prompt'] = prompt
    data.rename(columns={'target_text': 'sentence2',
                'prompt': 'sentence1'}, inplace=True)
    data['sentence2'] = data['sentence2'].astype('str')
    data['sentence1'] = data['sentence1'].astype('str')

    # Score the user's prompt against every candidate once text is entered.
    if prompt:
        # BUG FIX: the original called loaded_model.predict([prompt]) with a
        # bare string -- a CrossEncoder scores sentence *pairs*, so that call
        # cannot produce a meaningful similarity. Score the pairs instead and
        # report the best score. (Also removed the redundant re-import and
        # re-instantiation of the CrossEncoder; `loaded_model` is already the
        # cross-encoder created earlier in this file.)
        sentence_pairs = [[s1, s2]
                          for s1, s2 in zip(data['sentence1'],
                                            data['sentence2'])]
        data['SBERT CrossEncoder_Score'] = loaded_model.predict(sentence_pairs)

        # BUG FIX: actually sort before taking the top rows; the original
        # took head(5) of the unsorted frame despite the comment promising
        # the "highest scoring" entries.
        data = data.sort_values(by=['SBERT CrossEncoder_Score'],
                                ascending=False)
        most_acc = data.head(5)
        simscore = data['SBERT CrossEncoder_Score'].max()

        # predictions
        st.write("Highest Similarity score: ", simscore)
        st.text("Is this one of these the Xpath you're looking for?")
        # BUG FIX: the original nested st.write(st.write(...)), which rendered
        # the inner call's None return value as a second element.
        st.write(most_acc["input_text"])