Spaces:
Runtime error
Runtime error
add all
Browse files- README.md +4 -6
- app.py +39 -0
- fetch_transcript.py +70 -0
- model.py +61 -0
- models/QA_Model/config.json +30 -0
- models/QA_Model/merges.txt +0 -0
- models/QA_Model/pytorch_model.bin +3 -0
- models/QA_Model/special_tokens_map.json +51 -0
- models/QA_Model/tokenizer.json +0 -0
- models/QA_Model/tokenizer_config.json +67 -0
- models/QA_Model/vocab.json +0 -0
- models/Similarity_Model/1_Pooling/config.json +7 -0
- models/Similarity_Model/README.md +176 -0
- models/Similarity_Model/config.json +26 -0
- models/Similarity_Model/config_sentence_transformers.json +7 -0
- models/Similarity_Model/modules.json +20 -0
- models/Similarity_Model/pytorch_model.bin +3 -0
- models/Similarity_Model/sentence_bert_config.json +4 -0
- models/Similarity_Model/special_tokens_map.json +7 -0
- models/Similarity_Model/tokenizer.json +0 -0
- models/Similarity_Model/tokenizer_config.json +16 -0
- models/Similarity_Model/vocab.txt +0 -0
- preprocessing.py +18 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
---
|
2 |
-
title: Youtube
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.17.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Youtube Q&A
|
3 |
+
emoji: 📹️
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: black
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.17.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
|
|
|
app.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from streamlit_player import st_player
|
3 |
+
|
4 |
+
from model import Engine
|
5 |
+
from fetch_transcript import fetch_transcript
|
6 |
+
from preprocessing import create_similarity_text, create_result_url
|
7 |
+
|
8 |
+
# Page header.
with st.container():
    st.title('YouTube Q&A Search')
    st.write('Ask YouTube videos questions and get your answers :)')

# Input form: video URL, question, and the submit button.
with st.container():
    url_input = st.text_input(label='Video', placeholder='enter YouTube video url')
    question_input = st.text_input(label='Question', placeholder='enter your question')
    get_ans = st.button(label='Answer!')

# Only run the pipeline when the button was pressed AND both fields contain
# text. (The previous check compared len(...) -- an int -- against '', which
# is always True, so empty inputs were never rejected.)
if get_ans and url_input.strip() and question_input.strip():

    with st.spinner('loading your video...'):
        transcript = fetch_transcript(url_input)
        model = Engine(transcript)

    with st.spinner('finding an answer...'):
        answer = model.ask(question_input)
        # Combine answer and question so the similarity search lands on the
        # transcript chunk that the answer was most likely extracted from.
        similarity_text = create_similarity_text(question_input, answer)
        groups, timestamps = model.find_similar(similarity_text)
        # Link to the video at the start time of the best-matching chunk.
        url = create_result_url(url_input, timestamps[0])

    with st.container():
        st.caption('Extracted Answer:')
        st.write(answer)
        st.caption('In Video:')
        st_player(url)
|
39 |
+
|
fetch_transcript.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import regex as re
|
2 |
+
from youtube_transcript_api import YouTubeRequestFailed, YouTubeTranscriptApi
|
3 |
+
|
4 |
+
from preprocessing import stride_sentences
|
5 |
+
|
6 |
+
|
7 |
+
def validate_youtube_link(url: str) -> str:
    """
    Validate a YouTube video link and extract its video id.

    input : url (str) - a YouTube watch / share / embed URL
    outputs: video_id (str) - the 11-character video id found in the URL
    raises : ValueError - if no video id is found or it is not 11 characters
    """
    # The dot in "youtu.be" is escaped so it only matches a literal '.'.
    yt_regex = r"^.*(youtu\.be\/|v\/|u\/\w\/|embed\/|watch\?v=|\&v=|\?v=)([^#\&\?]*).*"
    matches = re.findall(yt_regex, url)

    # Raise explicitly instead of using `assert`: asserts are stripped when
    # Python runs with -O, and indexing an empty match list would otherwise
    # surface as an unrelated IndexError.
    if not matches or len(matches[0][1]) != 11:
        raise ValueError("Invalid YouTube Link")

    video_id: str = matches[0][1]

    return video_id
|
21 |
+
|
22 |
+
|
23 |
+
def zip_transcript(transcript:list) -> dict:
    """
    Split a transcript into parallel lists of start times and cleaned texts.

    input : transcript (list) - items exposing 'start' and 'text' keys
    outputs: dict with 'timestamps' (start times) and 'texts' (stripped text
             with newlines replaced by spaces), in that key order
    """
    timestamps = [entry['start'] for entry in transcript]
    cleaned = [entry['text'].strip().replace('\n',' ') for entry in transcript]
    return {'timestamps': timestamps, 'texts': cleaned}
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
def full_text(transcript: list) -> str:
    """
    Join the 'text' field of every transcript item into one string.

    Items are separated by single spaces and the result is stripped of
    leading/trailing whitespace.
    """
    joined = ' '.join(entry['text'] for entry in transcript)
    return joined.strip()
|
42 |
+
|
43 |
+
|
44 |
+
def fetch_transcript(url: str) -> list:
    """
    Fetch the transcript of the YouTube video that `url` points to.

    The link is validated first (which yields the video id); the transcript
    is then requested through YouTubeTranscriptApi. A failed YouTube request
    is re-raised as a generic Exception with a user-readable message.
    """
    video_id = validate_youtube_link(url)

    try:
        return YouTubeTranscriptApi.get_transcript(video_id=video_id)
    except YouTubeRequestFailed:
        raise Exception('YouTube Request Failed, try again later.')
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
if __name__ == '__main__':
    # Manual smoke test: fetch one transcript and print its first text group.
    sample = 'https://www.youtube.com/watch?v=t6V9i8fFADI'
    sample2 = 'https://www.youtube.com/watch?v=1nLHIM2IPRY'
    fake_sample = 'https://www.youtube.com/watch?v=asdf3'
    transcript = fetch_transcript(url=sample)

    # zip_transcript returns a dict; unpack its values (timestamps, texts).
    # Unpacking the dict itself would yield the key names instead of the lists.
    times, texts = zip_transcript(transcript).values()
    texts = stride_sentences(texts)
    print(texts[0])
|
67 |
+
|
68 |
+
# with open('sample_group.txt','w') as f:
|
69 |
+
# for group in groups:
|
70 |
+
# f.write(f"{group}\n\n")
|
model.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
from sentence_transformers.util import cos_sim
|
6 |
+
from transformers import pipeline
|
7 |
+
from preprocessing import stride_sentences
|
8 |
+
from fetch_transcript import zip_transcript
|
9 |
+
|
10 |
+
|
11 |
+
class Engine:
    """
    Question answering and passage retrieval over a single video transcript.

    On construction the engine loads a question-answering pipeline and a
    sentence-similarity model from ./models, groups the transcript texts into
    chunks of `stride` entries, and embeds every chunk once so that later
    similarity lookups only need to embed the query.
    """

    def __init__(self, transcript:list) -> None:
        """
        input : transcript (list) - transcript items as accepted by
                zip_transcript (each exposing 'start' and 'text')
        """
        # Relative path: resolved against the current working directory.
        self.base_path = Path('./models')

        self.qa_model_name = 'QA_Model'
        self.qa_model_path = self.base_path / self.qa_model_name
        self.qa_model = pipeline('question-answering', model=str(self.qa_model_path))

        self.sim_model_name = 'Similarity_Model'
        self.sim_model_path = self.base_path / self.sim_model_name
        # Pass a plain string, consistent with the QA pipeline above; the
        # model argument is documented as a name-or-path string.
        self.sim_model = SentenceTransformer(str(self.sim_model_path))

        # zip_transcript returns {'timestamps': ..., 'texts': ...} in that order.
        self.timestamps, self.texts = zip_transcript(transcript).values()

        # Number of consecutive transcript entries merged into one text group.
        self.stride = 10
        self.text_groups = stride_sentences(self.texts, self.stride)

        self.embeddings = self._encode_transcript()

    def _encode_transcript(self):
        """Embed every text group with the similarity model."""
        return self.sim_model.encode(self.text_groups)

    def ask(self, question_text:str):
        """
        Run the QA pipeline on the whole transcript and return the answer text.

        input : question_text (str)
        outputs: the 'answer' field of the pipeline result
        """
        result = self.qa_model(
            question=question_text,
            context=' '.join(self.text_groups).strip(),
            doc_stride=256,
            max_answer_len=512,
            max_question_len=128,
        )
        return result['answer']

    def find_similar(self, txt:str, top_k=1):
        """
        Find the text groups most similar to `txt`.

        input : txt (str) - query text; top_k (int) - number of results
        outputs: (groups, timestamps) - the `top_k` best-matching text groups,
                 best first, and the start time of the first transcript entry
                 in each of those groups
        """
        query_embedding = self.sim_model.encode(txt)
        similarities:torch.Tensor = cos_sim(query_embedding, self.embeddings).reshape(-1)
        # Highest similarity first, truncated to the requested number of hits.
        indices = torch.argsort(similarities, descending=True)[:top_k].tolist()
        groups = [self.text_groups[i] for i in indices]
        # Group i starts at transcript entry stride*i, so that entry's start
        # time is where the group begins in the video.
        timestamps = [self.timestamps[self.stride*i] for i in indices]
        return groups, timestamps
|
57 |
+
|
58 |
+
|
59 |
+
if __name__ == '__main__':
    # Manual smoke test. Engine requires a transcript argument (calling it
    # with no arguments raises TypeError), so build it from a minimal
    # one-entry transcript in the shape zip_transcript expects.
    model = Engine([{'start': 0.0, 'text': 'hello world'}])
|
61 |
+
|
models/QA_Model/config.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "deepset/roberta-base-squad2",
|
3 |
+
"architectures": [
|
4 |
+
"RobertaForQuestionAnswering"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"classifier_dropout": null,
|
9 |
+
"eos_token_id": 2,
|
10 |
+
"gradient_checkpointing": false,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout_prob": 0.1,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"language": "english",
|
17 |
+
"layer_norm_eps": 1e-05,
|
18 |
+
"max_position_embeddings": 514,
|
19 |
+
"model_type": "roberta",
|
20 |
+
"name": "Roberta",
|
21 |
+
"num_attention_heads": 12,
|
22 |
+
"num_hidden_layers": 12,
|
23 |
+
"pad_token_id": 1,
|
24 |
+
"position_embedding_type": "absolute",
|
25 |
+
"torch_dtype": "float32",
|
26 |
+
"transformers_version": "4.26.1",
|
27 |
+
"type_vocab_size": 1,
|
28 |
+
"use_cache": true,
|
29 |
+
"vocab_size": 50265
|
30 |
+
}
|
models/QA_Model/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/QA_Model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d3de9857583a4639b6f23a05a2e9531b7c1c64a0c13adcc9671156bf7bcd740
|
3 |
+
size 496296301
|
models/QA_Model/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "<s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"eos_token": {
|
17 |
+
"content": "</s>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": true,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"mask_token": {
|
24 |
+
"content": "<mask>",
|
25 |
+
"lstrip": true,
|
26 |
+
"normalized": true,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"pad_token": {
|
31 |
+
"content": "<pad>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": true,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "</s>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": true,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
+
"unk_token": {
|
45 |
+
"content": "<unk>",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": true,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false
|
50 |
+
}
|
51 |
+
}
|
models/QA_Model/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/QA_Model/tokenizer_config.json
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"bos_token": {
|
4 |
+
"__type": "AddedToken",
|
5 |
+
"content": "<s>",
|
6 |
+
"lstrip": false,
|
7 |
+
"normalized": true,
|
8 |
+
"rstrip": false,
|
9 |
+
"single_word": false
|
10 |
+
},
|
11 |
+
"cls_token": {
|
12 |
+
"__type": "AddedToken",
|
13 |
+
"content": "<s>",
|
14 |
+
"lstrip": false,
|
15 |
+
"normalized": true,
|
16 |
+
"rstrip": false,
|
17 |
+
"single_word": false
|
18 |
+
},
|
19 |
+
"do_lower_case": false,
|
20 |
+
"eos_token": {
|
21 |
+
"__type": "AddedToken",
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": true,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false
|
27 |
+
},
|
28 |
+
"errors": "replace",
|
29 |
+
"full_tokenizer_file": null,
|
30 |
+
"mask_token": {
|
31 |
+
"__type": "AddedToken",
|
32 |
+
"content": "<mask>",
|
33 |
+
"lstrip": true,
|
34 |
+
"normalized": true,
|
35 |
+
"rstrip": false,
|
36 |
+
"single_word": false
|
37 |
+
},
|
38 |
+
"model_max_length": 512,
|
39 |
+
"name_or_path": "deepset/roberta-base-squad2",
|
40 |
+
"pad_token": {
|
41 |
+
"__type": "AddedToken",
|
42 |
+
"content": "<pad>",
|
43 |
+
"lstrip": false,
|
44 |
+
"normalized": true,
|
45 |
+
"rstrip": false,
|
46 |
+
"single_word": false
|
47 |
+
},
|
48 |
+
"sep_token": {
|
49 |
+
"__type": "AddedToken",
|
50 |
+
"content": "</s>",
|
51 |
+
"lstrip": false,
|
52 |
+
"normalized": true,
|
53 |
+
"rstrip": false,
|
54 |
+
"single_word": false
|
55 |
+
},
|
56 |
+
"special_tokens_map_file": "/root/.cache/huggingface/hub/models--deepset--roberta-base-squad2/snapshots/d39b8d4166b0683451bbce6f047de1a238c0b5bf/special_tokens_map.json",
|
57 |
+
"tokenizer_class": "RobertaTokenizer",
|
58 |
+
"trim_offsets": true,
|
59 |
+
"unk_token": {
|
60 |
+
"__type": "AddedToken",
|
61 |
+
"content": "<unk>",
|
62 |
+
"lstrip": false,
|
63 |
+
"normalized": true,
|
64 |
+
"rstrip": false,
|
65 |
+
"single_word": false
|
66 |
+
}
|
67 |
+
}
|
models/QA_Model/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/Similarity_Model/1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 384,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false
|
7 |
+
}
|
models/Similarity_Model/README.md
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
pipeline_tag: sentence-similarity
|
3 |
+
tags:
|
4 |
+
- sentence-transformers
|
5 |
+
- feature-extraction
|
6 |
+
- sentence-similarity
|
7 |
+
language: en
|
8 |
+
license: apache-2.0
|
9 |
+
datasets:
|
10 |
+
- s2orc
|
11 |
+
- flax-sentence-embeddings/stackexchange_xml
|
12 |
+
- MS Marco
|
13 |
+
- gooaq
|
14 |
+
- yahoo_answers_topics
|
15 |
+
- code_search_net
|
16 |
+
- search_qa
|
17 |
+
- eli5
|
18 |
+
- snli
|
19 |
+
- multi_nli
|
20 |
+
- wikihow
|
21 |
+
- natural_questions
|
22 |
+
- trivia_qa
|
23 |
+
- embedding-data/sentence-compression
|
24 |
+
- embedding-data/flickr30k-captions
|
25 |
+
- embedding-data/altlex
|
26 |
+
- embedding-data/simple-wiki
|
27 |
+
- embedding-data/QQP
|
28 |
+
- embedding-data/SPECTER
|
29 |
+
- embedding-data/PAQ_pairs
|
30 |
+
- embedding-data/WikiAnswers
|
31 |
+
|
32 |
+
---
|
33 |
+
|
34 |
+
|
35 |
+
# all-MiniLM-L12-v2
|
36 |
+
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
|
37 |
+
|
38 |
+
## Usage (Sentence-Transformers)
|
39 |
+
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
|
40 |
+
|
41 |
+
```
|
42 |
+
pip install -U sentence-transformers
|
43 |
+
```
|
44 |
+
|
45 |
+
Then you can use the model like this:
|
46 |
+
```python
|
47 |
+
from sentence_transformers import SentenceTransformer
|
48 |
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
49 |
+
|
50 |
+
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
|
51 |
+
embeddings = model.encode(sentences)
|
52 |
+
print(embeddings)
|
53 |
+
```
|
54 |
+
|
55 |
+
## Usage (HuggingFace Transformers)
|
56 |
+
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
57 |
+
|
58 |
+
```python
|
59 |
+
from transformers import AutoTokenizer, AutoModel
|
60 |
+
import torch
|
61 |
+
import torch.nn.functional as F
|
62 |
+
|
63 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
64 |
+
def mean_pooling(model_output, attention_mask):
|
65 |
+
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
66 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
67 |
+
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
68 |
+
|
69 |
+
|
70 |
+
# Sentences we want sentence embeddings for
|
71 |
+
sentences = ['This is an example sentence', 'Each sentence is converted']
|
72 |
+
|
73 |
+
# Load model from HuggingFace Hub
|
74 |
+
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
|
75 |
+
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
|
76 |
+
|
77 |
+
# Tokenize sentences
|
78 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
79 |
+
|
80 |
+
# Compute token embeddings
|
81 |
+
with torch.no_grad():
|
82 |
+
model_output = model(**encoded_input)
|
83 |
+
|
84 |
+
# Perform pooling
|
85 |
+
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
86 |
+
|
87 |
+
# Normalize embeddings
|
88 |
+
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
89 |
+
|
90 |
+
print("Sentence embeddings:")
|
91 |
+
print(sentence_embeddings)
|
92 |
+
```
|
93 |
+
|
94 |
+
## Evaluation Results
|
95 |
+
|
96 |
+
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L12-v2)
|
97 |
+
|
98 |
+
------
|
99 |
+
|
100 |
+
## Background
|
101 |
+
|
102 |
+
The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
|
103 |
+
contrastive learning objective. We used the pretrained [`microsoft/MiniLM-L12-H384-uncased`](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased) model and fine-tuned it on a
|
104 |
+
1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
|
105 |
+
|
106 |
+
We developed this model during the
|
107 |
+
[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
|
108 |
+
organized by Hugging Face. We developed this model as part of the project:
|
109 |
+
[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as help from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
|
110 |
+
|
111 |
+
## Intended uses
|
112 |
+
|
113 |
+
Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
|
114 |
+
the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
|
115 |
+
|
116 |
+
By default, input text longer than 256 word pieces is truncated.
|
117 |
+
|
118 |
+
|
119 |
+
## Training procedure
|
120 |
+
|
121 |
+
### Pre-training
|
122 |
+
|
123 |
+
We use the pretrained [`microsoft/MiniLM-L12-H384-uncased`](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
|
124 |
+
|
125 |
+
### Fine-tuning
|
126 |
+
|
127 |
+
We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity for each possible sentence pair in the batch.
|
128 |
+
We then apply the cross entropy loss by comparing with true pairs.
|
129 |
+
|
130 |
+
#### Hyper parameters
|
131 |
+
|
132 |
+
We trained our model on a TPU v3-8. We trained the model for 100k steps using a batch size of 1024 (128 per TPU core).
|
133 |
+
We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
|
134 |
+
a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
|
135 |
+
|
136 |
+
#### Training data
|
137 |
+
|
138 |
+
We use the concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
|
139 |
+
We sampled each dataset with a weighted probability whose configuration is detailed in the `data_config.json` file.
|
140 |
+
|
141 |
+
|
142 |
+
| Dataset | Paper | Number of training tuples |
|
143 |
+
|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
|
144 |
+
| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
|
145 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
|
146 |
+
| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
|
147 |
+
| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
|
148 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
|
149 |
+
| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
|
150 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
|
151 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
|
152 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
|
153 |
+
| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
|
154 |
+
| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
|
155 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
|
156 |
+
| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
|
157 |
+
| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
|
158 |
+
| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
|
159 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
|
160 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
|
161 |
+
| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
|
162 |
+
| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
|
163 |
+
| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
|
164 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
|
165 |
+
| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
|
166 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
|
167 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
|
168 |
+
| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
|
169 |
+
| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
|
170 |
+
| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
|
171 |
+
| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
|
172 |
+
| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
|
173 |
+
| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
|
174 |
+
| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
|
175 |
+
| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
|
176 |
+
| **Total** | | **1,170,060,424** |
|
models/Similarity_Model/config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L12-v2/",
|
3 |
+
"architectures": [
|
4 |
+
"BertModel"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 384,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 1536,
|
14 |
+
"layer_norm_eps": 1e-12,
|
15 |
+
"max_position_embeddings": 512,
|
16 |
+
"model_type": "bert",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 0,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"torch_dtype": "float32",
|
22 |
+
"transformers_version": "4.20.1",
|
23 |
+
"type_vocab_size": 2,
|
24 |
+
"use_cache": true,
|
25 |
+
"vocab_size": 30522
|
26 |
+
}
|
models/Similarity_Model/config_sentence_transformers.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "2.0.0",
|
4 |
+
"transformers": "4.6.1",
|
5 |
+
"pytorch": "1.8.1"
|
6 |
+
}
|
7 |
+
}
|
models/Similarity_Model/modules.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"idx": 2,
|
16 |
+
"name": "2",
|
17 |
+
"path": "2_Normalize",
|
18 |
+
"type": "sentence_transformers.models.Normalize"
|
19 |
+
}
|
20 |
+
]
|
models/Similarity_Model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:54609dea3ff88f3167f049eeadbfe780b1173a3117bfac862134ebcd8ce33661
|
3 |
+
size 133506609
|
models/Similarity_Model/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 128,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
models/Similarity_Model/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
models/Similarity_Model/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/Similarity_Model/tokenizer_config.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_basic_tokenize": true,
|
4 |
+
"do_lower_case": true,
|
5 |
+
"mask_token": "[MASK]",
|
6 |
+
"model_max_length": 512,
|
7 |
+
"name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L12-v2/",
|
8 |
+
"never_split": null,
|
9 |
+
"pad_token": "[PAD]",
|
10 |
+
"sep_token": "[SEP]",
|
11 |
+
"special_tokens_map_file": "/root/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L12-v2/special_tokens_map.json",
|
12 |
+
"strip_accents": null,
|
13 |
+
"tokenize_chinese_chars": true,
|
14 |
+
"tokenizer_class": "BertTokenizer",
|
15 |
+
"unk_token": "[UNK]"
|
16 |
+
}
|
models/Similarity_Model/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocessing.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def stride_sentences(texts:list, stride=10):
    """
    Merge consecutive texts into groups of `stride` entries.

    Each group is joined with single spaces and stripped; the last group may
    hold fewer than `stride` entries.
    """
    merged = []
    for start in range(0, len(texts), stride):
        window = texts[start:start + stride]
        merged.append(' '.join(window).strip())
    return merged
|
5 |
+
|
6 |
+
|
7 |
+
def dequestionize(question:str):
    """
    Turn a question into statement-like text.

    Drops whitespace-separated words that are exactly one of the listed
    question words (case-insensitive), removes every '?', and strips the
    result.
    """
    interrogatives = ['what','where','how','who','why']
    kept = []
    for token in question.split():
        if token.lower() in interrogatives:
            continue
        kept.append(token)
    statement = ' '.join(kept)
    return statement.replace('?','').strip()
|
10 |
+
|
11 |
+
|
12 |
+
def create_similarity_text(question:str, answer: str):
    """
    Build the query text for the similarity search: the answer followed by
    the de-questionized question, separated by one space.
    """
    statement = dequestionize(question)
    return answer + ' ' + statement if isinstance(answer, str) else f"{answer} {statement}"
|
15 |
+
|
16 |
+
def create_result_url(base_url,timestamp):
    """
    Append a start-time parameter to a video URL.

    input : base_url (str) - the video URL; timestamp (number) - start time
            in seconds, truncated to an int
    outputs: the URL with `t=<seconds>s` added
    """
    # Use '&' only when the URL already has a query string; links without
    # one (e.g. https://youtu.be/<id>) need '?' to start the query.
    separator = '&' if '?' in base_url else '?'
    full_url = f"{base_url}{separator}t={int(timestamp)}s"
    return full_url
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
youtube-transcript-api
|
3 |
+
streamlit
|
4 |
+
streamlit-player
|