Spaces:

shreydan
/

youtube-QandA

Runtime error

App Files Files Community

shreydan commited on Feb 27, 2023

Commit

697eefa

•

1 Parent(s): d309158

add all

Browse files

Files changed (24) hide show

README.md +4 -6
app.py +39 -0
fetch_transcript.py +70 -0
model.py +61 -0
models/QA_Model/config.json +30 -0
models/QA_Model/merges.txt +0 -0
models/QA_Model/pytorch_model.bin +3 -0
models/QA_Model/special_tokens_map.json +51 -0
models/QA_Model/tokenizer.json +0 -0
models/QA_Model/tokenizer_config.json +67 -0
models/QA_Model/vocab.json +0 -0
models/Similarity_Model/1_Pooling/config.json +7 -0
models/Similarity_Model/README.md +176 -0
models/Similarity_Model/config.json +26 -0
models/Similarity_Model/config_sentence_transformers.json +7 -0
models/Similarity_Model/modules.json +20 -0
models/Similarity_Model/pytorch_model.bin +3 -0
models/Similarity_Model/sentence_bert_config.json +4 -0
models/Similarity_Model/special_tokens_map.json +7 -0
models/Similarity_Model/tokenizer.json +0 -0
models/Similarity_Model/tokenizer_config.json +16 -0
models/Similarity_Model/vocab.txt +0 -0
preprocessing.py +18 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,12 +1,10 @@
 ---
-title: Youtube QandA
-emoji: 🐨
-colorFrom: blue
-colorTo: purple
 sdk: streamlit
 sdk_version: 1.17.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Youtube Q&A
+emoji: 📹️
+colorFrom: red
+colorTo: black
 sdk: streamlit
 sdk_version: 1.17.0
 app_file: app.py
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import streamlit as st
+from streamlit_player import st_player
+from model import Engine
+from fetch_transcript import fetch_transcript
+from preprocessing import create_similarity_text, create_result_url
# Page header.
with st.container():
    st.title('YouTube Q&A Search')
    st.write('Ask YouTube videos questions and get your answers :)')

# Input form; the answer is rendered below it once the button is pressed.
with st.container():
    url_input = st.text_input(label='Video',placeholder='enter YouTube video url')
    question_input = st.text_input(label='Question',placeholder='enter your question')
    get_ans = st.button(label='Answer!')

    video_url = url_input.strip()
    question = question_input.strip()

    # Test the strings themselves: comparing len(...) (an int) against ''
    # is always true and would let empty fields through.
    if get_ans and video_url and question:
        with st.spinner('loading your video...'):
            try:
                transcript = fetch_transcript(video_url)
            except Exception as err:
                # UI boundary: report the failure on the page instead of
                # crashing the script with a traceback.
                st.error(f'Could not load the video transcript: {err}')
                st.stop()
            model = Engine(transcript)
        with st.spinner('finding an answer...'):
            answer = model.ask(question)
            similarity_text = create_similarity_text(question,answer)
            # Only the start time of the best matching transcript group is used.
            _, timestamps = model.find_similar(similarity_text)
            url = create_result_url(video_url,timestamps[0])
        with st.container():
            st.caption('Extracted Answer:')
            st.write(answer)
            st.caption('In Video:')
            st_player(url)

fetch_transcript.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import regex as re
+from youtube_transcript_api import YouTubeRequestFailed, YouTubeTranscriptApi
+from preprocessing import stride_sentences
def validate_youtube_link(url: str) -> str:
    """
    this method validates the youtube video link provided and extracts
    the video id from it.
    input  : url (str)
    outputs: video_id (str), the 11 character id found in the link
    raises : ValueError("Invalid YouTube Link") when the link does not match
             or the extracted id is not exactly 11 characters long
    """
    # group 1: the marker that precedes the id (youtu.be/, embed/, watch?v=, ...)
    # group 2: the id itself, i.e. everything up to the next '#', '&' or '?'
    yt_regex = r"^.*(youtu\.be\/|v\/|u\/\w\/|embed\/|watch\?v=|\&v=|\?v=)([^#\&\?]*).*"
    match = re.match(yt_regex, url)
    # Explicit raise instead of assert: asserts are stripped under `python -O`,
    # and a non-matching link must not surface as an IndexError.
    if match is None or len(match.group(2)) != 11:
        raise ValueError("Invalid YouTube Link")
    video_id:str = match.group(2)
    return video_id
def zip_transcript(transcript:list) -> dict:
    """
    this method splits the transcript entries into two parallel lists.
    input  : transcript (list of entries exposing 'start' and 'text')
    outputs: dict with 'timestamps' (the 'start' values) and 'texts'
             (the 'text' values, stripped, with newlines turned into spaces)
    """
    # Single pass over the transcript; each row is (start, cleaned text).
    rows = [
        (entry['start'], entry['text'].strip().replace('\n',' '))
        for entry in transcript
    ]
    return {
        'timestamps': [start for start, _ in rows],
        'texts': [text for _, text in rows]
    }
def full_text(transcript: list) -> str:
    """
    this method joins the 'text' of every transcript entry into one string.
    input  : transcript (list of entries exposing 'text')
    outputs: the texts separated by single spaces, stripped at both ends
    """
    joined = ' '.join(entry['text'] for entry in transcript)
    return joined.strip()
def fetch_transcript(url: str) -> list:
    """
    this method fetches the transcript of the video behind a youtube link.
    input  : url (str)
    outputs: transcript (list) as returned by YouTubeTranscriptApi.get_transcript
    raises : whatever validate_youtube_link raises for a bad link;
             Exception when the YouTube request fails
    """
    video_id = validate_youtube_link(url)
    try:
        transcript:list = YouTubeTranscriptApi.get_transcript(video_id=video_id)
    except YouTubeRequestFailed as err:
        # Chain the original error so the root cause stays in the traceback.
        raise Exception('YouTube Request Failed, try again later.') from err
    return transcript
if __name__ == '__main__':
    # Manual smoke test: fetch one transcript and print its first text group.
    sample = 'https://www.youtube.com/watch?v=t6V9i8fFADI'
    # Alternative inputs for manual testing (a second video and an invalid id).
    sample2 = 'https://www.youtube.com/watch?v=1nLHIM2IPRY'
    fake_sample = 'https://www.youtube.com/watch?v=asdf3'
    transcript = fetch_transcript(url=sample)
    # zip_transcript returns a dict; unpacking the dict itself would yield its
    # key names rather than the lists, so index it by key.
    zipped = zip_transcript(transcript)
    texts = stride_sentences(zipped['texts'])
    print(texts[0])

model.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from pathlib import Path
+import torch
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.util import cos_sim
+from transformers import pipeline
+from preprocessing import stride_sentences
+from fetch_transcript import zip_transcript
class Engine:
    """
    Question answering over one video transcript.

    Loads a question-answering pipeline from ./models/QA_Model and a
    SentenceTransformer from ./models/Similarity_Model, merges the transcript
    texts into groups of `stride` consecutive entries and embeds every group
    so an answer can be traced back to a start time in the video.
    """

    def __init__(self, transcript:list, stride:int=10) -> None:
        """
        input  : transcript (list of entries exposing 'start' and 'text'),
                 stride (int) number of transcript entries merged per group
        raises : ValueError when stride is smaller than 1
        """
        if stride < 1:
            raise ValueError('stride must be a positive integer')

        self.base_path = Path('./models')

        self.qa_model_name = 'QA_Model'
        self.qa_model_path = self.base_path / self.qa_model_name
        self.qa_model = pipeline('question-answering',model=str(self.qa_model_path))

        self.sim_model_name = 'Similarity_Model'
        self.sim_model_path = self.base_path / self.sim_model_name
        # Pass a plain string path, exactly as done for the QA model above.
        self.sim_model = SentenceTransformer(str(self.sim_model_path))

        # Read the two lists by key rather than relying on dict value order.
        zipped = zip_transcript(transcript)
        self.timestamps = zipped['timestamps']
        self.texts = zipped['texts']

        self.stride = stride
        self.text_groups = stride_sentences(self.texts,self.stride)
        self.embeddings = self._encode_transcript()

    def _encode_transcript(self):
        """Return the similarity-model encoding of every text group."""
        return self.sim_model.encode(self.text_groups)

    def ask(self, question_text:str):
        """
        Run the QA pipeline with all text groups joined into one context.
        input  : question_text (str)
        outputs: the 'answer' field of the pipeline result
        """
        result = self.qa_model(
            question=question_text,
            context=' '.join(self.text_groups).strip(),
            doc_stride=256,
            max_answer_len=512,
            max_question_len=128,
        )
        return result['answer']

    def find_similar(self, txt:str, top_k=1):
        """
        Find the text groups most similar to `txt`.
        input  : txt (str), top_k (int) number of groups to return
        outputs: (groups, timestamps) ordered from most to least similar;
                 each timestamp is the start of the first transcript entry
                 in the corresponding group
        """
        query = self.sim_model.encode(txt)
        similarities:torch.Tensor = cos_sim(query,self.embeddings).reshape(-1)
        # argsort is ascending; flip it so the best match comes first.
        indices = torch.argsort(similarities).flip(0)[:top_k].tolist()
        groups = [self.text_groups[i] for i in indices]
        # Group i starts at transcript entry stride*i.
        timestamps = [self.timestamps[self.stride*i] for i in indices]
        return groups, timestamps
if __name__ == '__main__':
    # Smoke test: Engine requires a transcript, so build it from one minimal
    # hand-written entry ('start' and 'text' are the keys zip_transcript reads).
    model = Engine([{'start': 0.0, 'text': 'hello world'}])

models/QA_Model/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "deepset/roberta-base-squad2",
+  "architectures": [
+    "RobertaForQuestionAnswering"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "language": "english",
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "name": "Roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

models/QA_Model/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

models/QA_Model/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d3de9857583a4639b6f23a05a2e9531b7c1c64a0c13adcc9671156bf7bcd740
+size 496296301

models/QA_Model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

models/QA_Model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/QA_Model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,67 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "full_tokenizer_file": null,
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "name_or_path": "deepset/roberta-base-squad2",
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "special_tokens_map_file": "/root/.cache/huggingface/hub/models--deepset--roberta-base-squad2/snapshots/d39b8d4166b0683451bbce6f047de1a238c0b5bf/special_tokens_map.json",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

models/QA_Model/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/Similarity_Model/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "word_embedding_dimension": 384,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}

models/Similarity_Model/README.md ADDED Viewed

	@@ -0,0 +1,176 @@

+---
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+language: en
+license: apache-2.0
+datasets:
+- s2orc
+- flax-sentence-embeddings/stackexchange_xml
+- MS Marco
+- gooaq
+- yahoo_answers_topics
+- code_search_net
+- search_qa
+- eli5
+- snli
+- multi_nli
+- wikihow
+- natural_questions
+- trivia_qa
+- embedding-data/sentence-compression
+- embedding-data/flickr30k-captions
+- embedding-data/altlex
+- embedding-data/simple-wiki
+- embedding-data/QQP
+- embedding-data/SPECTER
+- embedding-data/PAQ_pairs
+- embedding-data/WikiAnswers
+---
+# all-MiniLM-L12-v2
+This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Usage (HuggingFace Transformers)
+Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
+```python
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+# Sentences we want sentence embeddings for
+sentences = ['This is an example sentence', 'Each sentence is converted']
+# Load model from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
+model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
+# Tokenize sentences
+encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+# Compute token embeddings
+with torch.no_grad():
+    model_output = model(**encoded_input)
+# Perform pooling
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+# Normalize embeddings
+sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+print("Sentence embeddings:")
+print(sentence_embeddings)
+```
+## Evaluation Results
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L12-v2)
+------
+## Background
+The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
+contrastive learning objective. We used the pretrained [`microsoft/MiniLM-L12-H384-uncased`](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased) model and fine-tuned it on a
+1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
+We developed this model during the
+[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+organized by Hugging Face. We developed this model as part of the project:
+[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Google's Flax, JAX, and Cloud team members about efficient deep learning frameworks.
+## Intended uses
+Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
+the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
+By default, input text longer than 256 word pieces is truncated.
+## Training procedure
+### Pre-training
+We use the pretrained [`microsoft/MiniLM-L12-H384-uncased`](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
+### Fine-tuning
+We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity for each possible sentence pair in the batch.
+We then apply the cross entropy loss by comparing with true pairs.
+#### Hyper parameters
+We trained our model on a TPU v3-8. We trained the model for 100k steps using a batch size of 1024 (128 per TPU core).
+We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
+a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
+#### Training data
+We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
+We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
+| Dataset                                                  | Paper                                    | Number of training tuples  |
+|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
+| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
+| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
+| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
+| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs  | - | 25,316,456 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs  | - | 21,396,559 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs  | - | 21,396,559 |
+| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
+| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
+| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
+| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
+| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
+| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
+| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
+| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
+| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
+| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
+| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
+| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
+| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
+| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
+| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
+| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
+| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
+| **Total** | | **1,170,060,424** |

models/Similarity_Model/config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L12-v2/",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.20.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

models/Similarity_Model/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.0.0",
+    "transformers": "4.6.1",
+    "pytorch": "1.8.1"
+  }
+}

models/Similarity_Model/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

models/Similarity_Model/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54609dea3ff88f3167f049eeadbfe780b1173a3117bfac862134ebcd8ce33661
+size 133506609

models/Similarity_Model/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 128,
+  "do_lower_case": false
+}

models/Similarity_Model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

models/Similarity_Model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/Similarity_Model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L12-v2/",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": "/root/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L12-v2/special_tokens_map.json",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

models/Similarity_Model/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessing.py ADDED Viewed

	@@ -0,0 +1,18 @@

def stride_sentences(texts:list, stride=10):
    """
    Merge consecutive texts into groups of at most `stride` items.

    Each group is the space-joined, stripped concatenation of its items;
    the last group may hold fewer than `stride` items.
    """
    merged = []
    for start in range(0, len(texts), stride):
        window = texts[start:start+stride]
        merged.append(' '.join(window).strip())
    return merged
def dequestionize(question:str):
    """
    Strip question marks and the question words what/where/how/who/why
    (case-insensitive) from `question`; the remaining words are returned
    joined by single spaces.
    """
    question_words = {'what','where','how','who','why'}
    # Remove '?' BEFORE filtering so that a question word with an attached
    # question mark (e.g. a trailing "why?") is still recognised and dropped.
    words = question.replace('?','').split()
    return ' '.join(word for word in words if word.lower() not in question_words)
def create_similarity_text(question:str, answer: str):
    """
    Build the text used for the similarity search: the answer followed by
    the question after it has been passed through dequestionize.
    """
    stripped_question = dequestionize(question)
    return f"{answer} {stripped_question}"
def create_result_url(base_url,timestamp):
    """
    Append a start-time parameter (`t=<seconds>s`) to a video url.

    `timestamp` is truncated to whole seconds with int(). The parameter is
    joined with '&' when the url already has a query string and with '?'
    otherwise, so links without a query (e.g. https://youtu.be/<id>) stay valid.
    """
    separator = '&' if '?' in base_url else '?'
    full_url = f"{base_url}{separator}t={int(timestamp)}s"
    return full_url

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

# model.py imports sentence_transformers and torch, and fetch_transcript.py
# imports regex; list them explicitly instead of relying on transitive installs.
transformers
sentence-transformers
torch
regex
youtube-transcript-api
streamlit
streamlit-player