vilson
committed on
Commit
•
9db894e
1
Parent(s):
b09053c
App
Browse files- README.md +1 -13
- app.py +65 -0
- qa/chains.py +6 -0
- qa/loader.py +7 -0
- qa/manager.py +24 -0
- qa/model.py +6 -0
- qa/split.py +7 -0
- qa/vector_store.py +25 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,13 +1 @@
|
|
1 |
-
|
2 |
-
title: Youtube Retrieval Qa
|
3 |
-
emoji: 📚
|
4 |
-
colorFrom: indigo
|
5 |
-
colorTo: red
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.32.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# youtube-retrieval-qa
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
import gradio as gr
|
4 |
+
from qa.manager import YoutubeQA
|
5 |
+
|
6 |
+
DESCRIPTION = """
|
7 |
+
|
8 |
+
<h1> <center> 🤗 Hello. This App will help you do questions on youtube videos.</center> </h1>
|
9 |
+
|
10 |
+
<h4>
|
11 |
+
Follow this steps to use 😉:
|
12 |
+
</h4>
|
13 |
+
|
14 |
+
<ol>
|
15 |
+
<li>Set your OpenAI Key</li>
|
16 |
+
<li>Set your Youtube URL</li>
|
17 |
+
<li>Ask!</li>
|
18 |
+
</ol>
|
19 |
+
"""
|
20 |
+
|
21 |
+
# Module-level question-answering pipeline: instanciate_retriver() loads it
# and respond() queries it.
# NOTE(review): as a module global this object looks to be shared by every
# visitor of the app — confirm that is intended.
qa = YoutubeQA()
|
22 |
+
|
23 |
+
def set_openai_key(key: str):
    """Store the OpenAI API key and flag the vector DB as not ready.

    The key is exported through the ``OPENAI_API_KEY`` environment variable
    (presumably where the langchain OpenAI wrappers pick it up — they are
    constructed elsewhere without an explicit key).

    Args:
        key: Text typed into the "OpenAI Key" textbox.

    Returns:
        A Gradio update that sets the status field to "Not Ready".
    """
    # Keys are normally pasted in; surrounding spaces or newlines would
    # otherwise be stored verbatim. ``or ""`` guards against a None value,
    # which os.environ cannot store.
    os.environ["OPENAI_API_KEY"] = (key or "").strip()
    # Set status field to Not Ready
    return gr.update(lines=1, value="Not Ready 🥴")
|
27 |
+
|
28 |
+
def instanciate_retriver(url: str):
    """Prepare the shared QA pipeline for the video at *url*.

    Runs the three loading stages of the module-level ``qa`` object in order:
    model, vector store for the given URL, then the retrieval chain.

    Args:
        url: Youtube URL typed into the "Youtube URL" textbox.

    Returns:
        A Gradio update that sets the status field to "Ready".
    """
    qa.load_model()
    qa.load_vector_store(url)
    qa.load_retriever()

    # Only reached when every stage above completed without raising.
    ready_status = gr.update(lines=1, value="Ready 😎")
    return ready_status
|
34 |
+
|
35 |
+
def respond(message: str, chat_history: List[tuple]):
    """Answer one chat message and append the exchange to the history.

    Args:
        message: Question typed by the user.
        chat_history: Chatbot history as ``(user_message, bot_message)``
            pairs. It is extended in place when an answer is produced.

    Returns:
        A ``(textbox_value, chat_history)`` pair; the first item is always
        an empty string so the question textbox is cleared.
    """
    history = chat_history if chat_history is not None else []

    # Nothing to ask: do not send a blank question to the chain.
    if not message or not message.strip():
        return "", history

    # The chain only exists after the URL has been set; answer in the chat
    # instead of failing with an AttributeError.
    if getattr(qa, "retrieval_qa", None) is None:
        bot_message = "⚠️ Not ready yet: set your OpenAI Key and Youtube URL first."
    else:
        bot_message = qa.run(message)

    history.append((message, bot_message))
    return "", history
|
39 |
+
|
40 |
+
# User interface: an intro text followed by three tabs (QA chat, Youtube URL,
# OpenAI key). Both "Set" buttons report their outcome in the status box of
# the QA tab.
with gr.Blocks() as app:

    gr.Markdown(DESCRIPTION)

    with gr.Tab("QA"):
        status = gr.Textbox(label="🤔 Vector DB Status:", interactive=False)
        chatbot = gr.Chatbot(label="🤖 Bot Answer:")
        question = gr.Textbox(
            label="🕵️♀️ Question:",
            placeholder="Write your question here and press enter",
        )
        clear = gr.Button("Clear")

        # Pressing enter sends the question; respond() returns the new
        # textbox value and the updated chat history.
        question.submit(
            fn=respond,
            inputs=[question, chatbot],
            outputs=[question, chatbot],
        )
        # Returning None for the chatbot output empties the conversation.
        clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

    with gr.Tab("Youtube URL"):
        url = gr.Textbox(
            label="🎞️ URL:",
            lines=1,
            placeholder="Set your Youtube URL here...",
        )
        url_button = gr.Button("Set URL")

    with gr.Tab("OpenAI Key"):
        key = gr.Textbox(
            label="🔑 Key:",
            type="password",
            placeholder="Set your OpenAI Key here...",
        )
        key_button = gr.Button("Set Key")

    url_button.click(fn=instanciate_retriver, inputs=url, outputs=status)
    key_button.click(fn=set_openai_key, inputs=key, outputs=status)

app.launch()
|
qa/chains.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable
|
2 |
+
|
3 |
+
def retrieval_qa(llm: Callable, retriever: Callable) -> Callable:
    """Build a ``RetrievalQA`` chain of type "stuff".

    Args:
        llm: Language model passed to the chain.
        retriever: Retriever passed to the chain.

    Returns:
        The object returned by ``RetrievalQA.from_chain_type``.
    """
    # langchain is imported at call time, inside the function.
    from langchain.chains import RetrievalQA

    chain_options = {"llm": llm, "chain_type": "stuff", "retriever": retriever}
    return RetrievalQA.from_chain_type(**chain_options)
|
qa/loader.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
def youtube_doc_loader(url: str) -> List:
    """Load the documents for a Youtube video through langchain.

    Args:
        url: Youtube video URL handed to ``YoutubeLoader.from_youtube_url``.

    Returns:
        Whatever ``YoutubeLoader.load()`` returns for that URL; video info
        is not requested (``add_video_info=False``).
    """
    # langchain is imported at call time, inside the function.
    from langchain.document_loaders import YoutubeLoader

    return YoutubeLoader.from_youtube_url(url, add_video_info=False).load()
|
qa/manager.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from qa.chains import retrieval_qa
|
2 |
+
from qa.loader import youtube_doc_loader
|
3 |
+
from qa.model import load_llm
|
4 |
+
from qa.split import split_document
|
5 |
+
from qa.vector_store import create_vector_store
|
6 |
+
|
7 |
+
class YoutubeQA:
    """Question answering over a Youtube video.

    Intended call order: ``load_model()``, ``load_vector_store(url)``,
    ``load_retriever()``, then ``run(question)`` as often as needed.
    """

    def __init__(self):
        """Create an empty pipeline; every stage is loaded on demand."""
        # All state is declared up front so that out-of-order calls can be
        # detected instead of failing with an AttributeError.
        self.llm = None
        self.retriver = None
        self.retrieval_qa = None

    def load_model(self) -> None:
        """Load the language model with ``load_llm()`` defaults."""
        self.llm = load_llm()

    def load_vector_store(self, url: str) -> None:
        """Load the video at *url*, split it and build its retriever.

        Args:
            url: Youtube video URL.
        """
        data = youtube_doc_loader(url=url)
        docs = split_document(data=data)
        self.retriver = create_vector_store(docs=docs)

    def load_retriever(self) -> None:
        """Build the retrieval QA chain from the loaded model and retriever.

        Raises:
            RuntimeError: If ``load_model()`` or ``load_vector_store()`` has
                not been called yet.
        """
        if self.llm is None or self.retriver is None:
            raise RuntimeError(
                "Call load_model() and load_vector_store() before load_retriever()."
            )
        self.retrieval_qa = retrieval_qa(llm=self.llm, retriever=self.retriver)

    def run(self, question: str) -> str:
        """Answer *question* with the retrieval QA chain.

        Args:
            question: Question about the loaded video.

        Returns:
            The value returned by the chain's ``run`` method.

        Raises:
            RuntimeError: If ``load_retriever()`` has not been called yet.
        """
        if self.retrieval_qa is None:
            raise RuntimeError("Call load_retriever() before run().")
        return self.retrieval_qa.run(question)
|
qa/model.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable
|
2 |
+
|
3 |
+
def load_llm(temperature: float = 0.0, model: str = 'gpt-3.5-turbo') -> Callable:
    """Create the chat model used to answer questions.

    Args:
        temperature: Sampling temperature forwarded to ``ChatOpenAI``.
        model: Model name forwarded to ``ChatOpenAI``.

    Returns:
        A ``ChatOpenAI`` instance built with the given settings.
    """
    # langchain is imported at call time, inside the function.
    from langchain.chat_models import ChatOpenAI

    settings = {"temperature": temperature, "model": model}
    return ChatOpenAI(**settings)
|
qa/split.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
def split_document(data: List, chunk_size: int = 3000, chunk_overlap: int = 400) -> List:
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split, as produced by a langchain loader.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.
            Defaults to 400, the value that used to be hard-coded, so
            existing callers are unaffected.

    Returns:
        The chunks returned by ``split_documents``.
    """
    # langchain is imported at call time, inside the function.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
|
qa/vector_store.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, List
|
2 |
+
|
3 |
+
def create_vector_store(
    docs: List,
    metric: str = 'cos',
    top_k: int = 4
) -> Callable:
    """Index *docs* in a FAISS store and return a retriever over it.

    Args:
        docs: Documents to embed and index.
        metric: Stored in the retriever's search kwargs under
            ``'distance_metric'``.
        top_k: Stored in the retriever's search kwargs under ``'k'``.

    Returns:
        The retriever obtained from ``FAISS.as_retriever()``.
    """
    # langchain is imported at call time, inside the function.
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS

    # Embeds the documents and stores them next to their raw text.
    # Note: This will make an API call to OpenAI
    index = FAISS.from_documents(docs, OpenAIEmbeddings())

    doc_retriever = index.as_retriever()

    # Retriever configuration.
    # NOTE(review): 'distance_metric' may not be a search option FAISS acts
    # on — confirm it has any effect.
    doc_retriever.search_kwargs.update(distance_metric=metric, k=top_k)

    return doc_retriever
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
openai
|
3 |
+
youtube-transcript-api
|
4 |
+
faiss-cpu
|
5 |
+
tiktoken
|