youtube-QandA / fetch_transcript.py
shreydan's picture
add all
697eefa
import regex as re
from youtube_transcript_api import YouTubeRequestFailed, YouTubeTranscriptApi
from preprocessing import stride_sentences
def validate_youtube_link(url: str) -> str:
    """
    this method validates the youtube video link provided and extracts
    the video id from it.

    input : url (str)
    outputs: video_id (str), the 11-character id captured from the link

    raises : AssertionError("Invalid YouTube Link") when the pattern does not
             match the url or the captured id is not exactly 11 characters.
    """
    yt_regex = r"^.*(youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=|\&v=|\?v=)([^#\&\?]*).*"
    matches = re.findall(yt_regex, url)
    # Raise explicitly rather than via `assert`: assert statements are removed
    # under `python -O`, and indexing an empty match list would surface as an
    # unrelated IndexError. AssertionError is kept as the exception type so
    # callers that already catch it keep working.
    if not matches or len(matches[0][1]) != 11:
        raise AssertionError("Invalid YouTube Link")
    video_id: str = matches[0][1]
    return video_id
def zip_transcript(transcript:list) -> dict:
    """
    split transcript entries into two parallel lists.

    input : transcript (list of mappings with 'start' and 'text' keys)
    outputs: dict with
             'timestamps' -> the 'start' value of every entry, in order
             'texts'      -> the 'text' value of every entry, stripped of
                             surrounding whitespace and with remaining
                             newlines replaced by single spaces
    """
    # One pass over the entries; each pair is (start, cleaned text).
    pairs = [
        (entry['start'], entry['text'].strip().replace('\n', ' '))
        for entry in transcript
    ]
    return {
        'timestamps': [start for start, _ in pairs],
        'texts': [cleaned for _, cleaned in pairs],
    }
def full_text(transcript: list) -> str:
    """
    join the 'text' of every transcript entry into one string.

    input : transcript (list of mappings with a 'text' key)
    outputs: the texts joined by single spaces, with leading and trailing
             whitespace of the joined result removed (str)
    """
    joined = ' '.join(entry['text'] for entry in transcript)
    return joined.strip()
def fetch_transcript(url: str) -> list:
    """
    fetch the transcript of the video a youtube link points to.

    input : url (str)
    outputs: transcript (list), as returned by
             YouTubeTranscriptApi.get_transcript for the extracted video id

    raises : whatever validate_youtube_link raises for a rejected link;
             Exception('YouTube Request Failed, try again later.') when the
             api call raises YouTubeRequestFailed.
    """
    video_id = validate_youtube_link(url)
    try:
        transcript: list = YouTubeTranscriptApi.get_transcript(video_id=video_id)
    except YouTubeRequestFailed as err:
        # Chain the library error explicitly so the underlying cause is kept
        # on the re-raised exception (__cause__) and shown in the traceback.
        raise Exception('YouTube Request Failed, try again later.') from err
    return transcript
if __name__ == '__main__':
    # Manual smoke test: fetch one sample transcript, run it through
    # stride_sentences and print the first element of the result.
    sample = 'https://www.youtube.com/watch?v=t6V9i8fFADI'
    sample2 = 'https://www.youtube.com/watch?v=1nLHIM2IPRY'
    # The id below is shorter than 11 characters, so validate_youtube_link
    # rejects this link.
    fake_sample = 'https://www.youtube.com/watch?v=asdf3'

    transcript = fetch_transcript(url=sample)
    # zip_transcript returns a dict. Tuple-unpacking it directly would bind
    # the key names ('timestamps', 'texts') instead of the lists, so the
    # values are looked up by key.
    zipped = zip_transcript(transcript)
    times, texts = zipped['timestamps'], zipped['texts']
    # NOTE(review): assumes stride_sentences expects the list of caption
    # texts - confirm against preprocessing.stride_sentences.
    texts = stride_sentences(texts)
    print(texts[0])