Spaces:
Runtime error
Runtime error
import regex as re | |
from youtube_transcript_api import YouTubeRequestFailed, YouTubeTranscriptApi | |
from preprocessing import stride_sentences | |
def validate_youtube_link(url: str) -> str:
    """
    Validate the YouTube video link provided and return its video ID.

    input : url (str)
    outputs: video_id (str), the 11-character ID captured from the link
    raises : AssertionError("Invalid YouTube Link") when the link matches
             none of the known URL shapes or the captured ID is not
             exactly 11 characters long
    """
    yt_regex = r"^.*(youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=|\&v=|\?v=)([^#\&\?]*).*"
    matches = re.findall(yt_regex, url)
    # A URL that matches none of the patterns yields an empty list; check for
    # that first so the caller gets the same error as for a malformed ID
    # rather than an IndexError from matches[0].
    # The error is raised explicitly (not via `assert`) so the check still
    # runs under `python -O`; AssertionError is kept as the exception type
    # so existing callers that catch it keep working.
    if not matches or len(matches[0][1]) != 11:
        raise AssertionError("Invalid YouTube Link")
    video_id: str = matches[0][1]
    return video_id
def zip_transcript(transcript:list) -> dict:
    """
    Split transcript entries into parallel lists of start times and texts.

    input : transcript (list of dicts, each with 'start' and 'text' keys)
    outputs: dict with 'timestamps' (the 'start' values) and 'texts'
             (each 'text' stripped, with newlines replaced by spaces),
             both in the original entry order
    """
    # Walk the transcript once, pairing each start time with its cleaned text.
    pairs = [
        (entry['start'], entry['text'].strip().replace('\n',' '))
        for entry in transcript
    ]
    return {
        'timestamps': [start for start, _ in pairs],
        'texts': [text for _, text in pairs],
    }
def full_text(transcript: list) -> str:
    """
    Join the 'text' of every transcript entry into one string.

    input : transcript (list of dicts, each with a 'text' key)
    outputs: the texts joined by single spaces, with leading and trailing
             whitespace removed from the combined result
    """
    joined = ' '.join(entry['text'] for entry in transcript)
    return joined.strip()
def fetch_transcript(url: str) -> list:
    """
    Fetch the transcript for the YouTube video behind the given link.

    input : url (str)
    outputs: transcript (list) as returned by
             YouTubeTranscriptApi.get_transcript for the video ID
    raises : whatever validate_youtube_link raises for an invalid link;
             Exception('YouTube Request Failed, try again later.') when the
             API raises YouTubeRequestFailed
    """
    video_id = validate_youtube_link(url)
    try:
        transcript:list = YouTubeTranscriptApi.get_transcript(video_id=video_id)
    except YouTubeRequestFailed as err:
        # Chain the original failure so the underlying request error stays
        # visible in the traceback as the explicit cause.
        raise Exception('YouTube Request Failed, try again later.') from err
    return transcript
if __name__ == '__main__':
    # Manual smoke test: fetch one sample transcript, group its texts with
    # stride_sentences, and print the first group.
    sample = 'https://www.youtube.com/watch?v=t6V9i8fFADI'
    sample2 = 'https://www.youtube.com/watch?v=1nLHIM2IPRY'
    fake_sample = 'https://www.youtube.com/watch?v=asdf3'
    transcript = fetch_transcript(url=sample)
    # zip_transcript returns a dict; tuple-unpacking it would yield its keys
    # ('timestamps', 'texts') rather than the lists, so index it by key.
    zipped = zip_transcript(transcript)
    times = zipped['timestamps']
    texts = zipped['texts']
    texts = stride_sentences(texts)
    print(texts[0])
    # with open('sample_group.txt','w') as f:
    #     for group in groups:
    #         f.write(f"{group}\n\n")