Spaces:
Runtime error
Runtime error
File size: 1,821 Bytes
697eefa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import regex as re
from youtube_transcript_api import YouTubeRequestFailed, YouTubeTranscriptApi
from preprocessing import stride_sentences
class InvalidYouTubeLinkError(ValueError, AssertionError):
    """Raised when a URL does not contain a usable YouTube video id.

    Also derives from AssertionError so that callers written against the
    earlier ``assert``-based check keep catching it.
    """


def validate_youtube_link(url: str) -> str:
    """
    Validate the YouTube video link provided and extract its video id.

    input  : url (str)
    outputs: video_id (str), the 11-character id captured from the link
    raises : InvalidYouTubeLinkError ("Invalid YouTube Link") when the link
             does not match the pattern or the captured id is not 11
             characters long
    """
    yt_regex = r"^.*(youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=|\&v=|\?v=)([^#\&\?]*).*"
    match = re.match(yt_regex, url)
    # Raise explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, and a URL with no match at all previously escaped
    # as an IndexError rather than a meaningful validation error.
    if match is None or len(match.group(2)) != 11:
        raise InvalidYouTubeLinkError("Invalid YouTube Link")
    video_id: str = match.group(2)
    return video_id
def zip_transcript(transcript:list) -> dict:
    """
    Split transcript entries into parallel lists of start times and texts.

    input  : transcript (list of mappings with 'start' and 'text' keys)
    outputs: dict with 'timestamps' (the 'start' values) and 'texts' (each
             'text' stripped, with newlines replaced by spaces)
    """
    # Single pass over the entries so each one is read exactly once, in order.
    pairs = [
        (entry['start'], entry['text'].strip().replace('\n',' '))
        for entry in transcript
    ]
    return {
        'timestamps': [start for start, _ in pairs],
        'texts': [caption for _, caption in pairs],
    }
def full_text(transcript: list) -> str:
    """
    Join the 'text' of every transcript entry into one string.

    input  : transcript (list of mappings with a 'text' key)
    outputs: the texts joined by single spaces, with leading and trailing
             whitespace removed from the joined result
    """
    joined = ' '.join(entry['text'] for entry in transcript)
    return joined.strip()
def fetch_transcript(url: str) -> list:
    """
    Fetch the transcript for the YouTube video behind the given link.

    input  : url (str)
    outputs: transcript (list) as returned by
             YouTubeTranscriptApi.get_transcript
    raises : Exception('YouTube Request Failed, try again later.') when the
             API raises YouTubeRequestFailed; any error raised by
             validate_youtube_link for a bad link propagates unchanged
    """
    video_id = validate_youtube_link(url)
    try:
        transcript:list = YouTubeTranscriptApi.get_transcript(video_id=video_id)
    except YouTubeRequestFailed as err:
        # Chain the original failure so the underlying request error stays
        # visible in the traceback as the direct cause.
        raise Exception('YouTube Request Failed, try again later.') from err
    return transcript
if __name__ == '__main__':
    # Manual smoke test: fetch one transcript and print the first strided group.
    sample = 'https://www.youtube.com/watch?v=t6V9i8fFADI'
    # Alternative inputs for manual runs; fake_sample has a too-short video id.
    sample2 = 'https://www.youtube.com/watch?v=1nLHIM2IPRY'
    fake_sample = 'https://www.youtube.com/watch?v=asdf3'
    transcript = fetch_transcript(url=sample)
    # zip_transcript returns a dict; tuple-unpacking it would yield its keys
    # ('timestamps', 'texts') rather than the lists, so index by key instead.
    zipped = zip_transcript(transcript)
    # NOTE(review): presumably stride_sentences expects the list of caption
    # texts -- confirm against preprocessing.stride_sentences.
    texts = stride_sentences(zipped['texts'])
    print(texts[0])
|