Spaces:
Sleeping
Sleeping
from youtube_transcript_api import YouTubeTranscriptApi as ytapi | |
from youtube_transcript_api.formatters import TextFormatter | |
import json | |
def getSubsText(video_id="", getGenerated=False):
    """Return the subtitles of a video as one single-line string.

    Args:
        video_id: Identifier passed to ``ytapi.list_transcripts``.
        getGenerated: Currently ignored; kept for interface compatibility.

    Returns:
        The transcript formatted by ``TextFormatter`` with every newline
        replaced by a space.

    Raises:
        ValueError: If the transcript listing yields no transcripts.
    """
    # TODO: implement getGenerated
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"no transcripts available for video {video_id!r}")

    # Only the last listed transcript is used, so fetch just that one instead
    # of issuing a request per transcript and discarding the results.
    data = transcripts[-1].fetch()
    return TextFormatter().format_transcript(data).replace("\n", " ")
def getSubs(video_id="", getGenerated=False, chunker=None):
    """Return the fetched subtitle data of a video.

    Args:
        video_id: Identifier passed to ``ytapi.list_transcripts``.
        getGenerated: Currently ignored; kept for interface compatibility.
        chunker: Currently unused; kept for interface compatibility.

    Returns:
        Whatever ``fetch()`` returns for the last listed transcript.

    Raises:
        ValueError: If the transcript listing yields no transcripts.
    """
    # TODO: implement getGenerated
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"no transcripts available for video {video_id!r}")

    # Only the last listed transcript is returned, so fetch just that one
    # rather than requesting every transcript and throwing the rest away.
    return transcripts[-1].fetch()
class subs:
    """Subtitles of one video plus helpers that reformat them.

    ``self.subs`` stores the value returned by the module-level ``getSubs``
    function. The methods below iterate it and read the ``text``, ``start``
    and ``duration`` keys of each entry.
    """

    def __init__(self, video_id="", generated=False):
        """Fetch and store the subtitles for ``video_id``."""
        self.video_id = video_id
        self.generated = generated
        # Inside a method this name resolves to the module-level getSubs
        # function, not to the getSubs method defined further down.
        self.subs = getSubs(video_id, generated)

    def __sizeof__(self) -> int:
        """Return the number of subtitle entries.

        NOTE: this overrides the hook consulted by ``sys.getsizeof``, so the
        value is an entry count rather than a size in bytes.
        """
        return sum(1 for _ in self.subs)

    def getText(self):
        """Return all subtitles as one string with newlines turned into spaces."""
        return TextFormatter().format_transcript(self.subs).replace("\n", " ")

    def getSubs(self):
        """Return one ``text:::duration`` line per subtitle entry."""
        return "\n".join(
            f"{entry['text']}:::{entry['duration']}" for entry in self.subs
        )

    def getSubsRaw(self):
        """Return the stored subtitle data unchanged."""
        return self.subs

    def getSubsList(self, size=100):
        """Group consecutive subtitle lines into chunks of about ``size`` chars.

        Lines are joined with single spaces. A line is added to the current
        chunk while the chunk (including separators) stays within ``size``;
        otherwise the chunk is closed and the line starts a new one. A single
        line longer than ``size`` becomes a chunk of its own.

        Args:
            size: Maximum length of a chunk before it is closed.

        Returns:
            A list of ``[chunk_text, timestamp]`` pairs. ``timestamp`` is the
            ``start`` value of the most recently read line at the moment the
            chunk was closed: the line that no longer fit, or the final line
            for the last chunk.
        """
        chunks = []
        current_chunk = ""  # limited to {size}
        current_start = 0
        for subline in self.subs:
            text = subline["text"]
            current_start = subline["start"]
            if len(current_chunk) + len(text) + 1 <= size:
                current_chunk += f"{text} "
                continue
            closed = current_chunk.strip()
            # Nothing accumulated yet (first line already too long): do not
            # emit an empty chunk.
            if closed:
                chunks.append([closed, current_start])
            current_chunk = f"{text} "

        # Flush whatever is left; without this the tail of the transcript
        # would never be returned.
        remainder = current_chunk.strip()
        if remainder:
            chunks.append([remainder, current_start])
        return chunks