Spaces:

YiYiXu
/

it-happened-one-frame-2

Runtime error

App Files Files Community

yiyixuxu committed on Jun 7, 2022

Commit

5f4ce2c

•

1 Parent(s): 15b3749

added batch processing for image encoding

Browse files

Files changed (1) hide show

app.py +48 -43

app.py CHANGED Viewed

@@ -30,19 +30,24 @@ def select_video_format(url, format_note='480p', ext='mp4'):
     format_id = format.get('format_id', None)
     fps = format.get('fps', None)
     print(f'format selected: {format}')
-    return(format_id, fps)
-def download_video(url,format_id):
-    # testing
-    print(f"testing...all the files in local directory: {os.listdir('.')}")
     ydl_opts = {
       'format':format_id,
-      'outtmpl': "%(id)s.%(ext)s"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         try:
             ydl.cache.remove()
             meta = ydl.extract_info(url)
-            save_location = meta['id'] + '.' + meta['ext']
         except youtube_dl.DownloadError as error:
             print(f'error with download_video function: {error}')
         return(save_location)
@@ -51,17 +56,17 @@ def process_video_parallel(video, skip_frames, dest_path, num_processes, process
     cap = cv2.VideoCapture(video)
     frames_per_process = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) // (num_processes)
     count =  frames_per_process * process_number
     print(f"worker: {process_number}, process frames {count} ~ {frames_per_process * (process_number + 1)} \n total number of frames: {cap.get(cv2.CAP_PROP_FRAME_COUNT)} \n video: {video}; isOpen? : {cap.isOpened()}")
     while count < frames_per_process * (process_number + 1) :
         ret, frame = cap.read()
         if not ret:
             break
-        count += 1
-        if (count - frames_per_process * process_number) % skip_frames ==0:
           filename =f"{dest_path}/{count}.jpg"
           cv2.imwrite(filename, frame)
           #print(f"saved {filename}")
     cap.release()
@@ -74,9 +79,8 @@ def vid2frames(url, sampling_interval=1, ext='mp4'):
         shutil.rmtree(dest_path)
         dest_path.mkdir(parents=True)
     # figure out the format for download,
-    # by default select 480p, if not available, choose the best format available
-    # mp4
-    format_id, fps = select_video_format(url, format_note='480p', ext='mp4')
     # download the video
     video = download_video(url,format_id)
     # calculate skip_frames
@@ -85,27 +89,16 @@ def vid2frames(url, sampling_interval=1, ext='mp4'):
     except:
         skip_frames = int(30 * sampling_interval)
     print(f'video saved at: {video}, fps:{fps}, skip_frames: {skip_frames}')
     # extract video frames at given sampling interval with multiprocessing -
-    print('extracting frames...')
-    n_workers = min(os.cpu_count(), 1)
-   # testing..
-    cap = cv2.VideoCapture(video)
-    print(f'video: {video}; isOpen? : {cap.isOpened()}')
-    print(f'n_workers: {n_workers}')
     with Pool(n_workers) as pool:
         pool.map(partial(process_video_parallel, video, skip_frames, dest_path, n_workers), range(n_workers))
-    # read frames
-    original_images = []
-    images = []
-    filenames = sorted(dest_path.glob('*.jpg'),key=lambda p: int(p.stem))
-    print(f"extracted {len(filenames)} frames")
-    for filename in filenames:
-      image = Image.open(filename).convert("RGB")
-      original_images.append(image)
-      images.append(preprocess(image))
-    return original_images, images
 def captioned_strip(images, caption=None, times=None, rows=1):
@@ -116,8 +109,6 @@ def captioned_strip(images, caption=None, times=None, rows=1):
         img.paste(img_, (i // rows * w, increased_h + (i % rows) * h))
     if caption is not None:
         draw = ImageDraw.Draw(img)
-        #font = ImageFont.load_default()
-        #font_small = ImageFont.truetype("arial.pil", 12)
         font = ImageFont.truetype(
             "/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf", 16
         )
@@ -131,26 +122,40 @@ def captioned_strip(images, caption=None, times=None, rows=1):
           (255, 255, 255), font=font_small)
     return img
-def run_inference(url, sampling_interval, search_query):
-    original_images, images = vid2frames(url,sampling_interval)
-    image_input = torch.tensor(np.stack(images)).to(device)
-    print("testing.. created image_input")
     with torch.no_grad():
-        image_features = model.encode_image(image_input)
         text_features = model.encode_text(clip.tokenize(search_query).to(device))
-    image_features /= image_features.norm(dim=-1, keepdim=True)
-    text_features /= text_features.norm(dim=-1, keepdim=True)
     similarity = (100.0 * image_features @ text_features.T)
     values, indices = similarity.topk(4, dim=0)
-    print("testing.. selected best frames")
-    best_frames = [original_images[ind] for ind in indices]
     times = [f'{datetime.timedelta(seconds = ind[0].item() * sampling_interval)}' for ind in indices]
-    print("testing... before captioned_strip func")
     image_output = captioned_strip(best_frames,search_query, times,2)
     title = search_query
-    print("testing... after captioned_strip func")
     return(title, image_output)
 inputs = [gr.inputs.Textbox(label="Give us the link to your youtube video!"),

     format_id = format.get('format_id', None)
     fps = format.get('fps', None)
     print(f'format selected: {format}')
+    return(format, format_id, fps)
+# to-do: delete saved videos
def download_video(url, format_id, n_keep=10):
    """Download ``url`` in the given format into the local ``videos/`` folder.

    Args:
        url: Address of the video, passed straight to ``extract_info``.
        format_id: Format identifier handed to the downloader as its
            ``'format'`` option.
        n_keep: Currently unused. Kept in the signature for callers; it
            looks intended for pruning old downloads -- TODO confirm.

    Returns:
        The path of the saved file as a string, ``videos/<id>.<ext>``,
        built from the ``id`` and ``ext`` fields of the extracted metadata.

    Raises:
        youtube_dl.DownloadError: If extraction/download fails. The error is
            printed and then re-raised; previously the function fell through
            to the ``return`` and died with an unrelated ``UnboundLocalError``.
    """
    ydl_opts = {
        'format': format_id,
        'outtmpl': "videos/%(id)s.%(ext)s",
    }
    # Make sure the output folder exists; a pre-existing folder is fine.
    Path('videos').mkdir(parents=True, exist_ok=True)

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            # Clear the downloader's cache before every extraction.
            ydl.cache.remove()
            meta = ydl.extract_info(url)
        except youtube_dl.DownloadError as error:
            print(f'error with download_video function: {error}')
            # Without metadata there is no file to report; surface the
            # real failure instead of returning an unbound name.
            raise
    # Mirrors the 'outtmpl' pattern above.
    return 'videos/' + meta['id'] + '.' + meta['ext']
     cap = cv2.VideoCapture(video)
     frames_per_process = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) // (num_processes)
     count =  frames_per_process * process_number
+    cap.set(cv2.CAP_PROP_POS_FRAMES, count)
     print(f"worker: {process_number}, process frames {count} ~ {frames_per_process * (process_number + 1)} \n total number of frames: {cap.get(cv2.CAP_PROP_FRAME_COUNT)} \n video: {video}; isOpen? : {cap.isOpened()}")
     while count < frames_per_process * (process_number + 1) :
         ret, frame = cap.read()
         if not ret:
             break
+        if count  % skip_frames ==0:
           filename =f"{dest_path}/{count}.jpg"
           cv2.imwrite(filename, frame)
           #print(f"saved {filename}")
+        count += 1
     cap.release()
         shutil.rmtree(dest_path)
         dest_path.mkdir(parents=True)
     # figure out the format for download,
+    # by default select 480p and .mp4
+    format, format_id, fps = select_video_format(url, format_note='480p', ext='mp4')
     # download the video
     video = download_video(url,format_id)
     # calculate skip_frames
     except:
         skip_frames = int(30 * sampling_interval)
     print(f'video saved at: {video}, fps:{fps}, skip_frames: {skip_frames}')
     # extract video frames at given sampling interval with multiprocessing -
+    n_workers = min(os.cpu_count(), 12)
+    print(f'now extracting frames with {n_workers} process...')
     with Pool(n_workers) as pool:
         pool.map(partial(process_video_parallel, video, skip_frames, dest_path, n_workers), range(n_workers))
+    return(skip_frames, dest_path)
 def captioned_strip(images, caption=None, times=None, rows=1):
         img.paste(img_, (i // rows * w, increased_h + (i % rows) * h))
     if caption is not None:
         draw = ImageDraw.Draw(img)
         font = ImageFont.truetype(
             "/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf", 16
         )
           (255, 255, 255), font=font_small)
     return img
def run_inference(url, sampling_interval, search_query, bs=256):
    """Find the video frames that best match a text query.

    The video is split into frames by ``vid2frames``, the frames are encoded
    in batches with ``model.encode_image``, and the frames whose normalized
    embeddings score highest against the normalized text embedding of
    ``search_query`` are assembled into one captioned image.

    Args:
        url: Video address forwarded to ``vid2frames``.
        sampling_interval: Forwarded to ``vid2frames``; also used as the
            number of seconds per extracted frame when labelling results.
        search_query: Text to search for; also used as the returned title.
        bs: Maximum number of frames encoded per batch.

    Returns:
        A ``(title, image_output)`` tuple where ``title`` is ``search_query``
        and ``image_output`` is the result of ``captioned_strip``.

    Raises:
        ValueError: If no frames were extracted from the video.
    """
    # vid2frames also returns skip_frames, which is not needed here.
    _, path_frames = vid2frames(url, sampling_interval)
    # Frame files are named "<frame number>.jpg"; sort them numerically.
    filenames = sorted(path_frames.glob('*.jpg'), key=lambda p: int(p.stem))
    n_frames = len(filenames)
    if n_frames == 0:
        # Previously this surfaced as "range() arg 3 must not be zero".
        raise ValueError(f"no frames were extracted from {url}")
    # Never use a batch larger than the data, and never a step below 1.
    bs = max(1, min(n_frames, bs))
    print(f"extracted {n_frames} frames, now encoding images")
    print(f"batch size :{bs} ; number of batches: {len(range(0, n_frames,bs))}")

    # Encode one batch at a time and concatenate afterwards, so the result
    # keeps whatever dtype and embedding width the model actually produces
    # (a preallocated float16 x 512 buffer would not match a model that is
    # not running in half precision or that uses another embedding size).
    encoded_batches = []
    for start in range(0, n_frames, bs):
        images = []
        for filename in filenames[start:start + bs]:
            # Close each file as soon as its pixels have been converted.
            with Image.open(filename) as image:
                images.append(preprocess(image.convert("RGB")))
        batch_image_input = torch.tensor(np.stack(images)).to(device)
        with torch.no_grad():
            batch_image_features = model.encode_image(batch_image_input)
            batch_image_features /= batch_image_features.norm(dim=-1, keepdim=True)
        encoded_batches.append(batch_image_features)
    image_features = torch.cat(encoded_batches)

    # Encode and normalize the search query.
    with torch.no_grad():
        text_features = model.encode_text(clip.tokenize(search_query).to(device))
        text_features /= text_features.norm(dim=-1, keepdim=True)

    similarity = 100.0 * image_features @ text_features.T
    # Ask for at most as many hits as there are frames.
    _, indices = similarity.topk(min(4, n_frames), dim=0)
    # Column 0 holds the ranking for the (first) query.
    best_positions = indices[:, 0].tolist()

    best_frames = []
    for position in best_positions:
        with Image.open(filenames[position]) as frame:
            best_frames.append(frame.convert("RGB"))
    # The i-th extracted frame is labelled as i * sampling_interval seconds.
    times = [
        f'{datetime.timedelta(seconds=position * sampling_interval)}'
        for position in best_positions
    ]
    # NOTE(review): assumes captioned_strip copes with fewer than 4 images
    # when rows=2 -- its full body is not visible here; confirm.
    image_output = captioned_strip(best_frames, search_query, times, 2)
    title = search_query
    return (title, image_output)
 inputs = [gr.inputs.Textbox(label="Give us the link to your youtube video!"),