import numpy as np
from PIL import Image
import cv2
markdown_default = """
Sa2VA
"""
description = """
**Usage**:
(1) For **Grounded Caption Generation / Interleaved Segmentation**, use a prompt like: *"Could you provide me with a detailed analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer."*
(2) For **Segmentation Output**, use a prompt like: *"Can you please segment xxx in the given image?"*
(3) For **Image Captioning / VQA**, use a prompt like: *"Could you please give me a detailed description of the image?"*
(4) For **Image Conversation**, input any free-form text instruction.
"""
ONE_THIRD = 1.0/3.0
ONE_SIXTH = 1.0/6.0
TWO_THIRD = 2.0/3.0
def desaturate(rgb, factor=0.65):
    """
    Desaturate an RGB color by a given factor.

    :param rgb: A tuple of (r, g, b) where each value is in [0, 255].
    :param factor: The factor by which to scale the saturation;
        0 means completely desaturated, 1 means the original color.
    :return: A tuple of desaturated (r, g, b) values in [0, 255].
    """
    r, g, b = [x / 255.0 for x in rgb]
    h, l, s = rgb_to_hls(r, g, b)
    s *= factor  # scale saturation toward grey; lightness is preserved
    new_r, new_g, new_b = hls_to_rgb(h, l, s)
    return (int(new_r * 255), int(new_g * 255), int(new_b * 255))
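# Worked example (values assume the inlined HLS helpers below):
#     desaturate((255, 0, 0), 0.5)  # -> (191, 63, 63), a muted red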
def rgb_to_hls(r, g, b):
    maxc = max(r, g, b)
    minc = min(r, g, b)
    sumc = (maxc + minc)
    rangec = (maxc - minc)
    l = sumc / 2.0
    if minc == maxc:
        return 0.0, l, 0.0
    if l <= 0.5:
        s = rangec / sumc
    else:
        s = rangec / (2.0 - sumc)
    rc = (maxc - r) / rangec
    gc = (maxc - g) / rangec
    bc = (maxc - b) / rangec
    if r == maxc:
        h = bc - gc
    elif g == maxc:
        h = 2.0 + rc - bc
    else:
        h = 4.0 + gc - rc
    h = (h / 6.0) % 1.0
    return h, l, s


def hls_to_rgb(h, l, s):
    if s == 0.0:
        return l, l, l
    if l <= 0.5:
        m2 = l * (1.0 + s)
    else:
        m2 = l + s - (l * s)
    m1 = 2.0 * l - m2
    return (_v(m1, m2, h + ONE_THIRD), _v(m1, m2, h), _v(m1, m2, h - ONE_THIRD))


def _v(m1, m2, hue):
    hue = hue % 1.0
    if hue < ONE_SIXTH:
        return m1 + (m2 - m1) * hue * 6.0
    if hue < 0.5:
        return m2
    if hue < TWO_THIRD:
        return m1 + (m2 - m1) * (TWO_THIRD - hue) * 6.0
    return m1
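# The three helpers above are inlined ports of the standard library's
# colorsys.rgb_to_hls / colorsys.hls_to_rgb, so round-trips are exact for
# in-gamut values, e.g.:
#     rgb_to_hls(1.0, 0.0, 0.0)  # -> (0.0, 0.5, 1.0)
#     hls_to_rgb(0.0, 0.5, 1.0)  # -> (1.0, 0.0, 0.0)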
def process_markdown(output_str, colors):
    # Normalize the raw model output: drop newlines, collapse double spaces,
    # and strip end-of-sequence tokens. NOTE: the "  " and "</s>" targets are
    # assumed; the literal tokens depend on the tokenizer in use.
    output_str = output_str.replace("\n", "").replace("  ", " ").replace("</s>", "") \
        .replace("<|im_end|>", "").replace("<|end|>", "")
    output_str = output_str.split("ASSISTANT: ")[-1]

    # markdown_out = output_str.replace('[SEG]', '')
    markdown_out = output_str
    # Recolor each grounded phrase: the model emits them inside <p>...</p>
    # tags, which become colored <span> elements. NOTE: the exact tag strings
    # here are assumed; [COLOR] must survive into the style attribute so the
    # loop below can fill it in.
    markdown_out = markdown_out.replace(
        "<p>", "<span style='color:rgb[COLOR];'>")
    markdown_out = markdown_out.replace("</p>", "</span>")
    for color in colors:
        markdown_out = markdown_out.replace("[COLOR]", str(desaturate(tuple(color))), 1)

    markdown_out = f"""
{markdown_out}
"""
    markdown_out = markdown_default + markdown_out
    return markdown_out


# Shared overlay palette, used for both image and video mask rendering.
MASK_COLORS = [
    (255, 0, 0),
    (0, 255, 0),
    (0, 0, 255),
    (255, 255, 0),
    (255, 0, 255),
    (0, 255, 255),
    (128, 128, 255),
    (255, 192, 203),  # Pink
    (165, 42, 42),    # Brown
    (255, 165, 0),    # Orange
    (128, 0, 128),    # Purple
    (0, 0, 128),      # Navy
    (128, 0, 0),      # Maroon
    (128, 128, 0),    # Olive
    (70, 130, 180),   # Steel Blue
    (173, 216, 230),  # Light Blue
    (255, 192, 0),    # Gold
    (255, 165, 165),  # Light Salmon
    (255, 20, 147),   # Deep Pink
]


def show_mask_pred(image, masks):
    masks = [mask[:1] for mask in masks]
    masks = np.concatenate(masks, axis=0)  # (n, h, w)

    selected_colors = []
    _mask_image = np.zeros((masks.shape[1], masks.shape[2], 3), dtype=np.uint8)
    for i, mask in enumerate(masks):
        color = MASK_COLORS[i % len(MASK_COLORS)]
        selected_colors.append(color)
        _mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
        _mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
        _mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]

    # Blend the colored overlay with the original image at 50% opacity.
    image = np.array(image)
    image = image * 0.5 + _mask_image * 0.5
    image = image.astype(np.uint8)
    return image, selected_colors


def show_mask_pred_video(video, masks):
    ret_video = []
    selected_colors = []
    for i_frame in range(len(video)):
        frame_masks = [mask[i_frame:i_frame + 1] for mask in masks]
        frame_masks = np.concatenate(frame_masks, axis=0)
        _mask_image = np.zeros(
            (frame_masks.shape[1], frame_masks.shape[2], 3), dtype=np.uint8)
        for i, mask in enumerate(frame_masks):
            if i_frame == 0:
                color = MASK_COLORS[i % len(MASK_COLORS)]
                selected_colors.append(color)
            else:
                # Reuse the color picked on the first frame so each object
                # keeps a stable color across the whole video.
                color = selected_colors[i]
            _mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
            _mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
            _mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]
        image = np.array(video[i_frame])
        image = image * 0.5 + _mask_image * 0.5
        image = image.astype(np.uint8)
        ret_video.append(image)
    return ret_video, selected_colors


def parse_visual_prompts(points):
    ret = {'points': [], 'boxes': []}
    for item in points:
        if item[2] == 1.0:  # a single click encodes a point prompt
            ret['points'].append([item[0], item[1]])
        elif item[2] == 2.0 or item[2] == 3.0:  # box prompts carry two corners
            ret['boxes'].append([item[0], item[1], item[3], item[4]])
        else:
            raise NotImplementedError
    return ret


def get_video_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Cannot open video file.")
        return []
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames


def get_frames_from_video(video_path, n_frames=5, sample_type="uniform"):
    frames = get_video_frames(video_path)
    ret = []
    if sample_type == "uniform":
        # Sample n_frames evenly; the epsilon keeps the last index in range.
        stride = len(frames) / (n_frames + 1e-4)
        for i in range(n_frames):
            frame = frames[int(i * stride)][:, :, ::-1]  # BGR (OpenCV) -> RGB
            ret.append(Image.fromarray(frame).convert('RGB'))
    else:
        # "begin" sampling: take up to the first 500 frames as-is.
        for frame in frames[:500]:
            frame = frame[:, :, ::-1]
            ret.append(Image.fromarray(frame).convert('RGB'))
    return ret


def preprocess_video(video_path, text):
    # Segmentation needs dense frames from the start of the clip; every other
    # task gets a sparse uniform sample.
    if "Segment" in text or "segment" in text:
        sample_type = 'begin'
    else:
        sample_type = 'uniform'
    return get_frames_from_video(video_path, sample_type=sample_type)


def image2video_and_save(frames, save_path):
    frames_to_video(frames, save_path)
    return save_path


def frames_to_video(
    frames,
    output_path: str,
    fps: int = 24,
) -> bool:
    try:
        frames = [frame[:, :, ::-1] for frame in frames]  # RGB -> BGR for OpenCV

        # Use the first frame to determine the output size.
        height, width = frames[0].shape[:2]

        # Initialize the video writer.
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # Write each frame, then release the writer.
        for frame in frames:
            out.write(frame)
        out.release()

        print(f"Video saved successfully to {output_path}")
        return True
    except Exception as e:
        print(f"Error converting frames to video: {str(e)}")
        return False
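# A minimal, self-contained smoke test for the helpers above. The dummy
# image, masks, and click list are illustrative assumptions, not fixtures
# from the project.
if __name__ == "__main__":
    dummy_image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
    # show_mask_pred expects a list of (n, H, W) arrays; it keeps mask[:1].
    dummy_masks = [np.ones((1, 64, 64), dtype=bool),
                   np.zeros((1, 64, 64), dtype=bool)]
    blended, used_colors = show_mask_pred(dummy_image, dummy_masks)
    print(blended.shape, used_colors)  # (64, 64, 3) and the first two palette colors

    # Click type 1.0 is a point; 2.0/3.0 carry the two corners of a box.
    prompts = parse_visual_prompts([[10, 20, 1.0, 0, 0], [5, 5, 2.0, 50, 60]])
    print(prompts)  # {'points': [[10, 20]], 'boxes': [[5, 5, 50, 60]]}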