Vintern-1B-v3.5-Demo

Running on Zero

App Files Files Community

Vintern-1B-v3.5-Demo / projects /llava_sam2 /gradio /app_utils.py

fffiloni

Migrated from GitHub

d59f323 verified 6 days ago

raw

history blame

9.61 kB

	import numpy as np
	from PIL import Image
	import cv2

	markdown_default = """
	<link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@400;700&display=swap" rel="stylesheet">
	<style>
	.highlighted-text {
	font-family: 'Montserrat', sans-serif;
	font-weight: 600;
	font-size: 14px;
	color: rgb(255, 255, 239);
	background-color: rgb(225, 231, 254);
	border-radius: 7px;
	padding: 5px 7px;
	display: inline-block;
	}
	.regular-text {
	font-family: 'Montserrat', sans-serif;
	font-weight: 400;
	font-size: 14px;
	}
	.highlighted-response {
	font-family: 'Montserrat', sans-serif;
	font-weight: 600;
	font-size: 14px;
	border-radius: 6px;
	padding: 3px 4px;
	display: inline-block;
	}
	</style>
	<span class="highlighted-text" style='color:rgb(107, 100, 239)'>Sa2VA</span>
	"""

	description = """
	Usage : <br>
	&ensp;(1) For Grounded Caption Generation Interleaved Segmentation, input prompt like: "Could you provide me with a detailed analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer." <br>
	&ensp;(2) For Segmentation Output, input prompt like: "Can you please segment xxx in the given image" <br>
	&ensp;(3) For Image Captioning VQA, input prompt like: "Could you please give me a detailed description of the image?" <br>
	&ensp;(4) For Image Conversation, input arbitrary text instruction. <br>
	"""

	ONE_THIRD = 1.0/3.0
	ONE_SIXTH = 1.0/6.0
	TWO_THIRD = 2.0/3.0

	def desaturate(rgb, factor=0.65):
	"""
	Desaturate an RGB color by a given factor.

	:param rgb: A tuple of (r, g, b) where each value is in [0, 255].
	:param factor: The factor by which to reduce the saturation.
	0 means completely desaturated, 1 means original color.
	:return: A tuple of desaturated (r, g, b) values in [0, 255].
	"""
	r, g, b = [x / 255.0 for x in rgb]
	h, l, s = rgb_to_hls(r, g, b)
	l = factor
	new_r, new_g, new_b = hls_to_rgb(h, l, s)
	return (int(new_r * 255), int(new_g * 255), int(new_b * 255))

	def rgb_to_hls(r, g, b):
	maxc = max(r, g, b)
	minc = min(r, g, b)
	sumc = (maxc+minc)
	rangec = (maxc-minc)
	l = sumc/2.0
	if minc == maxc:
	return 0.0, l, 0.0
	if l <= 0.5:
	s = rangec / sumc
	else:
	s = rangec / (2.0-sumc)
	rc = (maxc-r) / rangec
	gc = (maxc-g) / rangec
	bc = (maxc-b) / rangec
	if r == maxc:
	h = bc-gc
	elif g == maxc:
	h = 2.0+rc-bc
	else:
	h = 4.0+gc-rc
	h = (h/6.0) % 1.0
	return h, l, s

	def hls_to_rgb(h, l, s):
	if s == 0.0:
	return l, l, l
	if l <= 0.5:
	m2 = l * (1.0+s)
	else:
	m2 = l+s-(l*s)
	m1 = 2.0*l - m2
	return (_v(m1, m2, h+ONE_THIRD), _v(m1, m2, h), _v(m1, m2, h-ONE_THIRD))

	def _v(m1, m2, hue):
	hue = hue % 1.0
	if hue < ONE_SIXTH:
	return m1 + (m2-m1)hue6.0
	if hue < 0.5:
	return m2
	if hue < TWO_THIRD:
	return m1 + (m2-m1)(TWO_THIRD-hue)6.0
	return m1

	def process_markdown(output_str, colors):
	output_str = output_str.replace("\n", "").replace(" ", " ").replace("<s>", "")\
	.replace("<\|im_end\|>", '').replace("<\|end\|>", "")
	output_str = output_str.split("ASSISTANT: ")[-1]

	# markdown_out = output_str.replace('[SEG]', '')
	markdown_out = output_str
	markdown_out = markdown_out.replace(
	"<p>", "<span class='highlighted-response' style='background-color:rgb[COLOR]'>"
	)
	markdown_out = markdown_out.replace("</p>", "</span>")

	for color in colors:
	markdown_out = markdown_out.replace("[COLOR]", str(desaturate(tuple(color))), 1)

	markdown_out = f"""
	{markdown_out}
	"""
	markdown_out = markdown_default + "<p><span class='regular-text'>" + markdown_out
	return markdown_out

	def show_mask_pred(image, masks):
	masks = [mask[:1] for mask in masks]
	masks = np.concatenate(masks, axis=0) # (n, h, w)

	selected_colors = []

	colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
	(255, 255, 0), (255, 0, 255), (0, 255, 255),
	(128, 128, 255), [255, 192, 203], # Pink
	[165, 42, 42], # Brown
	[255, 165, 0], # Orange
	[128, 0, 128], # Purple
	[0, 0, 128], # Navy
	[128, 0, 0], # Maroon
	[128, 128, 0], # Olive
	[70, 130, 180], # Steel Blue
	[173, 216, 230], # Light Blue
	[255, 192, 0], # Gold
	[255, 165, 165], # Light Salmon
	[255, 20, 147], # Deep Pink
	]

	_mask_image = np.zeros((masks.shape[1], masks.shape[2], 3), dtype=np.uint8)

	for i, mask in enumerate(masks):
	color = colors[i % len(colors)]
	selected_colors.append(color)
	_mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
	_mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
	_mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]


	image = np.array(image)
	image = image * 0.5 + _mask_image * 0.5
	image = image.astype(np.uint8)
	return image, selected_colors

	def show_mask_pred_video(video, masks):
	ret_video = []
	selected_colors = []
	colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
	(255, 255, 0), (255, 0, 255), (0, 255, 255),
	(128, 128, 255), [255, 192, 203], # Pink
	[165, 42, 42], # Brown
	[255, 165, 0], # Orange
	[128, 0, 128], # Purple
	[0, 0, 128], # Navy
	[128, 0, 0], # Maroon
	[128, 128, 0], # Olive
	[70, 130, 180], # Steel Blue
	[173, 216, 230], # Light Blue
	[255, 192, 0], # Gold
	[255, 165, 165], # Light Salmon
	[255, 20, 147], # Deep Pink
	]
	for i_frame in range(len(video)):
	frame_masks = [mask[i_frame:i_frame+1] for mask in masks]
	frame_masks = np.concatenate(frame_masks, axis=0)
	_mask_image = np.zeros((frame_masks.shape[1], frame_masks.shape[2], 3), dtype=np.uint8)

	for i, mask in enumerate(frame_masks):
	if i_frame == 0:
	color = colors[i % len(colors)]
	selected_colors.append(color)
	else:
	color = selected_colors[i]
	_mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
	_mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
	_mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]

	image = np.array(video[i_frame])
	image = image * 0.5 + _mask_image * 0.5
	image = image.astype(np.uint8)
	ret_video.append(image)
	return ret_video, selected_colors

	def parse_visual_prompts(points):
	ret = {'points': [], 'boxes': []}
	for item in points:
	if item[2] == 1.0:
	ret['points'].append([item[0], item[1]])
	elif item[2] == 2.0 or item[2] == 3.0:
	ret['boxes'].append([item[0], item[1], item[3], item[4]])
	else:
	raise NotImplementedError
	return ret

	def get_video_frames(video_path):
	cap = cv2.VideoCapture(video_path)

	if not cap.isOpened():
	print("Error: Cannot open video file.")
	return

	frames = []

	frame_id = 0
	while True:
	ret, frame = cap.read()

	if not ret:
	break

	frames.append(frame)

	frame_id += 1

	cap.release()
	return frames

	def get_frames_from_video(video_path, n_frames=5, sample_type="uniform"):
	frames = get_video_frames(video_path)
	if sample_type == "uniform":
	stride = len(frames) / (n_frames + 1e-4)
	ret = []
	for i in range(n_frames):
	idx = int(i * stride)
	frame = frames[idx]
	frame = frame[:, :, ::-1]
	frame_image = Image.fromarray(frame).convert('RGB')
	ret.append(frame_image)
	else:
	ret = []
	for frame in frames[:500]:
	frame = frame[:, :, ::-1]
	frame_image = Image.fromarray(frame).convert('RGB')
	ret.append(frame_image)
	return ret

	def preprocess_video(video_path, text):
	if "Segment" in text or "segment" in text:
	sample_type = 'begin'
	else:
	sample_type = 'uniform'
	return get_frames_from_video(video_path, sample_type=sample_type)

	def image2video_and_save(frames, save_path):
	success = frames_to_video(frames, save_path)
	return save_path


	def frames_to_video(
	frames,
	output_path: str,
	fps: int = 24,
	) -> bool:
	try:
	frames = [frame[:, :, ::-1] for frame in frames]
	# Use provided frame size or get from first frame
	height, width = frames[0].shape[:2]

	# Initialize video writer
	fourcc = cv2.VideoWriter_fourcc(*'mp4v')
	out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

	# Process each frame
	for frame in frames:
	out.write(frame)

	# Release video writer
	out.release()
	print(f"Video saved successfully to {output_path}")
	return True

	except Exception as e:
	print(f"Error converting frames to video: {str(e)}")
	return False