import copy
import json
import logging
import os
import random

import cv2
import numpy as np
import pycocotools.mask as maskUtils
import torch
import torchvision.transforms as T
from datasets import Dataset as HFDataset
from datasets import DatasetDict, load_from_disk
from mmengine import print_log
from PIL import Image
from torch.utils.data import Dataset
from torchvision.transforms.functional import InterpolationMode

from xtuner.dataset.huggingface import process_hf_dataset, build_origin_dataset
from xtuner.registry import BUILDER

from .encode_fn import video_lisa_encode_fn
SEG_QUESTIONS = [
    "Please segment the object according to the description: {class_name}",
]

SEG_QUESTIONS_SHORT = [
    "Can you segment the {class_name} in this image?",
    "Please segment {class_name} in this image.",
    "What is {class_name} in this image? Please respond with segmentation mask.",
    "What is {class_name} in this image? Please output segmentation mask.",
    "Can you segment the {class_name} in this image",
    "Please segment {class_name} in this image",
    "What is {class_name} in this image? Please respond with segmentation mask",
    "What is {class_name} in this image? Please output segmentation mask",
    "Could you provide a segmentation mask for the {class_name} in this image?",
    "Please identify and segment the {class_name} in this image.",
    "Where is the {class_name} in this picture? Please respond with a segmentation mask.",
    "Can you highlight the {class_name} in this image with a segmentation mask?",
    "Could you provide a segmentation mask for the {class_name} in this image",
    "Please identify and segment the {class_name} in this image",
    "Where is the {class_name} in this picture? Please respond with a segmentation mask",
    "Can you highlight the {class_name} in this image with a segmentation mask",
]

ANSWER_LIST = [
    "It is [SEG].",
    "Sure, [SEG].",
    "Sure, it is [SEG].",
    "Sure, the segmentation result is [SEG].",
    "[SEG].",
]
class VideoSAM2Dataset(Dataset):
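    """Referring video segmentation dataset over SAM-2 (SA-V style) videos.

    Each item samples frames from one video together with per-object
    referring expressions and RLE-decoded masklets, and formats them as a
    [SEG]-answer conversation for LISA-style training.
    """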
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)
    IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
    IMG_START_TOKEN = '<img>'
    IMG_END_TOKEN = '</img>'
    FAST_IMG_CONTEXT_TOKEN = '<FAST_IMG_CONTEXT>'
    FAST_IMG_START_TOKEN = '<fast_img>'
    FAST_IMG_END_TOKEN = '</fast_img>'
    def __init__(self,
                 sam2_folder,
                 expression_file,
                 extra_image_processor=None,
                 tokenizer=None,
                 select_number=5,
                 sampled_frames=5,
                 offline_processed_text_folder=None,
                 template_map_fn=None,
                 max_length=8196,
                 lazy=True,
                 repeats=1,
                 special_tokens=None,
                 use_fast=False,
                 n_fast_images=50,
                 fast_pool_size=4,
                 mode='long',
                 frame_contiguous_sample=False,
                 ):
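        """`mode` controls the referring-expression style: 'long' uses the
        full formatted caption, 'short' a randomly sampled short caption,
        and 'long_short' picks one of the two per item at random."""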
        assert mode in ['long', 'long_short', 'short']
        self.mode = mode
        self.cur_mode = mode

        assert lazy is True
        self.tokenizer = BUILDER.build(tokenizer)
        self.select_number = select_number
        self.sampled_frames = sampled_frames
        assert offline_processed_text_folder or (expression_file and tokenizer)
        self.lazy = lazy
        self.max_length = max_length

        self.template_map_fn = template_map_fn
        if isinstance(self.template_map_fn, dict) and self.lazy:
            _type = self.template_map_fn['type']
            del self.template_map_fn['type']
            self.template_map_fn = _type(**self.template_map_fn)

        if offline_processed_text_folder and expression_file:
            print_log(
                'Both `offline_processed_text_folder` and '
                '`expression_file` are set, and we load dataset from '
                '`offline_processed_text_folder` '
                f'({offline_processed_text_folder})',
                logger='current',
                level=logging.WARNING)

        if offline_processed_text_folder is not None:
            raise NotImplementedError
        else:
            video_ids, anno_dict = self.json_file_preprocess(expression_file)
            if self.lazy:
                self.video_ids = video_ids
                self.anno_dict = anno_dict
            else:
                raise NotImplementedError

        self.sam2_folder = sam2_folder
        if extra_image_processor is not None:
            self.extra_image_processor = BUILDER.build(extra_image_processor)
        else:
            # keep the attribute defined so `__getitem__` can test it safely
            self.extra_image_processor = None
        self.down_ratio = 1
        self.repeats = repeats

        self._system = ''

        self.downsample_ratio = 0.5
        self.image_size = 448
        patch_size = 14
        self.patch_token = int(
            (self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
        self.transformer = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((self.image_size, self.image_size),
                     interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
        ])

        if special_tokens is not None:
            self.tokenizer.add_tokens(special_tokens, special_tokens=True)

        self.use_fast = use_fast
        self.n_fast_images = n_fast_images
        self.fast_pool_size = fast_pool_size
        self.frame_contiguous_sample = frame_contiguous_sample

        # for visualization debug
        self.save_folder = './work_dirs/video_debug/'
        self.cur_number = 0

        print("Video res dataset (ref-sam2), includes {} items.".format(
            len(self.video_ids)))
    def __len__(self):
        return len(self.video_ids) * self.repeats

    def modality_length(self):
        # constant placeholder length per video item
        return [20000] * len(self.video_ids)

    def real_len(self):
        return len(self.video_ids)
    def json_file_preprocess(self, expression_file):
        # prepare expression annotation files: {video_id: annotation_dict}
        with open(expression_file, 'r') as f:
            expression_datas = json.load(f)
        video_ids = list(expression_datas.keys())
        return video_ids, expression_datas
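    # The expression file is assumed (from the access patterns below) to look
    # roughly like:
    #   {
    #     "<video_id>": {
    #       "video_path": "...", "anno_path": "...",
    #       "objects": {"<obj_id>": {"formated": "...", "short_caps": [...]}}
    #     }, ...
    #   }
    # and each per-video anno file to contain {"masklet": [<COCO RLE per frame>]}.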
    def dataset_map_fn(self, objects_expression_infos, n_frames, n_fast_frames=0):
        # prepare text
        if self.mode == 'long':
            expressions = [object_info['formated'] for object_info in objects_expression_infos]
            self.cur_mode = self.mode
        elif self.mode == 'short':
            expressions = [random.choice(object_info['short_caps'])
                           for object_info in objects_expression_infos]
            self.cur_mode = self.mode
        else:
            # 'long_short': pick one expression style per item at random
            if random.random() < 0.5:
                expressions = [object_info['formated'] for object_info in objects_expression_infos]
                self.cur_mode = 'long'
            else:
                expressions = [random.choice(object_info['short_caps'])
                               for object_info in objects_expression_infos]
                self.cur_mode = 'short'

        text_dict = self.prepare_text(n_frames, expressions,
                                      num_image_tokens=self.patch_token,
                                      n_fast_frames=n_fast_frames)
        ret = {'conversation': text_dict['conversation']}
        return ret
    def prepare_text(self, n_frames, expressions, num_image_tokens=256, n_fast_frames=0):
        if self.use_fast:
            fast_frame_token_str = f'{self.FAST_IMG_START_TOKEN}' \
                                   f'{self.FAST_IMG_CONTEXT_TOKEN * n_fast_frames * self.fast_pool_size * self.fast_pool_size}' \
                                   f'{self.FAST_IMG_END_TOKEN}' + '\n'
        else:
            fast_frame_token_str = ''

        frame_token_str = f'{self.IMG_START_TOKEN}' \
                          f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
                          f'{self.IMG_END_TOKEN}'

        questions = []
        answers = []
        for i, exp in enumerate(expressions):
            if self.cur_mode == 'short':
                question_template = random.choice(SEG_QUESTIONS_SHORT)
                # short captions start with an article ("A ..."); drop it so the
                # expression reads naturally inside the question template
                if exp.startswith("A "):
                    exp = exp[len("A "):]
            else:
                question_template = random.choice(SEG_QUESTIONS)
            questions.append(question_template.format(class_name=exp))
            answers.append(random.choice(ANSWER_LIST))

        qa_list = []
        for i, (question, answer) in enumerate(zip(questions, answers)):
            if i == 0:
                # prepend the fast-branch and per-frame image tokens to the
                # first question only
                frame_tokens = frame_token_str + '\n'
                frame_tokens = frame_tokens * n_frames
                frame_tokens = frame_tokens.strip()
                frame_tokens = fast_frame_token_str + frame_tokens
                qa_list.append(
                    {'from': 'human', 'value': frame_tokens + question}
                )
            else:
                qa_list.append(
                    {'from': 'human', 'value': question}
                )
            qa_list.append(
                {'from': 'gpt', 'value': answer}
            )

        input_text = ''
        conversation = []
        for msg in qa_list:
            if msg['from'] == 'human':
                input_text += msg['value']
            elif msg['from'] == 'gpt':
                conversation.append({'input': input_text, 'output': msg['value']})
                input_text = ''
            else:
                raise NotImplementedError

        # add system information
        conversation[0].update({'system': self._system})

        return {'conversation': conversation}
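    # For illustration: with two frames, one expression in 'long' mode, and
    # the fast branch disabled, `prepare_text` yields roughly (tokens
    # abbreviated):
    #   {'conversation': [{
    #       'system': '',
    #       'input': '<img><IMG_CONTEXT>...</img>\n<img>...</img>'
    #                'Please segment the object according to the description: ...',
    #       'output': 'It is [SEG].'}]}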
    def __getitem__(self, index):
        index = index % self.real_len()
        video_id = self.video_ids[index]
        expression_dict = self.anno_dict[video_id]
        object_ids = list(expression_dict['objects'].keys())

        video_path = os.path.join(self.sam2_folder, expression_dict['video_path'])
        anno_path = os.path.join(self.sam2_folder, expression_dict['anno_path'])

        video_frames = get_video_frames(video_path)

        if self.use_fast:
            # sample the fast branch uniformly over the full-rate video
            fast_interval = len(video_frames) / (self.n_fast_images + 1e-4)
            sampled_fast_frame_idxs = [
                min(int(i * fast_interval), len(video_frames) - 1)
                for i in range(self.n_fast_images)]
            fast_video_frames = [video_frames[_idx] for _idx in sampled_fast_frame_idxs]
        else:
            fast_video_frames = None

        # mask annotations cover every 4th frame; subsample the video to align
        video_frames = video_frames[::4]

        # mask annotation
        with open(anno_path, 'r') as f:
            mask_data = json.load(f)
        masklents = decode_masklet(mask_data['masklet'])

        n_frames = len(masklents)
        n_objects = len(object_ids)

        # sample objects, without replacement when there are enough of them
        if n_objects > self.select_number:
            selected_indexes = np.random.choice(n_objects, self.select_number, replace=False)
        else:
            selected_indexes = np.random.choice(n_objects, self.select_number, replace=True)

        selected_object_ids = [object_ids[_idx] for _idx in selected_indexes]
        objects_expression_infos = [expression_dict['objects'][_idx] for _idx in selected_object_ids]

        # keep only the mask channels of the selected objects
        _masklents = []
        for _mask in masklents:
            _mask_selected = []
            for _idx in selected_object_ids:
                _mask_selected.append(_mask[:, :, int(_idx)])
            _mask_selected = np.stack(_mask_selected, axis=2)
            _masklents.append(_mask_selected)
        masklents = _masklents

        # sample video frames: randomly select k frames
        if n_frames > self.sampled_frames + 1:
            if self.frame_contiguous_sample and random.random() < 0.5:
                # contiguous sampling
                selected_start_frame = np.random.choice(n_frames - self.sampled_frames, 1, replace=False)
                selected_frame_indexes = [selected_start_frame[0] + _i for _i in range(self.sampled_frames)]
            else:
                selected_frame_indexes = np.random.choice(n_frames, self.sampled_frames, replace=False)
        else:
            selected_frame_indexes = np.random.choice(n_frames, self.sampled_frames, replace=True)
        selected_frame_indexes.sort()
        video_frames = [video_frames[_idx] for _idx in selected_frame_indexes]
        masklents = [masklents[_idx] for _idx in selected_frame_indexes]

        data_dict = self.dataset_map_fn(objects_expression_infos, len(video_frames),
                                        n_fast_frames=self.n_fast_images)
        result = self.template_map_fn(data_dict)
        data_dict.update(result)
        result = video_lisa_encode_fn(data_dict, tokenizer=self.tokenizer,
                                      max_length=self.max_length, with_image_token=True)
        data_dict.update(result)

        pixel_values = []
        extra_pixel_values = []
        for frame in video_frames:
            frame = frame[:, :, ::-1]  # BGR (OpenCV) -> RGB
            frame_image = Image.fromarray(frame).convert('RGB')
            ori_width, ori_height = frame_image.size
            if self.extra_image_processor is not None:
                g_image = np.array(frame_image)  # for grounding
                g_image = self.extra_image_processor.apply_image(g_image)
                g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
                extra_pixel_values.append(g_pixel_values)

            frame_image = self.transformer(frame_image)
            pixel_values.append(frame_image)

        pixel_values = torch.stack(pixel_values, dim=0)  # (n_f, 3, h, w)
        data_dict['pixel_values'] = pixel_values
        if self.extra_image_processor is not None:
            data_dict['g_pixel_values'] = extra_pixel_values

        # fast branch
        if self.use_fast:
            fast_pixel_values = []
            for frame_image in fast_video_frames:
                frame = frame_image[:, :, ::-1]  # BGR -> RGB
                frame_image = Image.fromarray(frame).convert('RGB')
                ori_width, ori_height = frame_image.size

                frame_image = self.transformer(frame_image)
                fast_pixel_values.append(frame_image)

            fast_pixel_values = torch.stack(fast_pixel_values, dim=0)  # (n_f, 3, h, w)
            data_dict['fast_pixel_values'] = fast_pixel_values

        # process and get masks: (n_frames, h, w, n_obj) -> (n_obj * n_frames, h, w)
        masklents = np.stack(masklents, axis=0)
        masklents = torch.from_numpy(masklents).permute(3, 0, 1, 2)
        masklents = masklents.flatten(0, 1)
        data_dict['masks'] = masklents
        data_dict['type'] = 'video'
        return data_dict
    def visualization_debug(self, data_dict):
        save_folder = os.path.join(self.save_folder, 'sample_{}'.format(self.cur_number))
        os.makedirs(save_folder, exist_ok=True)
        self.cur_number += 1

        # images
        show_images = []
        pixel_values = data_dict['pixel_values']
        save_folder_image = os.path.join(save_folder, 'image')
        os.makedirs(save_folder_image, exist_ok=True)
        for i_image, image_pixel_value in enumerate(pixel_values):
            # undo the ImageNet normalization applied in self.transformer
            for c in range(3):
                image_pixel_value[c] = image_pixel_value[c] * self.IMAGENET_STD[c] \
                                       + self.IMAGENET_MEAN[c]
            image_pixel_value = image_pixel_value * 255
            image_pixel_value = image_pixel_value.permute(1, 2, 0)
            image_pixel_value = image_pixel_value.to(torch.uint8).numpy()
            show_images.append(image_pixel_value)
            # pixel_values are RGB; convert to BGR for cv2.imwrite
            cv2.imwrite(os.path.join(save_folder_image, '{}.jpg'.format(i_image)),
                        image_pixel_value[:, :, ::-1])

        # text
        input_text = self.tokenizer.decode(data_dict['input_ids'], skip_special_tokens=False)
        with open(os.path.join(save_folder, 'text.json'), 'w') as f:
            json.dump([input_text], f)

        # masks
        save_folder_mask = os.path.join(save_folder, 'mask')
        os.makedirs(save_folder_mask, exist_ok=True)
        n_frames = len(pixel_values)
        masks = data_dict['masks']
        _, h, w = masks.shape
        masks = masks.reshape(-1, n_frames, h, w)
        for i_obj, obj_masks in enumerate(masks):
            save_folder_mask_obj_folder = os.path.join(save_folder_mask, 'obj_{}'.format(i_obj))
            os.makedirs(save_folder_mask_obj_folder, exist_ok=True)
            for i_frame, f_mask in enumerate(obj_masks):
                f_mask = f_mask.numpy()
                f_mask = f_mask * 255
                # tint the mask into a single channel and alpha-blend
                f_mask = np.stack([f_mask, f_mask * 0, f_mask * 0], axis=2)
                f_mask = show_images[i_frame] * 0.3 + 0.7 * f_mask
                f_mask = f_mask.astype(np.uint8)
                cv2.imwrite(os.path.join(save_folder_mask_obj_folder, '{}.png'.format(i_frame)),
                            f_mask[:, :, ::-1])
        return
def get_video_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Cannot open video file:", video_path)
        return []

    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames
def images_to_video(frames, video_name, fps=6):
    height, width, layers = frames[0].shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(video_name, fourcc, fps, (width, height))
    for frame in frames:
        video.write(frame)
    video.release()
    return
def decode_masklet(masklet):
    # decode per-frame COCO RLEs into (h, w, n_obj) uint8 arrays
    masks = []
    for _rle in masklet:
        mask = maskUtils.decode(_rle)
        masks.append(mask)
    return masks
def draw_mask(image, mask):
    # tint the mask region into one channel and alpha-blend onto the image
    obj_mask = mask * 255
    obj_mask = np.stack([obj_mask, obj_mask * 0, obj_mask * 0], axis=2)
    obj_mask = obj_mask * 0.5 + copy.deepcopy(image) * 0.5
    obj_mask = obj_mask.astype(np.uint8)
    return obj_mask

def add_mask2images(frames, masklets):
    # returns one overlay video (list of frames) per object
    show_videos = []
    for i_frame, (frame, masks) in enumerate(zip(frames, masklets)):
        n_obj = masks.shape[-1]
        if i_frame == 0:
            show_videos = [[] for _ in range(n_obj)]
        for i_obj in range(n_obj):
            show_videos[i_obj].append(draw_mask(copy.deepcopy(frame), masks[:, :, i_obj]))
    return show_videos
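
# A minimal, hypothetical debug sketch (not part of the training pipeline):
# it assumes a local SA-V style video plus its masklet annotation file and
# writes one mask-overlay video per object using the helpers above. The
# paths below are placeholders.
if __name__ == '__main__':
    _video_path = 'data/sam2/example.mp4'       # hypothetical path
    _anno_path = 'data/sam2/example_anno.json'  # hypothetical path

    _frames = get_video_frames(_video_path)[::4]  # align with mask frame rate
    with open(_anno_path, 'r') as _f:
        _masklets = decode_masklet(json.load(_f)['masklet'])

    for _i_obj, _obj_frames in enumerate(add_mask2images(_frames, _masklets)):
        images_to_video(_obj_frames, 'obj_{}.mp4'.format(_i_obj))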