---
license: apache-2.0
language:
- en
base_model:
- Qwen/Qwen2-VL-7B-Instruct
---

# Usage

First follow the instructions in [our repo](https://github.com/VectorSpaceLab/Video-XL) to install the required packages.

```python
from videoxl.model.builder import load_pretrained_model
from videoxl.mm_utils import tokenizer_image_token, process_images, transform_input_id
from videoxl.constants import IMAGE_TOKEN_INDEX, TOKEN_PERFRAME
from PIL import Image
from decord import VideoReader, cpu
import torch
import numpy as np

# fix seed for reproducible sampling and decoding
torch.manual_seed(0)

model_path = "assets/VideoXL_weight_8"
video_path = "assets/ad2_watch_15min.mp4"

max_frames_num = 900
gen_kwargs = {"do_sample": True, "temperature": 1, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")

model.config.beacon_ratio = [8]  # delete this line to use a random compression ratio chosen from {2, 4, 8}

# video input; the <image> placeholder marks where the visual tokens are inserted
prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nDoes this video contain any inserted advertisement? If yes, what is the content of the ad?<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)

# uniformly sample max_frames_num frames from the video
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
frame_idx = uniform_sampled_frames.tolist()
frames = vr.get_batch(frame_idx).asnumpy()
video_tensor = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16)

# locate the image placeholder so beacon compression skips over the visual tokens
beacon_skip_first = (input_ids == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[1].item()
num_tokens = TOKEN_PERFRAME * max_frames_num
beacon_skip_last = beacon_skip_first + num_tokens

with torch.inference_mode():
    output_ids = model.generate(input_ids, images=[video_tensor], modalities=["video"], beacon_skip_first=beacon_skip_first, beacon_skip_last=beacon_skip_last, **gen_kwargs)

# strip the prompt (with the placeholder expanded to visual tokens) from the generated sequence
if IMAGE_TOKEN_INDEX in input_ids:
    transform_input_ids = transform_input_id(input_ids, num_tokens, model.config.vocab_size - 1)
    output_ids = output_ids[:, transform_input_ids.shape[1]:]

outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)
```

## License

This project uses certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of those licenses, including but not limited to the OpenAI Terms of Use for the dataset and the specific licenses of the base language models (e.g., the Qwen2 license). This project does not impose any additional constraints beyond those stipulated in the original licenses. Furthermore, users are reminded to ensure that their use of the datasets and checkpoints complies with all applicable laws and regulations.
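
## Note on frame sampling

The usage snippet above samples a fixed number of frames uniformly across the whole video. If you prefer to sample at roughly one frame per second instead, the sketch below (an illustration, not part of the original card) shows how the sampling lines could be adapted; it only relies on the `decord` and `numpy` calls already used above, and it recomputes `num_tokens` from the number of frames actually sampled rather than from `max_frames_num`.

```python
# Minimal sketch (assumption): sample ~1 frame per second, capped at max_frames_num.
# Reuses vr, total_frame_num, max_frames_num, and TOKEN_PERFRAME from the snippet above.
fps = vr.get_avg_fps()                 # average frame rate of the source video
step = max(int(round(fps)), 1)         # index stride for ~1 fps sampling
frame_idx = list(range(0, total_frame_num, step))[:max_frames_num]
frames = vr.get_batch(frame_idx).asnumpy()

# The visual-token count must match the frames actually sampled:
num_tokens = TOKEN_PERFRAME * len(frame_idx)
```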