Vision-CAIR's picture
Upload folder using huggingface_hub
d119b40 verified
raw
history blame
20 kB
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import io
import json
import logging
import os
import pickle
import re
import shutil
import urllib
import urllib.error
import urllib.request
from typing import Optional
from urllib.parse import urlparse
import numpy as np
import pandas as pd
import yaml
from iopath.common.download import download
from iopath.common.file_io import file_lock, g_pathmgr
from .registry import registry
from torch.utils.model_zoo import tqdm
from torchvision.datasets.utils import (
check_integrity,
download_file_from_google_drive,
extract_archive,
)
def now():
from datetime import datetime
return datetime.now().strftime("%Y%m%d%H%M")
def is_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https")
def get_cache_path(rel_path):
return os.path.expanduser(os.path.join(registry.get_path("cache_root"), rel_path))
def get_abs_path(rel_path):
return os.path.join(registry.get_path("library_root"), rel_path)
def load_json(filename):
with open(filename, "r") as f:
return json.load(f)
# The following are adapted from torchvision and vissl
# torchvision: https://github.com/pytorch/vision
# vissl: https://github.com/facebookresearch/vissl/blob/main/vissl/utils/download.py
def makedir(dir_path):
"""
Create the directory if it does not exist.
"""
is_success = False
try:
if not g_pathmgr.exists(dir_path):
g_pathmgr.mkdirs(dir_path)
is_success = True
except BaseException:
print(f"Error creating directory: {dir_path}")
return is_success
def get_redirected_url(url: str):
"""
Given a URL, returns the URL it redirects to or the
original URL in case of no indirection
"""
import requests
with requests.Session() as session:
with session.get(url, stream=True, allow_redirects=True) as response:
if response.history:
return response.url
else:
return url
def to_google_drive_download_url(view_url: str) -> str:
"""
Utility function to transform a view URL of google drive
to a download URL for google drive
Example input:
https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp/view
Example output:
https://drive.google.com/uc?export=download&id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp
"""
splits = view_url.split("/")
assert splits[-1] == "view"
file_id = splits[-2]
return f"https://drive.google.com/uc?export=download&id={file_id}"
def download_google_drive_url(url: str, output_path: str, output_file_name: str):
"""
Download a file from google drive
Downloading an URL from google drive requires confirmation when
the file of the size is too big (google drive notifies that
anti-viral checks cannot be performed on such files)
"""
import requests
with requests.Session() as session:
# First get the confirmation token and append it to the URL
with session.get(url, stream=True, allow_redirects=True) as response:
for k, v in response.cookies.items():
if k.startswith("download_warning"):
url = url + "&confirm=" + v
# Then download the content of the file
with session.get(url, stream=True, verify=True) as response:
makedir(output_path)
path = os.path.join(output_path, output_file_name)
total_size = int(response.headers.get("Content-length", 0))
with open(path, "wb") as file:
from tqdm import tqdm
with tqdm(total=total_size) as progress_bar:
for block in response.iter_content(
chunk_size=io.DEFAULT_BUFFER_SIZE
):
file.write(block)
progress_bar.update(len(block))
def _get_google_drive_file_id(url: str) -> Optional[str]:
parts = urlparse(url)
if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None:
return None
match = re.match(r"/file/d/(?P<id>[^/]*)", parts.path)
if match is None:
return None
return match.group("id")
def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
with open(filename, "wb") as fh:
with urllib.request.urlopen(
urllib.request.Request(url, headers={"User-Agent": "vissl"})
) as response:
with tqdm(total=response.length) as pbar:
for chunk in iter(lambda: response.read(chunk_size), ""):
if not chunk:
break
pbar.update(chunk_size)
fh.write(chunk)
def download_url(
url: str,
root: str,
filename: Optional[str] = None,
md5: Optional[str] = None,
) -> None:
"""Download a file from a url and place it in root.
Args:
url (str): URL to download file from
root (str): Directory to place downloaded file in
filename (str, optional): Name to save the file under.
If None, use the basename of the URL.
md5 (str, optional): MD5 checksum of the download. If None, do not check
"""
root = os.path.expanduser(root)
if not filename:
filename = os.path.basename(url)
fpath = os.path.join(root, filename)
makedir(root)
# check if file is already present locally
if check_integrity(fpath, md5):
print("Using downloaded and verified file: " + fpath)
return
# expand redirect chain if needed
url = get_redirected_url(url)
# check if file is located on Google Drive
file_id = _get_google_drive_file_id(url)
if file_id is not None:
return download_file_from_google_drive(file_id, root, filename, md5)
# download the file
try:
print("Downloading " + url + " to " + fpath)
_urlretrieve(url, fpath)
except (urllib.error.URLError, IOError) as e: # type: ignore[attr-defined]
if url[:5] == "https":
url = url.replace("https:", "http:")
print(
"Failed download. Trying https -> http instead."
" Downloading " + url + " to " + fpath
)
_urlretrieve(url, fpath)
else:
raise e
# check integrity of downloaded file
if not check_integrity(fpath, md5):
raise RuntimeError("File not found or corrupted.")
def download_and_extract_archive(
url: str,
download_root: str,
extract_root: Optional[str] = None,
filename: Optional[str] = None,
md5: Optional[str] = None,
remove_finished: bool = False,
) -> None:
download_root = os.path.expanduser(download_root)
if extract_root is None:
extract_root = download_root
if not filename:
filename = os.path.basename(url)
download_url(url, download_root, filename, md5)
archive = os.path.join(download_root, filename)
print("Extracting {} to {}".format(archive, extract_root))
extract_archive(archive, extract_root, remove_finished)
def cache_url(url: str, cache_dir: str) -> str:
"""
This implementation downloads the remote resource and caches it locally.
The resource will only be downloaded if not previously requested.
"""
parsed_url = urlparse(url)
dirname = os.path.join(cache_dir, os.path.dirname(parsed_url.path.lstrip("/")))
makedir(dirname)
filename = url.split("/")[-1]
cached = os.path.join(dirname, filename)
with file_lock(cached):
if not os.path.isfile(cached):
logging.info(f"Downloading {url} to {cached} ...")
cached = download(url, dirname, filename=filename)
logging.info(f"URL {url} cached in {cached}")
return cached
# TODO (prigoyal): convert this into RAII-style API
def create_file_symlink(file1, file2):
"""
Simply create the symlinks for a given file1 to file2.
Useful during model checkpointing to symlinks to the
latest successful checkpoint.
"""
try:
if g_pathmgr.exists(file2):
g_pathmgr.rm(file2)
g_pathmgr.symlink(file1, file2)
except Exception as e:
logging.info(f"Could NOT create symlink. Error: {e}")
def save_file(data, filename, append_to_json=True, verbose=True):
"""
Common i/o utility to handle saving data to various file formats.
Supported:
.pkl, .pickle, .npy, .json
Specifically for .json, users have the option to either append (default)
or rewrite by passing in Boolean value to append_to_json.
"""
if verbose:
logging.info(f"Saving data to file: {filename}")
file_ext = os.path.splitext(filename)[1]
if file_ext in [".pkl", ".pickle"]:
with g_pathmgr.open(filename, "wb") as fopen:
pickle.dump(data, fopen, pickle.HIGHEST_PROTOCOL)
elif file_ext == ".npy":
with g_pathmgr.open(filename, "wb") as fopen:
np.save(fopen, data)
elif file_ext == ".json":
if append_to_json:
with g_pathmgr.open(filename, "a") as fopen:
fopen.write(json.dumps(data, sort_keys=True) + "\n")
fopen.flush()
else:
with g_pathmgr.open(filename, "w") as fopen:
fopen.write(json.dumps(data, sort_keys=True) + "\n")
fopen.flush()
elif file_ext == ".yaml":
with g_pathmgr.open(filename, "w") as fopen:
dump = yaml.dump(data)
fopen.write(dump)
fopen.flush()
else:
raise Exception(f"Saving {file_ext} is not supported yet")
if verbose:
logging.info(f"Saved data to file: {filename}")
def load_file(filename, mmap_mode=None, verbose=True, allow_pickle=False):
"""
Common i/o utility to handle loading data from various file formats.
Supported:
.pkl, .pickle, .npy, .json
For the npy files, we support reading the files in mmap_mode.
If the mmap_mode of reading is not successful, we load data without the
mmap_mode.
"""
if verbose:
logging.info(f"Loading data from file: {filename}")
file_ext = os.path.splitext(filename)[1]
if file_ext == ".txt":
with g_pathmgr.open(filename, "r") as fopen:
data = fopen.readlines()
elif file_ext in [".pkl", ".pickle"]:
with g_pathmgr.open(filename, "rb") as fopen:
data = pickle.load(fopen, encoding="latin1")
elif file_ext == ".npy":
if mmap_mode:
try:
with g_pathmgr.open(filename, "rb") as fopen:
data = np.load(
fopen,
allow_pickle=allow_pickle,
encoding="latin1",
mmap_mode=mmap_mode,
)
except ValueError as e:
logging.info(
f"Could not mmap {filename}: {e}. Trying without g_pathmgr"
)
data = np.load(
filename,
allow_pickle=allow_pickle,
encoding="latin1",
mmap_mode=mmap_mode,
)
logging.info("Successfully loaded without g_pathmgr")
except Exception:
logging.info("Could not mmap without g_pathmgr. Trying without mmap")
with g_pathmgr.open(filename, "rb") as fopen:
data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
else:
with g_pathmgr.open(filename, "rb") as fopen:
data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
elif file_ext == ".json":
with g_pathmgr.open(filename, "r") as fopen:
data = json.load(fopen)
elif file_ext == ".yaml":
with g_pathmgr.open(filename, "r") as fopen:
data = yaml.load(fopen, Loader=yaml.FullLoader)
elif file_ext == ".csv":
with g_pathmgr.open(filename, "r") as fopen:
data = pd.read_csv(fopen)
else:
raise Exception(f"Reading from {file_ext} is not supported yet")
return data
def abspath(resource_path: str):
"""
Make a path absolute, but take into account prefixes like
"http://" or "manifold://"
"""
regex = re.compile(r"^\w+://")
if regex.match(resource_path) is None:
return os.path.abspath(resource_path)
else:
return resource_path
def makedir(dir_path):
"""
Create the directory if it does not exist.
"""
is_success = False
try:
if not g_pathmgr.exists(dir_path):
g_pathmgr.mkdirs(dir_path)
is_success = True
except BaseException:
logging.info(f"Error creating directory: {dir_path}")
return is_success
def is_url(input_url):
"""
Check if an input string is a url. look for http(s):// and ignoring the case
"""
is_url = re.match(r"^(?:http)s?://", input_url, re.IGNORECASE) is not None
return is_url
def cleanup_dir(dir):
"""
Utility for deleting a directory. Useful for cleaning the storage space
that contains various training artifacts like checkpoints, data etc.
"""
if os.path.exists(dir):
logging.info(f"Deleting directory: {dir}")
shutil.rmtree(dir)
logging.info(f"Deleted contents of directory: {dir}")
def get_file_size(filename):
"""
Given a file, get the size of file in MB
"""
size_in_mb = os.path.getsize(filename) / float(1024**2)
return size_in_mb
from typing import Dict, List, Protocol, Tuple
import torch
from torch.func import functional_call
from vllm.multimodal import BatchedTensors
from vllm.utils import is_pin_memory_available
def merge_vision_embeddings(input_ids: torch.Tensor,
inputs_embeds: torch.Tensor,
vision_embeddings: BatchedTensors,
image_token_id: int) -> torch.Tensor:
"""
Merge `vision_embeddings` into `inputs_embeds` by overwriting the positions
in `inputs_embeds` corresponding to placeholder image tokens in `input_ids`.
Note:
This updates `inputs_embeds` in place.
"""
mask = (input_ids == image_token_id)
num_expected_tokens = mask.sum()
if isinstance(vision_embeddings, torch.Tensor):
batch_size, batch_tokens, *_, embed_dim = vision_embeddings.shape
total_tokens = batch_size * batch_tokens
if num_expected_tokens != total_tokens:
expr = f"{batch_size} x {batch_tokens}"
raise ValueError(
f"Attempted to assign {expr} = {total_tokens} "
f"image tokens to {num_expected_tokens} placeholders")
inputs_embeds[mask] = vision_embeddings.view(total_tokens, embed_dim)
else:
size_per_batch = [t.shape[0] for t in vision_embeddings]
total_tokens = sum(size_per_batch)
if num_expected_tokens != total_tokens:
expr = ' + '.join(map(str, size_per_batch))
raise ValueError(
f"Attempted to assign {expr} = {total_tokens} "
f"image tokens to {num_expected_tokens} placeholders")
inputs_embeds[mask] = torch.cat(vision_embeddings)
return inputs_embeds
class LayerFn(Protocol):
def __call__(
self,
prefix="",
) -> torch.nn.Module:
...
class PPMissingLayer(torch.nn.Identity):
"""
A placeholder layer for missing layers in a pipeline parallel model.
"""
def __init__(self, *args, **kwargs):
super().__init__()
_CPU_OFFLOAD_BYTES = 0
_CPU_OFFLOAD_MAX_BYTES = 0
def set_cpu_offload_max_bytes(max_bytes: int) -> None:
global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
_CPU_OFFLOAD_BYTES = 0
_CPU_OFFLOAD_MAX_BYTES = max_bytes
def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
device = next(module.parameters()).device
if device == torch.device("cpu"):
return module
global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
return module
pin_memory = is_pin_memory_available()
# offload parameters to CPU
# use pin_memory if possible, which helps cudagraph capture speed
for p in module.parameters():
if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
# we use per-parameter offloading
# one module might have some parameters offloaded and some not
break
# `torch.empty_like` does not support `pin_memory` argument
cpu_data = torch.empty(size=p.data.size(),
dtype=p.data.dtype,
layout=p.data.layout,
device='cpu',
pin_memory=pin_memory)
cpu_data.copy_(p.data)
p.data = cpu_data
_CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
state_dict: Dict[str, torch.Tensor] = module.state_dict()
original_forward = module.forward
def forward(*args, **kwargs):
module.forward = original_forward
device_state = {
# here we blindly call `to(device)`
# if the parameter is already on the device, it will be a no-op
k: v.to(device, non_blocking=True)
for k, v in state_dict.items()
}
output = functional_call(module,
device_state,
args=args,
kwargs=kwargs)
module.forward = forward
return output
module.forward = forward
return module
def make_layers(
num_hidden_layers: int,
layer_fn: LayerFn,
prefix: str,
) -> Tuple[int, int, torch.nn.ModuleList]:
"""Make a list of layers with the given layer function, taking
pipeline parallelism into account.
"""
from vllm.distributed.parallel_state import get_pp_group
from vllm.distributed.utils import get_pp_indices
start_layer, end_layer = get_pp_indices(num_hidden_layers,
get_pp_group().rank_in_group,
get_pp_group().world_size)
modules = torch.nn.ModuleList(
[PPMissingLayer() for _ in range(start_layer)] + [
maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
for idx in range(start_layer, end_layer)
] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
return start_layer, end_layer, modules
# NOTE: don't use lru_cache here because it can prevent garbage collection
_model_to_pp_missing_layer_names: Dict[int, List[str]] = {}
def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]:
"""Get the names of the missing layers in a pipeline parallel model."""
model_id = id(model)
if model_id in _model_to_pp_missing_layer_names:
return _model_to_pp_missing_layer_names[model_id]
missing_layer_names = []
for name, module in model.named_modules():
if isinstance(module, PPMissingLayer):
# NOTE: the trailing dot is used to match the prefix of the layer.
# without the dot, we could match a layer that is not missing,
# e.g., 'encoder.layer.1' would match 'encoder.layer.11'
missing_layer_names.append(name + '.')
_model_to_pp_missing_layer_names[model_id] = missing_layer_names
return missing_layer_names
def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool:
"""Check if a parameter is missing in a pipeline parallel model."""
for missing_layer_name in get_pp_missing_layer_names(model):
if name.startswith(missing_layer_name):
return True
return False