Error for large dataset tokenization in create_dataset
Hi,
I tried to tokenize a dataset of 4M cells, but hit an int32 overflow in create_dataset when it calls Dataset.from_dict(dataset_dict). This call:
tk = TranscriptomeTokenizer(
    custom_attr_name_dict={
        "cell_type": "cell_type",
        "project_name": "project_name",
        "adata_order": "adata_order",
        "property_subject_id": "property_subject_id",
    },
    nproc=8,
)
# tokenizer expects to get paths to directories, not files
tk.tokenize_data(
    f"{HOME}/data/sc/preprocessed",
    f"{HOME}/data/foundation_models/geneformer/tokenized",
    "pbmcs_4m_cells",
    file_format="h5ad",
)
gave this error:
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
Cell In[31], line 10
1 tk = TranscriptomeTokenizer(
2 custom_attr_name_dict={
3 "cell_type": "cell_type",
(...)
7 nproc=8
8 )
9 #tokenizer expects to get paths to directories, not files
---> 10 tk.tokenize_data(f"{HOME}/data/sc/preprocessed",
11 f"{HOME}/data/foundation_models/geneformer/tokenized",
12 "pbmcs_4m_cells",
13 file_format="h5ad")
File ~/envs/geneformer/lib/python3.10/site-packages/geneformer/tokenizer.py:407, in TranscriptomeTokenizer.tokenize_data(self, data_directory, output_directory, output_prefix, file_format, use_generator)
387 """
388 Tokenize .loom files in data_directory and save as tokenized .dataset in output_directory.
389
(...)
402
403 """
404 tokenized_cells, cell_metadata = self.tokenize_files(
405 Path(data_directory), file_format
406 )
--> 407 tokenized_dataset = self.create_dataset(
408 tokenized_cells,
409 cell_metadata,
410 use_generator=use_generator,
411 )
413 output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
414 tokenized_dataset.save_to_disk(str(output_path))
File ~/envs/geneformer/lib/python3.10/site-packages/geneformer/tokenizer.py:631, in TranscriptomeTokenizer.create_dataset(self, tokenized_cells, cell_metadata, use_generator, keep_uncropped_input_ids)
629 output_dataset = Dataset.from_generator(dict_generator, num_proc=self.nproc)
630 else:
--> 631 output_dataset = Dataset.from_dict(dataset_dict)
633 def format_cell_features(example):
634 # Store original uncropped input_ids in separate feature
635 if keep_uncropped_input_ids:
File ~/envs/geneformer/lib/python3.10/site-packages/datasets/arrow_dataset.py:931, in Dataset.from_dict(cls, mapping, features, info, split)
929 arrow_typed_mapping[col] = data
930 mapping = arrow_typed_mapping
--> 931 pa_table = InMemoryTable.from_pydict(mapping=mapping)
932 if info is None:
933 info = DatasetInfo()
File ~/envs/geneformer/lib/python3.10/site-packages/datasets/table.py:757, in InMemoryTable.from_pydict(cls, *args, **kwargs)
741 @classmethod
742 def from_pydict(cls, *args, **kwargs):
743 """
744 Construct a Table from Arrow arrays or columns.
745
(...)
755 datasets.table.Table
756 """
--> 757 return cls(pa.Table.from_pydict(*args, **kwargs))
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/table.pxi:1920, in pyarrow.lib._Tabular.from_pydict()
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/table.pxi:6136, in pyarrow.lib._from_pydict()
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:398, in pyarrow.lib.asarray()
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:248, in pyarrow.lib.array()
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:112, in pyarrow.lib._handle_arrow_array_protocol()
File ~/envs/geneformer/lib/python3.10/site-packages/datasets/arrow_writer.py:185, in TypedSequence.__arrow_array__(self, type)
183 out = numpy_to_pyarrow_listarray(data)
184 elif isinstance(data, list) and data and isinstance(first_non_null_value(data)[1], np.ndarray):
--> 185 out = list_of_np_array_to_pyarrow_listarray(data)
186 else:
187 trying_cast_to_python_objects = True
File ~/envs/geneformer/lib/python3.10/site-packages/datasets/features/features.py:1533, in list_of_np_array_to_pyarrow_listarray(l_arr, type)
1531 """Build a PyArrow ListArray from a possibly nested list of NumPy arrays"""
1532 if len(l_arr) > 0:
-> 1533 return list_of_pa_arrays_to_pyarrow_listarray(
1534 [numpy_to_pyarrow_listarray(arr, type=type) if arr is not None else None for arr in l_arr]
1535 )
1536 else:
1537 return pa.array([], type=type)
File ~/envs/geneformer/lib/python3.10/site-packages/datasets/features/features.py:1525, in list_of_pa_arrays_to_pyarrow_listarray(l_arr)
1521 offsets = np.cumsum(
1522 [0] + [len(arr) for arr in l_arr], dtype=object
1523 ) # convert to dtype object to allow None insertion
1524 offsets = np.insert(offsets, null_indices, None)
-> 1525 offsets = pa.array(offsets, type=pa.int32())
1526 values = pa.concat_arrays(l_arr)
1527 return pa.ListArray.from_arrays(offsets, values)
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:358, in pyarrow.lib.array()
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:85, in pyarrow.lib._ndarray_to_array()
File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()
ArrowInvalid: Value 2147483656 too large to fit in C integer type
The root cause is that the list offsets are a cumulative sum over all cells, which yields a value too large for int32. I managed to work around it by splitting dataset_dict into chunks, processing each chunk, and then concatenating the results, instead of the single Dataset.from_dict call the code currently uses. Here's part of the code:
# requires: numpy as np, pyarrow as pa, trange from tqdm, and
# Dataset / concatenate_datasets from datasets
@staticmethod
def split_and_process_dataset(dataset_dict, chunk_size=100000):
    input_ids = dataset_dict["input_ids"]
    total_size = len(input_ids)
    # Process in chunks
    chunked_datasets = []
    for start_idx in trange(0, total_size, chunk_size, desc="Processing chunks"):
        end_idx = min(start_idx + chunk_size, total_size)
        # Extract the chunk of input_ids and other metadata
        chunk_input_ids = input_ids[start_idx:end_idx]
        chunk_metadata = {
            key: value[start_idx:end_idx]
            for key, value in dataset_dict.items()
            if key != "input_ids"
        }
        # Create ListArray for input_ids in this chunk
        arrays = [pa.array(arr.astype(np.int64)) for arr in chunk_input_ids]  # ensure int64 values
        offsets = np.cumsum(
            [0] + [len(arr) for arr in chunk_input_ids], dtype=np.int32
        )  # int32 offsets are safe within a single chunk of this size
        list_array = pa.ListArray.from_arrays(
            pa.array(offsets, type=pa.int32()), pa.concat_arrays(arrays)
        )
        # Create chunk dataset
        chunk_metadata["input_ids"] = list_array
        pa_table = pa.table(chunk_metadata)
        chunk_dataset = Dataset(pa_table)
        chunked_datasets.append(chunk_dataset)
    # Merge the chunked datasets into the final output
    output_dataset = concatenate_datasets(chunked_datasets)
    return output_dataset
def create_dataset(
    self,
    tokenized_cells,
    cell_metadata,
    use_generator=False,
    keep_uncropped_input_ids=False,
):
    print("Creating dataset.")
    # create dict for dataset creation
    dataset_dict = {"input_ids": tokenized_cells}
    if self.custom_attr_name_dict is not None:
        dataset_dict.update(cell_metadata)
    # create dataset
    if use_generator:

        def dict_generator():
            for i in range(len(tokenized_cells)):
                yield {k: dataset_dict[k][i] for k in dataset_dict.keys()}

        output_dataset = Dataset.from_generator(dict_generator, num_proc=self.nproc)
    else:
        # output_dataset = Dataset.from_dict(dataset_dict)
        output_dataset = self.split_and_process_dataset(dataset_dict, chunk_size=100000)
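For context, here is a minimal standalone sketch (with hypothetical cell and token counts, not part of the tokenizer code) of why the cumulative int32 offsets overflow while 64-bit offsets do not:

import numpy as np
import pyarrow as pa

# Hypothetical sizes: ~4M cells with ~2048 tokens each gives more list values
# than an int32 offset can index (2**31 - 1 = 2,147,483,647).
n_cells, tokens_per_cell = 4_000_000, 2048
total_tokens = n_cells * tokens_per_cell
print(total_tokens > np.iinfo(np.int32).max)  # True: past the int32 limit

# int32 offsets cannot hold the final cumulative length ...
try:
    pa.array(np.array([0, total_tokens], dtype=object), type=pa.int32())
except (pa.ArrowInvalid, OverflowError) as err:
    print(err)

# ... whereas 64-bit offsets (as used by pa.large_list / LargeListArray) can.
pa.array([0, total_tokens], type=pa.int64())

This is also why keeping each chunk well below ~2**31 total token values, as in split_and_process_dataset above, sidesteps the limit.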
I was wondering how this worked previously, since you have presumably tokenized larger datasets without hitting this error?
Would it be useful to open a PR for this?
Thanks for your question! We did not encounter this ourselves, but others have, so the code was updated to use the dict_generator to avoid this error. We recommend setting use_generator to True if you encounter this error. Your method may or may not be faster; we would recommend a timed comparison (x3) to determine this. If your method is faster, we can add a dict_chunk_size parameter for the dataset (the current chunk_size is for the anndata processing) and use your method when that chunk_size is not None.
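For reference, here is what that call looks like with the generator path enabled, mirroring the snippet at the top of the thread (use_generator is the parameter shown in the tokenize_data signature above; the import is the usual geneformer one):

from geneformer import TranscriptomeTokenizer

tk = TranscriptomeTokenizer(
    custom_attr_name_dict={
        "cell_type": "cell_type",
        "project_name": "project_name",
        "adata_order": "adata_order",
        "property_subject_id": "property_subject_id",
    },
    nproc=8,
)
# use_generator=True makes create_dataset build the dataset via
# Dataset.from_generator instead of Dataset.from_dict, avoiding the
# int32 offset overflow.
tk.tokenize_data(
    f"{HOME}/data/sc/preprocessed",
    f"{HOME}/data/foundation_models/geneformer/tokenized",
    "pbmcs_4m_cells",
    file_format="h5ad",
    use_generator=True,
)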
Linking prior discussions here: https://huggingface.co/ctheodoris/Geneformer/discussions/315