Error for large dataset tokenization in create_dataset

#437
by elinor-med - opened

Hi,

I tried to tokenize a dataset of 4M cells, but encountered an error in create_dataset caused by an int32 overflow when calling Dataset.from_dict(dataset_dict):

tk = TranscriptomeTokenizer(
    custom_attr_name_dict={
        "cell_type": "cell_type",
        "project_name": "project_name",
        "adata_order": "adata_order",
        "property_subject_id": "property_subject_id"},
    nproc=8
)
#tokenizer expects to get paths to directories, not files
tk.tokenize_data(f"{HOME}/data/sc/preprocessed", 
                 f"{HOME}/data/foundation_models/geneformer/tokenized", 
                 "pbmcs_4m_cells", 
                 file_format="h5ad")

This call gave the following error:

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
Cell In[31], line 10
      1 tk = TranscriptomeTokenizer(
      2     custom_attr_name_dict={
      3         "cell_type": "cell_type",
   (...)
      7     nproc=8
      8 )
      9 #tokenizer expects to get paths to directories, not files
---> 10 tk.tokenize_data(f"{HOME}/data/sc/preprocessed", 
     11                  f"{HOME}/data/foundation_models/geneformer/tokenized", 
     12                  "pbmcs_4m_cells", 
     13                  file_format="h5ad")

File ~/envs/geneformer/lib/python3.10/site-packages/geneformer/tokenizer.py:407, in TranscriptomeTokenizer.tokenize_data(self, data_directory, output_directory, output_prefix, file_format, use_generator)
    387 """
    388 Tokenize .loom files in data_directory and save as tokenized .dataset in output_directory.
    389 
   (...)
    402 
    403 """
    404 tokenized_cells, cell_metadata = self.tokenize_files(
    405     Path(data_directory), file_format
    406 )
--> 407 tokenized_dataset = self.create_dataset(
    408     tokenized_cells,
    409     cell_metadata,
    410     use_generator=use_generator,
    411 )
    413 output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
    414 tokenized_dataset.save_to_disk(str(output_path))

File ~/envs/geneformer/lib/python3.10/site-packages/geneformer/tokenizer.py:631, in TranscriptomeTokenizer.create_dataset(self, tokenized_cells, cell_metadata, use_generator, keep_uncropped_input_ids)
    629     output_dataset = Dataset.from_generator(dict_generator, num_proc=self.nproc)
    630 else:
--> 631     output_dataset = Dataset.from_dict(dataset_dict)
    633 def format_cell_features(example):
    634     # Store original uncropped input_ids in separate feature
    635     if keep_uncropped_input_ids:

File ~/envs/geneformer/lib/python3.10/site-packages/datasets/arrow_dataset.py:931, in Dataset.from_dict(cls, mapping, features, info, split)
    929     arrow_typed_mapping[col] = data
    930 mapping = arrow_typed_mapping
--> 931 pa_table = InMemoryTable.from_pydict(mapping=mapping)
    932 if info is None:
    933     info = DatasetInfo()

File ~/envs/geneformer/lib/python3.10/site-packages/datasets/table.py:757, in InMemoryTable.from_pydict(cls, *args, **kwargs)
    741 @classmethod
    742 def from_pydict(cls, *args, **kwargs):
    743     """
    744     Construct a Table from Arrow arrays or columns.
    745 
   (...)
    755         datasets.table.Table
    756     """
--> 757     return cls(pa.Table.from_pydict(*args, **kwargs))

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/table.pxi:1920, in pyarrow.lib._Tabular.from_pydict()

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/table.pxi:6136, in pyarrow.lib._from_pydict()

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:398, in pyarrow.lib.asarray()

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:248, in pyarrow.lib.array()

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:112, in pyarrow.lib._handle_arrow_array_protocol()

File ~/envs/geneformer/lib/python3.10/site-packages/datasets/arrow_writer.py:185, in TypedSequence.__arrow_array__(self, type)
    183     out = numpy_to_pyarrow_listarray(data)
    184 elif isinstance(data, list) and data and isinstance(first_non_null_value(data)[1], np.ndarray):
--> 185     out = list_of_np_array_to_pyarrow_listarray(data)
    186 else:
    187     trying_cast_to_python_objects = True

File ~/envs/geneformer/lib/python3.10/site-packages/datasets/features/features.py:1533, in list_of_np_array_to_pyarrow_listarray(l_arr, type)
   1531 """Build a PyArrow ListArray from a possibly nested list of NumPy arrays"""
   1532 if len(l_arr) > 0:
-> 1533     return list_of_pa_arrays_to_pyarrow_listarray(
   1534         [numpy_to_pyarrow_listarray(arr, type=type) if arr is not None else None for arr in l_arr]
   1535     )
   1536 else:
   1537     return pa.array([], type=type)

File ~/envs/geneformer/lib/python3.10/site-packages/datasets/features/features.py:1525, in list_of_pa_arrays_to_pyarrow_listarray(l_arr)
   1521 offsets = np.cumsum(
   1522     [0] + [len(arr) for arr in l_arr], dtype=object
   1523 )  # convert to dtype object to allow None insertion
   1524 offsets = np.insert(offsets, null_indices, None)
-> 1525 offsets = pa.array(offsets, type=pa.int32())
   1526 values = pa.concat_arrays(l_arr)
   1527 return pa.ListArray.from_arrays(offsets, values)

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:358, in pyarrow.lib.array()

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/array.pxi:85, in pyarrow.lib._ndarray_to_array()

File ~/envs/geneformer/lib/python3.10/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()

ArrowInvalid: Value 2147483656 too large to fit in C integer type

It happens because the offsets are a cumulative sum that ends up too large for int32. I managed to solve it by altering the code to split dataset_dict into chunks, process each chunk, and then concatenate the resulting datasets instead of building the dataset with a single Dataset.from_dict call as it currently does. Here's part of the code (a short sketch isolating the overflow itself follows after the code):

    @staticmethod
    def split_and_process_dataset(dataset_dict, chunk_size=100000):
        # Requires: numpy as np, pyarrow as pa, trange from tqdm,
        # and Dataset / concatenate_datasets from datasets
        input_ids = dataset_dict["input_ids"]
        total_size = len(input_ids)

        # Process in chunks so each chunk's int32 offsets stay in range
        chunked_datasets = []
        for start_idx in trange(0, total_size, chunk_size, desc="Processing chunks"):
            end_idx = min(start_idx + chunk_size, total_size)

            # Extract the chunk of input_ids and the other metadata columns
            chunk_input_ids = input_ids[start_idx:end_idx]
            chunk_metadata = {key: value[start_idx:end_idx] for key, value in dataset_dict.items() if key != "input_ids"}

            # Create a ListArray for input_ids in this chunk (int64 values, int32 offsets)
            arrays = [pa.array(arr.astype(np.int64)) for arr in chunk_input_ids]
            offsets = np.cumsum([0] + [len(arr) for arr in chunk_input_ids], dtype=np.int32)
            list_array = pa.ListArray.from_arrays(pa.array(offsets, type=pa.int32()), pa.concat_arrays(arrays))

            # Create the chunk dataset
            chunk_metadata["input_ids"] = list_array
            pa_table = pa.table(chunk_metadata)
            chunk_dataset = Dataset(pa_table)
            chunked_datasets.append(chunk_dataset)

        # Merge the chunked datasets once, after all chunks are processed
        output_dataset = concatenate_datasets(chunked_datasets)
        return output_dataset


    def create_dataset(
        self,
        tokenized_cells,
        cell_metadata,
        use_generator=False,
        keep_uncropped_input_ids=False,
    ):
        print("Creating dataset.")
        # create dict for dataset creation
        dataset_dict = {"input_ids": tokenized_cells}
        if self.custom_attr_name_dict is not None:
            dataset_dict.update(cell_metadata)

        # create dataset
        if use_generator:

            def dict_generator():
                for i in range(len(tokenized_cells)):
                    yield {k: dataset_dict[k][i] for k in dataset_dict.keys()}

            output_dataset = Dataset.from_generator(dict_generator, num_proc=self.nproc)
        else:
            #output_dataset = Dataset.from_dict(dataset_dict)
            output_dataset = self.split_and_process_dataset(dataset_dict, chunk_size=100000)

I was wondering how this worked previously, since you have obviously used larger datasets without hitting this error.
Would it be useful to open a PR for this?

Thanks for your question! We did not encounter this ourselves, but others have, so the code was updated to use the dict_generator to avoid this error. We recommend setting use_generator to True if you encounter it. Your method may or may not be faster; we would recommend a timed comparison (x3) to determine this. If your method is faster, we can add a dict_chunk_size parameter for the dataset (the current chunk_size is for the anndata processing) and use your method when dict_chunk_size is not None.
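
For anyone hitting the same error, here is a minimal sketch of the recommended workaround, reusing the call from the top of this thread; the use_generator parameter appears in the tokenize_data signature shown in the traceback, and the import line and HOME variable are assumed to be set up as in the original post.

from geneformer import TranscriptomeTokenizer

tk = TranscriptomeTokenizer(
    custom_attr_name_dict={
        "cell_type": "cell_type",
        "project_name": "project_name",
        "adata_order": "adata_order",
        "property_subject_id": "property_subject_id"},
    nproc=8
)
# use_generator=True routes create_dataset through Dataset.from_generator
# instead of the single Dataset.from_dict call whose int32 offsets overflow
tk.tokenize_data(f"{HOME}/data/sc/preprocessed",
                 f"{HOME}/data/foundation_models/geneformer/tokenized",
                 "pbmcs_4m_cells",
                 file_format="h5ad",
                 use_generator=True)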

Linking prior discussions here: https://huggingface.co/ctheodoris/Geneformer/discussions/315

ctheodoris changed discussion status to closed
