|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Processing data for megatron pretraining.""" |
|
|
|
import argparse |
|
import glob |
|
|
|
from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import build_index_files |
|
|
|
|
|
def main():
    """Build index files for the text files named on the command line.

    Each positional argument is expanded with ``glob.glob``, so it may be a
    literal path or a glob pattern. The matched paths are forwarded to
    ``build_index_files`` together with ``--newline_int`` and ``--workers``.

    Exits through ``parser.error`` (usage message, status 2) when none of the
    given patterns matches an existing file.
    """
    parser = argparse.ArgumentParser(description="Builds index files for a list of text files",)
    parser.add_argument(
        'dataset_paths', type=str, nargs='+', help='Input text files (support glob)',
    )
    parser.add_argument(
        '--newline_int', type=int, default=10, help='Int value to split text (default: newline "\\n")',
    )
    parser.add_argument(
        '--workers',
        type=int,
        default=None,
        help='Number of workers to parse files in parallel (default: max(cpu num // 2, 1))',
    )
    args = parser.parse_args()

    # Expand every pattern. A dict is used as an ordered set so that a file
    # matched by several overlapping patterns is only handed over once, while
    # the first-seen order of the matches is preserved.
    matched = {}
    for pattern in args.dataset_paths:
        matched.update(dict.fromkeys(glob.glob(pattern)))
    dataset_paths = list(matched)

    # glob.glob returns an empty list for a pattern that matches nothing, so a
    # typo in the path would otherwise go unnoticed until much later.
    if not dataset_paths:
        parser.error(f"no files matched: {', '.join(args.dataset_paths)}")

    build_index_files(
        dataset_paths=dataset_paths, newline_int=args.newline_int, workers=args.workers,
    )
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|