crystal-technologies
/

CRYSTAL-R1

Upload 1287 files

2d8da09 about 1 year ago

1.44 kB

	# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import pickle

	import pandas as pd
	from omegaconf import OmegaConf

	from nemo.collections.common.tokenizers.column_coder import ColumnCodes
	from nemo.core.config import hydra_runner
	from nemo.utils import logging


	@hydra_runner(config_path="conf", config_name="tabular_data_tokenizer")
	def main(cfg) -> None:
	logging.info("\n\n************ Experiment configuration *********")
	logging.info(OmegaConf.to_yaml(cfg))
	table = pd.read_csv(cfg.table_csv_file)
	example_arrays = {}
	for col in cfg.table_structure:
	col_name = col['name']
	example_arrays[col_name] = table[col_name].dropna().unique()
	cc = ColumnCodes.get_column_codes(cfg.table_structure, example_arrays)
	with open(cfg.tokenizer_file, 'wb') as handle:
	pickle.dump(cc, handle)


	if __name__ == '__main__':
	main()