import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

from transformers import (
    LlamaConfig,
    LlamaForSequenceClassification,
    LlamaForCausalLM,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2LMHeadModel,
    PreTrainedTokenizerFast,
)
from tokenizers import Tokenizer
from tokenizers.models import BPE

from src.const import ACTION_SPACE, VOCAB


class RookTokenizer(PreTrainedTokenizerFast):
    # TODO: make it easier to use checkpoints from the hub
    # https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
    def __call__(self, *args, **kwargs):
        # The backend tokenizer would otherwise emit token_type_ids, which
        # these single-sequence models do not use.
        kwargs["return_token_type_ids"] = False
        return super().__call__(*args, **kwargs)


def make_model(config_dict, arch="llama"):
    if config_dict["finetuning_task"] == "text-classification":
        return make_model_clf(config_dict, arch=arch)
    elif config_dict["finetuning_task"] == "text-generation":
        return make_model_lm(config_dict, arch=arch)
    else:
        raise ValueError(f"Unknown config finetuning_task: {config_dict['finetuning_task']}")


def make_model_clf(config_dict, arch):
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForSequenceClassification
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2ForSequenceClassification
    else:
        raise ValueError(f"Unknown arch: {arch}")

    # pad vocab size to a multiple of 128
    config_dict["vocab_size"] = ((len(VOCAB) + 127) // 128) * 128
    config = Config(**config_dict)

    # one classification label per entry in the action space
    label_to_id = {v: i for i, v in enumerate(ACTION_SPACE)}
    config.num_labels = len(ACTION_SPACE)
    config.label2id = label_to_id
    config.id2label = {i: label for label, i in label_to_id.items()}

    model = Model(config=config)
    return model


def make_model_lm(config_dict, arch):
    if arch == "llama":
        Config = LlamaConfig
        Model = LlamaForCausalLM
    elif arch == "gpt2":
        Config = GPT2Config
        Model = GPT2LMHeadModel
    else:
        raise ValueError(f"Unknown arch: {arch}")

    # pad vocab size (VOCAB + action space + 4 marker tokens) to a multiple of 128
    config_dict["vocab_size"] = ((len(VOCAB) + len(ACTION_SPACE) + 4 + 127) // 128) * 128
    config = Config(**config_dict)
    model = Model(config=config)
    return model


def make_tokenizer(task="clf"):
    if task == "clf":
        return make_tokenizer_clf(model_max_length=78)
    elif task == "lm":
        return make_tokenizer_lm(model_max_length=79)
    elif task == "lm-cot":
        return make_tokenizer_lm(model_max_length=116)
    else:
        raise ValueError(f"Unknown task: {task}")


def make_tokenizer_clf(model_max_length):
    single_char_vocab = [e for e in VOCAB if len(e) == 1]
    multi_char_vocab = [e for e in VOCAB if len(e) > 1]
    # each multi-character vocab entry becomes a BPE merge of its characters
    merges = [tuple(e) for e in multi_char_vocab]
    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges,
    ))
    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False,
    )
    return fast_tokenizer


def make_tokenizer_lm(model_max_length):
    vocab = VOCAB + ACTION_SPACE
    vocab += ["[OPTIONS]", "[VALUES]", "[ACTION]", "0000"]
    single_char_vocab = [e for e in vocab if len(e) == 1]
    multi_char_vocab = [e for e in vocab if len(e) > 1]
    merges = []
    tokenizer = Tokenizer(BPE(
        vocab=dict(zip(single_char_vocab, range(len(single_char_vocab)))),
        merges=merges,
    ))
    # multi-character entries (moves, section markers) are added as atomic special tokens
    tokenizer.add_special_tokens(multi_char_vocab)
    fast_tokenizer = RookTokenizer(
        tokenizer_object=tokenizer,
        model_max_length=model_max_length,
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        clean_up_tokenization_spaces=False,
    )
    return fast_tokenizer
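
# Example usage: a minimal smoke-test sketch, not part of the training
# pipeline. The architecture values below are illustrative assumptions
# (a deliberately tiny model); real runs would take them from the
# project's training configs.
if __name__ == "__main__":
    # classification setup: one logit per move in ACTION_SPACE
    clf_tokenizer = make_tokenizer(task="clf")
    clf_config = {
        "finetuning_task": "text-classification",  # routes to make_model_clf
        "hidden_size": 256,              # assumed toy size
        "intermediate_size": 512,        # assumed
        "num_hidden_layers": 2,          # assumed
        "num_attention_heads": 4,        # assumed
        "max_position_embeddings": 78,   # matches the clf model_max_length
        "pad_token_id": clf_tokenizer.pad_token_id,
    }
    clf_model = make_model(clf_config, arch="llama")
    print(f"clf parameters: {clf_model.num_parameters():,}")

    # causal-LM setup over the extended vocab (VOCAB + ACTION_SPACE + markers)
    lm_tokenizer = make_tokenizer(task="lm")
    lm_config = {
        "finetuning_task": "text-generation",  # routes to make_model_lm
        "hidden_size": 256,              # assumed toy size
        "intermediate_size": 512,        # assumed
        "num_hidden_layers": 2,          # assumed
        "num_attention_heads": 4,        # assumed
        "max_position_embeddings": 79,   # matches the lm model_max_length
        "pad_token_id": lm_tokenizer.pad_token_id,
    }
    lm_model = make_model(lm_config, arch="llama")
    print(f"lm parameters: {lm_model.num_parameters():,}")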