import codecs

from SmilesPE.tokenizer import SPE_Tokenizer


def load_vocabulary_to_dict(vocabulary_path):
    """Map each token in the vocabulary file to its line index."""
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            token = line.strip().split()[0]  # Assuming the first item on each line is the token
            vocab_dict[token] = index  # Or use the token itself as the ID if preferable
    return vocab_dict


def smilespe_tokenizer(smiles_string, vocab_dict):
    # Initialize SPE_Tokenizer with the vocabulary; the merge rules are read
    # during construction, so the file can be closed right afterwards
    with codecs.open('chembl_smiles_tokenizer30000.txt', 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)
    # SPE_Tokenizer.tokenize returns a single space-separated string,
    # so split it into a list of tokens before looking up IDs
    tokens = spe.tokenize(smiles_string).split()
    # Convert tokens to IDs using the vocab_dict, skipping unknown tokens
    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]
    return tokens, token_ids


# Load the vocabulary into a dictionary
# vocab_path = 'chembl_smiles_tokenizer30000.txt'
# vocab_dict = load_vocabulary_to_dict(vocab_path)

# Example usage
# smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
# tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict)
# print("Tokens:", tokens)
# print("Token IDs:", token_ids)
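
# A reusable variant (a sketch; the class name SmilesPETokenizer is an
# illustrative choice, not part of the SmilesPE API): since SPE_Tokenizer
# parses its merge rules once at construction, building the tokenizer a
# single time avoids reopening the vocabulary file on every call.
class SmilesPETokenizer:
    def __init__(self, vocabulary_path):
        with codecs.open(vocabulary_path, 'r', 'utf-8') as spe_vob:
            self.spe = SPE_Tokenizer(spe_vob)
        self.vocab_dict = load_vocabulary_to_dict(vocabulary_path)

    def __call__(self, smiles_string):
        tokens = self.spe.tokenize(smiles_string).split()
        token_ids = [self.vocab_dict[t] for t in tokens if t in self.vocab_dict]
        return tokens, token_ids


# tokenizer = SmilesPETokenizer('chembl_smiles_tokenizer30000.txt')
# tokens, token_ids = tokenizer('Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1')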