# imports import pandas as pd import re # replace text with multiple replacements def replace_text(string, dict_of_replacements): ''' replace multiple substrings in a string with a dictionary of replacements to be used if replacements are fixed and do not require regex as replace() is faster than re.sub() for regex replacements use clean_text() arguments: string (str): string for replacement dict_of_replacements (dict): dictionary of substring to replace and replacement e.g. {'to replace this':'with this',...} returns: a string with substrings replaced ''' # loop through dict for key, value in dict_of_replacements.items(): # perform replacement string = string.replace(key, value) # return return string # clean text string def clean_text(text_string, list_of_replacements, lowercase=True, ignorecase=False): ''' clean text string lower case string regex sub user defined patterns with user defined replacements arguments: text_string (str): text string to clean list_of_replacements (list): a list of tuples consisting of regex pattern and replacement value e.g. [('[^a-z\s]+', ''), ...] lowercase (bool): default to True, if True, convert text to lowercase ignorecase (bool): default to False, if True, ignore case when applying re.sub() returns: a cleaned text string ''' # check lowercase argument if lowercase: # lower case text string clean_string = text_string.lower() else: # keep text as is clean_string = text_string if ignorecase: # loop through each pattern and replacement for pattern, replacement in list_of_replacements: # replace defined pattern with defined replacement value clean_string = re.sub(pattern, replacement, clean_string, flags=re.IGNORECASE) else: # loop through each pattern and replacement for pattern, replacement in list_of_replacements: # replace defined pattern with defined replacement value clean_string = re.sub(pattern, replacement, clean_string) # return return clean_string # convert transformer model zero shot classification prediction into dataframe def convert_zero_shot_classification_output_to_dataframe(model_output): ''' convert zero shot classification output to dataframe model's prediction is a list dictionaries e.g. each prediction consists of the sequence being predicted, the user defined labels, and the respective scores. [ {'sequence': 'the organisation is generally...', 'labels': ['rewards', 'resourcing', 'leadership'], 'scores': [0.905086100101471, 0.06712279468774796, 0.027791114524006844]}, ... ] the function pairs the label and scores and stores it as a dataframe it also identifies the label with the highest score arguments: model_output (list): output from transformer.pipeline(task='zero-shot-classification') returns: a dataframe of label and scores for each prediction ''' # store results as dataframe results = pd.DataFrame(model_output) # zip labels and scores as dictionary results['labels_scores'] = results.apply(lambda x: dict(zip(x['labels'], x['scores'])), axis=1) # convert labels_scores to dataframe labels_scores = pd.json_normalize(results['labels_scores']) # get label of maximum score as new column labels_scores['label'] = labels_scores.idxmax(axis=1) # get score of maximum score as new column labels_scores['score'] = labels_scores.max(axis=1) # concat labels_scores to results results = pd.concat([results, labels_scores], axis=1) # drop unused columns results = results.drop(['labels', 'scores'], axis=1) # return return results # convert transformer model sentiment classification prediction into dataframe def convert_sentiment_classification_output_to_dataframe(text_input, model_output): ''' convert sentiment classification output into a dataframe the model used distilbert-base-uncased-finetuned-sst-2-english outputs a list of lists with two dictionaries, within each dictionary is a label negative or postive and the respective score [ [ {'label': 'NEGATIVE', 'score': 0.18449656665325165}, {'label': 'POSITIVE', 'score': 0.8155034780502319} ], ... ] the scores sum up to 1, and we extract only the positive score in this function, append the scores to the model's input and return a dataframe arguments: text_input (list): a list of sequences that is input for the model model_output (list): a list of labels and scores return: a dataframe of sequences and sentiment score ''' # store model positive scores as dataframe results = pd.DataFrame(model_output)[[1]] # get score from column results = results[1].apply(lambda x: x.get('score')) # store input sequences and scores as dataframe results = pd.DataFrame({'sequence':text_input, 'score':results}) # return return results