import re

def custom_tokenizer(text):
    # Lowercase the text for case-insensitive tokens
    text = text.lower()
    # Remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Split the text into whitespace-delimited tokens
    tokens = text.split()
    # Wrap the token list in special question markers
    tokens.insert(0, '<question>')
    tokens.append('<endquestion>')
    return tokens
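
# A quick usage sketch; the sample question below is illustrative,
# not part of the original snippet.
print(custom_tokenizer("What is the capital of France?"))
# -> ['<question>', 'what', 'is', 'the', 'capital', 'of', 'france', '<endquestion>']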