import re


def custom_tokenizer(text):
    """Tokenize a question string and wrap it in special marker tokens."""
    # Lowercase the text
    text = text.lower()
    # Remove punctuation (any character that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Split the text into whitespace-delimited tokens
    tokens = text.split()
    # Add special tokens marking the start and end of the question
    tokens.insert(0, '<question>')
    tokens.append('<endquestion>')
    return tokens
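

# A minimal usage sketch; the sample question below is hypothetical, chosen
# only to show how punctuation removal and the special markers behave:
if __name__ == "__main__":
    question = "What's the capital of France?"
    print(custom_tokenizer(question))
    # ['<question>', 'whats', 'the', 'capital', 'of', 'france', '<endquestion>']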