# highdeff1/tokenizer.py
import re


def custom_tokenizer(text):
    """Lowercase, strip punctuation, split on whitespace, and wrap in question markers."""
    # Lowercase the text
    text = text.lower()
    # Remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Split the text into whitespace-delimited tokens
    tokens = text.split()
    # Add special tokens marking the start and end of the question
    tokens.insert(0, '<question>')
    tokens.append('<endquestion>')
    return tokens
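

# Minimal usage sketch (not part of the original upload; the sample string is
# hypothetical). It shows the expected output shape and illustrates that the
# angle brackets in '<question>'/'<endquestion>' survive only because the
# special tokens are inserted *after* the punctuation-stripping step.
if __name__ == '__main__':
    sample = "What's the capital of France?"
    print(custom_tokenizer(sample))
    # -> ['<question>', 'whats', 'the', 'capital', 'of', 'france', '<endquestion>']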