import re

def custom_tokenizer(text):
    """Tokenize a question string: lowercase, strip punctuation,
    split on whitespace, and wrap the result in sentinel tokens."""
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Split the text into tokens
    tokens = text.split()

    # Add special tokens for the question
    tokens.insert(0, '<question>')
    tokens.append('<endquestion>')

    return tokens
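

# Usage sketch (hypothetical input, for illustration only): lowercasing and
# punctuation removal happen before the sentinel tokens are inserted, so the
# '<question>'/'<endquestion>' markers are never altered by the regex.
if __name__ == "__main__":
    tokens = custom_tokenizer("What is the capital of France?")
    print(tokens)
    # ['<question>', 'what', 'is', 'the', 'capital', 'of', 'france', '<endquestion>']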