import re

def custom_tokenizer(text):
    """Tokenize a question string: lowercase, strip punctuation,
    split on whitespace, and wrap the result in sentinel tokens."""
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Split the text into tokens
    tokens = text.split()

    # Add special tokens for the question
    tokens.insert(0, '<question>')
    tokens.append('<endquestion>')

    return tokens
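

# Usage sketch (hypothetical input, for illustration only): lowercasing and
# punctuation removal happen before the sentinel tokens are inserted, so the
# '<question>'/'<endquestion>' markers are never altered by the regex.
if __name__ == "__main__":
    tokens = custom_tokenizer("What is the capital of France?")
    print(tokens)
    # ['<question>', 'what', 'is', 'the', 'capital', 'of', 'france', '<endquestion>']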