File size: 1,274 Bytes
a9fe74f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import csv 
import tiktoken

# Each entry is a two-item list: [question text, ten passages joined by spaces].
questions_and_passages = []

# newline="" is what the csv module expects so that newlines embedded in
# quoted fields are parsed as part of the field rather than as row breaks.
# NOTE(review): the text encoding is left at the platform default, as in the
# original script; pass encoding="utf-8" if the file is known to be UTF-8.
with open("sample.csv", newline="") as f:
    reader = csv.reader(f)
    # Skip the header through the reader (not the raw file) so a header that
    # spans several physical lines is dropped as one record; the default
    # keeps an empty file from raising StopIteration.
    next(reader, None)
    for row in reader:
        # Column 1 is an optional prompt and column 2 the question.
        question = (row[1] + " " + row[2]).strip()
        # Columns 9..18 hold the ten passages, joined with single spaces.
        # Explicit indexing (rather than a slice) keeps the IndexError on
        # rows that have fewer columns than expected.
        passages = " ".join(row[i] for i in range(9, 19))
        questions_and_passages.append([question, passages])

# Tokenizer matching the gpt-4o model, used for every count below.
enc = tiktoken.encoding_for_model("gpt-4o")

# Without any rows the averages below would divide by zero; stop with a
# clear message instead of a ZeroDivisionError traceback.
if not questions_and_passages:
    raise SystemExit("No data rows loaded from sample.csv; cannot compute token statistics.")

question_tokens = 0          # running total of question token counts
question_passage_tokens = 0  # running total of question + passages token counts
max_qt = 0                   # longest question, in tokens
max_pt = 0                   # longest ten-passage text, in tokens
max_qpt = 0                  # longest question + passages combination, in tokens

for entry in questions_and_passages:
    # disallowed_special=() makes text that happens to look like a special
    # token (e.g. "<|endoftext|>") be encoded as ordinary text instead of
    # raising ValueError; counts for all other text are unaffected.
    qt = len(enc.encode(entry[0], disallowed_special=()))
    pt = len(enc.encode(entry[1], disallowed_special=()))

    question_tokens += qt
    question_passage_tokens += qt + pt

    max_qt = max(max_qt, qt)
    max_pt = max(max_pt, pt)
    max_qpt = max(max_qpt, qt + pt)

entry_count = len(questions_and_passages)

print("Average question length, gpt-4o tokens: " + str(question_tokens / entry_count))
print("Longest question (tokens): " + str(max_qt))
print("Average question + 10 passages length, gpt-4o tokens: " + str(question_passage_tokens / entry_count))
print("Longest set of ten passages (tokens): " + str(max_pt))
print("Longest combination of question and passages: " + str(max_qpt))