# Spaces: Running
import json | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from transformers import ( | |
AutoTokenizer, | |
PreTrainedTokenizer, | |
PreTrainedTokenizerBase, | |
PreTrainedTokenizerFast, | |
) | |
# Load the ShareGPT JSON dumps analysed by the rest of this script.
file_paths = ["ShareGPT_V3_filtered.json", "ShareGPT_V3_filtered_500.json"]

# Each dataset is labelled by its path minus the last five characters
# (the ".json" extension of the paths above).
names = [path[:-5] for path in file_paths]

# data_lists[i] holds the parsed JSON content of file_paths[i].
data_lists = []
for path in file_paths:
    with open(path, "r", encoding="utf-8") as handle:
        data_lists.append(json.load(handle))

# Report how many top-level records each dataset contains.
for label, records in zip(names, data_lists):
    print(f"{label}: {len(records)}")
# Get prompt lengths using tokenizer
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

# For each dataset, collect the opening message of every conversation whose
# first turn comes from the human side.  Records with an empty
# "conversations" list are skipped: they have no first turn to inspect and
# indexing [0] on them would raise IndexError.
all_prompts = [
    [
        record["conversations"][0]["value"]
        for record in records
        if record["conversations"]
        and record["conversations"][0]["from"] == "human"
    ]
    for records in data_lists
]

# Tokenize each dataset's prompts in a single batched call.  A dataset with
# no qualifying prompts maps directly to an empty result instead of handing
# the tokenizer an empty batch.
all_token_ids_per_prompts = [
    tokenizer(prompts).input_ids if prompts else [] for prompts in all_prompts
]

# Prompt length = number of token ids the tokenizer produced for that prompt.
all_prompt_lens = [
    [len(token_ids) for token_ids in token_ids_per_prompt]
    for token_ids_per_prompt in all_token_ids_per_prompts
]
# Plotting the histograms: one PNG file per dataset.
for name, prompt_lens in zip(names, all_prompt_lens):
    if not prompt_lens:
        # min()/max() raise ValueError on an empty sequence, so a dataset
        # without prompts gets no histogram instead of aborting the script.
        print(f"{name}: no prompts to plot, skipping histogram")
        continue
    # One unit-wide bin per integer length.  The edges run up to max + 1
    # (hence the "+ 2" in range) so the largest length gets its own bin:
    # the final histogram bin is closed on both sides, so stopping the
    # edges at max would fold the counts for max - 1 and max together, and
    # a dataset with a single distinct length would have one edge, no bin.
    plt.hist(
        prompt_lens,
        bins=range(min(prompt_lens), max(prompt_lens) + 2),
        edgecolor="black",
    )
    plt.xlabel("Prompt Length (number of tokens)")
    plt.ylabel("Frequency")
    plt.title(f"Histogram of {name}")
    plt.savefig(f"{name}_distribution.png")
    # Close the figure so the next dataset starts from a clean canvas.
    plt.close()
# Plotting the CDF: every dataset is drawn as a step curve on one figure.
for name, prompt_lens in zip(names, all_prompt_lens):
    # np.unique returns the distinct lengths in ascending order together
    # with how often each one occurs, so no separate sort is required.
    lengths, counts = np.unique(prompt_lens, return_counts=True)
    # Running sum of the relative frequencies gives the empirical CDF.
    cumulative_probabilities = np.cumsum(counts / len(prompt_lens))
    plt.step(lengths, cumulative_probabilities, where="post", label=name)

plt.title("Cumulative Distribution Function (CDF) Overlayed")
plt.xlabel("Prompt Length (number of tokens)")
plt.ylabel("Cumulative Probability")
# Without a legend the per-dataset labels passed to plt.step are never
# rendered and the overlaid curves cannot be told apart.
plt.legend()
# The combined figure is written under the last dataset's name.
plt.savefig(f"{names[-1]}_cdf.png")
plt.close()