{"shakespeare_char": {"means": {"final_train_loss_mean": 1.238865852355957, "best_val_loss_mean": 1.4940879344940186, "total_train_time_mean": 87.57891074816386, "avg_inference_tokens_per_second_mean": 534.558911601877}, "stderrs": {"final_train_loss_stderr": 0.0014499765776533108, "best_val_loss_stderr": 0.0009075531854184589, "total_train_time_stderr": 0.5840464251435701, "avg_inference_tokens_per_second_stderr": 1.2804654723649376}, "final_info_dict": {"final_train_loss": [1.2369663715362549, 1.2448828220367432, 1.234748363494873], "best_val_loss": [1.4925225973129272, 1.4979171752929688, 1.4918240308761597], "total_train_time": [90.01761507987976, 85.97938895225525, 86.73972821235657], "avg_inference_tokens_per_second": [529.1281715071688, 537.1526832623192, 537.395880036143]}}, "enwik8": {"means": {"final_train_loss_mean": 1.159803867340088, "best_val_loss_mean": 1.0032024383544922, "total_train_time_mean": 969.5262658596039, "avg_inference_tokens_per_second_mean": 531.1808650137853}, "stderrs": {"final_train_loss_stderr": 0.0, "best_val_loss_stderr": 0.0, "total_train_time_stderr": 0.0, "avg_inference_tokens_per_second_stderr": 0.0}, "final_info_dict": {"final_train_loss": [1.159803867340088], "best_val_loss": [1.0032024383544922], "total_train_time": [969.5262658596039], "avg_inference_tokens_per_second": [531.1808650137853]}}, "text8": {"means": {"final_train_loss_mean": 1.11098313331604, "best_val_loss_mean": 0.9339989423751831, "total_train_time_mean": 966.2461061477661, "avg_inference_tokens_per_second_mean": 530.6660717341676}, "stderrs": {"final_train_loss_stderr": 0.0, "best_val_loss_stderr": 0.0, "total_train_time_stderr": 0.0, "avg_inference_tokens_per_second_stderr": 0.0}, "final_info_dict": {"final_train_loss": [1.11098313331604], "best_val_loss": [0.9339989423751831], "total_train_time": [966.2461061477661], "avg_inference_tokens_per_second": [530.6660717341676]}}} |