Davide Ghilardi committed
Commit: 8c1164c
Parent: aa1cb38

Initial commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50)
  1. layer_0/final_61440000/cfg.json +1 -0
  2. layer_0/final_61440000/h3i20ez6/final_61440000/cfg.json +1 -0
  3. layer_0/final_61440000/h3i20ez6/final_61440000/sparsity.safetensors +3 -0
  4. layer_0/final_61440000/sae_weights.safetensors +3 -0
  5. layer_0/final_61440000/sparsity.safetensors +3 -0
  6. layer_1/final_61440000/cfg.json +1 -0
  7. layer_1/final_61440000/sparsity.safetensors +3 -0
  8. layer_10/final_61440000/cfg.json +1 -0
  9. layer_10/final_61440000/sae_weights.safetensors +3 -0
  10. layer_10/final_61440000/sparsity.safetensors +3 -0
  11. layer_11/final_61440000/cfg.json +1 -0
  12. layer_11/final_61440000/sae_weights.safetensors +3 -0
  13. layer_11/final_61440000/sparsity.safetensors +3 -0
  14. layer_2/final_61440000/cfg.json +1 -0
  15. layer_2/final_61440000/sparsity.safetensors +3 -0
  16. layer_3/final_61440000/cfg.json +1 -0
  17. layer_3/final_61440000/sae_weights.safetensors +3 -0
  18. layer_3/final_61440000/sparsity.safetensors +3 -0
  19. layer_4/.DS_Store +0 -0
  20. layer_4/final_61440000/cfg.json +1 -0
  21. layer_4/final_61440000/sparsity.safetensors +3 -0
  22. layer_5/final_61440000/cfg.json +1 -0
  23. layer_5/final_61440000/sae_weights.safetensors +3 -0
  24. layer_5/final_61440000/sparsity.safetensors +3 -0
  25. layer_6/final_61440000/cfg.json +1 -0
  26. layer_6/final_61440000/sparsity.safetensors +3 -0
  27. layer_7/final_61440000/cfg.json +1 -0
  28. layer_7/final_61440000/sae_weights.safetensors +3 -0
  29. layer_7/final_61440000/sparsity.safetensors +3 -0
  30. layer_8/final_61440000/cfg.json +1 -0
  31. layer_8/final_61440000/sae_weights.safetensors +3 -0
  32. layer_8/final_61440000/sparsity.safetensors +3 -0
  33. layer_9/final_61440000/cfg.json +1 -0
  34. layer_9/final_61440000/sparsity.safetensors +3 -0
  35. transfer_layer_1/byp42y0x/final_61440000/cfg.json +1 -0
  36. transfer_layer_1/byp42y0x/final_61440000/sparsity.safetensors +3 -0
  37. transfer_layer_10/pl4vbzcz/final_61440000/cfg.json +1 -0
  38. transfer_layer_10/pl4vbzcz/final_61440000/sparsity.safetensors +3 -0
  39. transfer_layer_11/2ie78it4/final_61440000/cfg.json +1 -0
  40. transfer_layer_11/2ie78it4/final_61440000/sparsity.safetensors +3 -0
  41. transfer_layer_2/ahj749k3/final_61440000/cfg.json +1 -0
  42. transfer_layer_2/ahj749k3/final_61440000/sparsity.safetensors +3 -0
  43. transfer_layer_3/hpzj1ceq/final_61440000/cfg.json +1 -0
  44. transfer_layer_3/hpzj1ceq/final_61440000/sparsity.safetensors +3 -0
  45. transfer_layer_4/ajth2o0a/final_61440000/cfg.json +1 -0
  46. transfer_layer_4/ajth2o0a/final_61440000/sparsity.safetensors +3 -0
  47. transfer_layer_5/ilva7jii/final_61440000/cfg.json +1 -0
  48. transfer_layer_5/ilva7jii/final_61440000/sparsity.safetensors +3 -0
  49. transfer_layer_6/g99cepn0/final_61440000/cfg.json +1 -0
  50. transfer_layer_6/g99cepn0/final_61440000/sparsity.safetensors +3 -0
layer_0/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.0.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 0, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/rjpnxi8e", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_0/final_61440000/h3i20ez6/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.1.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 1, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/layer_0/final_61440000//h3i20ez6", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_0/final_61440000/h3i20ez6/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb5168849c80c6bbd7b93c4560ed3535bf3c7645d3d7660895ae2fae52f3e05a
+ size 196688
layer_0/final_61440000/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a436ebc324b3a7ab23e1acbc90685668a2523485260ef87e5104a2d11cf1aa5d
+ size 302386584
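The *.safetensors entries above and below are Git LFS pointer files, not the tensors themselves: each records the LFS spec version, the SHA-256 of the real payload, and its size in bytes (~302 MB per sae_weights file, ~197 kB per sparsity file). As a rough plausibility check, assuming a standard SAE parameter set (encoder, decoder, two biases) stored in float32, which the repo does not state explicitly:

```python
d_in, d_sae = 768, 49152                    # from cfg.json
n_params = 2 * d_in * d_sae + d_in + d_sae  # W_enc, W_dec, b_enc, b_dec (assumed layout)
print(4 * n_params)  # 302189568 float32 bytes, close to the 302386584-byte
                     # sae_weights files (remainder: header / extra tensors)
print(4 * d_sae)     # 196608 bytes, matching the ~196688-byte sparsity
                     # files (one statistic per feature, assumed)
```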
layer_0/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4eba37ad1f8cc83219c05d813f64849353e1fe1fa19c0f6d8b7eb6751c9b8140
+ size 196688
layer_1/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.1.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 1, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/eq1spjmd", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_1/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fde6f2dab2f4319816c694e04139d9c12b027bf9fa8332d52c4ef4447c9b2ee
+ size 196688
layer_10/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.10.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 10, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/e37amtur", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_10/final_61440000/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c5f881eee4179b2ef7860be9be1e629709cf8ad3523e2d3504f6cef83e371ba
+ size 302386584
layer_10/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44576ea5ef70b8eb25243b13bc9ac377f7bd9011772567e41ca9ad099902c92e
+ size 196688
layer_11/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.11.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 11, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/k3pe8pbh", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_11/final_61440000/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9613090acc18f008741bc0b820cad20e0ccf29866cf86aaa148a7bf35858eba2
+ size 302386584
layer_11/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9db5d7131c3695cd3086487cfb104f22f0e64b12d6824ba9d0d370c0067df551
+ size 196688
layer_2/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.2.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 2, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/jmepy8tw", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_2/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d9f70de56c64e78693a8d35bb6f13efc6a02e23ac0887e7970043abae766da3
+ size 196688
layer_3/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.3.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 3, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/bl1t1nc8", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_3/final_61440000/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ae390265ce58f75ef73f332f38f12516e3899f90f2b889fff3213f2dce7ae9c
+ size 302386584
layer_3/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:499f3f734fc66fa84e5ada6113d63cfa121b7911cbaaf839a2687ecfb3502a7c
+ size 196688
layer_4/.DS_Store ADDED
Binary file (6.15 kB)
layer_4/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.4.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 4, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/mhp05ibg", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_4/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c0b4124624d8320ec06e5c48edc75f48d14e5553124e13b4e7e06b4e5919c33
+ size 196688
layer_5/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.5.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 5, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/s2bdiskq", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_5/final_61440000/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f60cfe2bd6e62a40e0d4b4a7c9d35e7cd87487ef4925c79903dd589ab322d5a5
+ size 302386584
layer_5/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:160f0c422bc29cf617d5eca41589772d60aa720cfd16db9dbb088cacf4bc53f0
+ size 196688
layer_6/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.6.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 6, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/elr38yp7", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_6/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2a8049eb2fca10e901b4a640cc20fc1e1cae5c5f2066348af5d134c80fff174
+ size 196688
layer_7/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.7.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 7, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/w8k4xulq", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_7/final_61440000/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:777459fdd18244ed96474eafc2046b833358b6c2a3b9d20495ea799fadcd41d3
+ size 302386584
layer_7/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e7ba95535d4c00a969053879c94252c2424ee145acfba5fad835c2227b8571a
+ size 196688
layer_8/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.8.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 8, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/43bteiaw", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_8/final_61440000/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:965bfdb200915ba0bc98dab4b3614d1326b032e4ebe3f1ebe56e94119bd312a5
+ size 302386584
layer_8/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4b39eb4a466308f3e243446e8a18ae6f7d2bdcd8d76e14415d6e988a13de185
+ size 196688
layer_9/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.9.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 9, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/d5lwdd1y", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
layer_9/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0af8fbaff4fe46e6a1de9b8cb7d10cec10d0d840216cf7419383ac1b47b7ecfe
+ size 196688
transfer_layer_1/byp42y0x/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.1.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 1, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/rjpnxi8e/byp42y0x", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_1/byp42y0x/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ada17ec976d970ef0bd69b467bec390d87dfe5629f22f8def9a8b1c513e93723
+ size 196688
transfer_layer_10/pl4vbzcz/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.10.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 10, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/d5lwdd1y/pl4vbzcz", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_10/pl4vbzcz/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:408a1a800a2ea00fcdd644cded70b7618645a06d6fbe7ea82064533a68fa641f
+ size 196688
transfer_layer_11/2ie78it4/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.11.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 11, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/e37amtur/2ie78it4", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_11/2ie78it4/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21b21888ecbdc357a5ce8a3b762b2ef616f1257a7a6b5c97e1e072a9232a49da
+ size 196688
transfer_layer_2/ahj749k3/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.2.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 2, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/eq1spjmd/ahj749k3", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_2/ahj749k3/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2c89dc9a06ad0a646f2be9fdd0a587d2dd3721c76362d90854d9490c4c2d61b
+ size 196688
transfer_layer_3/hpzj1ceq/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.3.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 3, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/jmepy8tw/hpzj1ceq", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_3/hpzj1ceq/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35db127e31cff62b523a0099b689259437f02028c43bb8b8ae04f5ccec49cf1c
+ size 196688
transfer_layer_4/ajth2o0a/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.4.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 4, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/bl1t1nc8/ajth2o0a", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_4/ajth2o0a/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:658b43b9807541b84c641b245ae27b79d312fa4cb28a0043e283a06f1ac03deb
+ size 196688
transfer_layer_5/ilva7jii/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.5.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 5, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/mhp05ibg/ilva7jii", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_5/ilva7jii/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82020ebeda0c242bd682f6c5f8cde7576b33974906b14fbf45ba8fc13c5f326a
+ size 196688
transfer_layer_6/g99cepn0/final_61440000/cfg.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "gpt2", "model_class_name": "HookedTransformer", "hook_point": "blocks.6.hook_attn_out", "hook_point_eval": "blocks.{layer}.attn.pattern", "hook_point_layer": 6, "hook_point_head_index": null, "dataset_path": "apollo-research/Skylion007-openwebtext-tokenizer-gpt2", "streaming": true, "is_dataset_tokenized": true, "context_size": 128, "use_cached_activations": false, "cached_activations_path": null, "d_in": 768, "d_sae": 49152, "b_dec_init_method": "zeros", "expansion_factor": 64, "activation_fn": "relu", "normalize_sae_decoder": false, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": true, "init_encoder_as_decoder_transpose": true, "n_batches_in_buffer": 128, "training_tokens": 61440000, "finetuning_tokens": 0, "store_batch_size_prompts": 32, "train_batch_size_tokens": 4096, "normalize_activations": false, "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "torch.float32", "prepend_bos": true, "autocast": true, "autocast_lm": false, "compile_llm": true, "llm_compilation_mode": null, "compile_sae": true, "sae_compilation_mode": null, "adam_beta1": 0.9, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1, "lp_norm": 1.0, "scale_sparsity_penalty_by_decoder_norm": true, "l1_warm_up_steps": 1500, "lr": 5e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 5e-06, "lr_decay_steps": 0, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 1000, "dead_feature_window": 1000, "dead_feature_threshold": 0.0001, "n_eval_batches": 10, "eval_batch_size_prompts": 2, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "transfer-learning", "wandb_id": null, "run_name": "49152-L1-1-LR-5e-05-Tokens-6.144e+07", "wandb_entity": null, "wandb_log_frequency": 50, "eval_every_n_wandb_logs": 10, "resume": false, "n_checkpoints": 0, "checkpoint_path": "sae-transfer-learning/first-attn-transfer-gpt2/s2bdiskq/g99cepn0", "verbose": false, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "2.1.3", "sae_lens_training_version": "2.1.3", "tokens_per_buffer": 67108864}
transfer_layer_6/g99cepn0/final_61440000/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b230fa33b3cbbe268e2d2b0bf74b40367ccaf019cbbdbb5d208397f64d5e281
+ size 196688