nanotron
/

mistral-nanotron

nanotron

Model card Files Files and versions Community

nouamanetazi HF staff commited on Jan 30

Commit

59010b3

•

1 Parent(s): 869e25b

Delete config_tiny_mistral.yaml

Browse files

Files changed (1) hide show

config_tiny_mistral.yaml +0 -90

config_tiny_mistral.yaml DELETED Viewed

@@ -1,90 +0,0 @@
-checkpoints:
-  checkpoint_interval: 10
-  checkpoints_path: /fsx/nouamane/projects/nanotron/checkpoints
-  checkpoints_path_is_shared_file_system: false
-  resume_checkpoint_path: null
-  save_initial_state: false
-data:
-  dataset:
-    dataset_overwrite_cache: false
-    dataset_processing_num_proc_per_process: 1
-    hf_dataset_config_name: null
-    hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
-    hf_dataset_splits: train
-    text_column_name: completion
-  num_loading_workers: 1
-  seed: 42
-general:
-  benchmark_csv_path: null
-  consumed_train_samples: null
-  ignore_sanity_checks: false
-  project: debug
-  run: tiny_mistral
-  seed: 42
-  step: null
-logging:
-  iteration_step_info_interval: 1
-  log_level: info
-  log_level_replica: info
-model:
-  ddp_bucket_cap_mb: 25
-  dtype: bfloat16
-  init_method:
-    std: 0.025
-  make_vocab_size_divisible_by: 1
-  model_config:
-    bos_token_id: 1
-    eos_token_id: 2
-    hidden_act: silu
-    hidden_size: 16
-    initializer_range: 0.02
-    intermediate_size: 64
-    is_mistral_config: true
-    max_position_embeddings: 256
-    num_attention_heads: 4
-    num_hidden_layers: 2
-    num_key_value_heads: 4
-    pad_token_id: null
-    pretraining_tp: 1
-    rms_norm_eps: 1.0e-05
-    rope_scaling: null
-    tie_word_embeddings: true
-    use_cache: true
-    vocab_size: 256
-optimizer:
-  accumulate_grad_in_fp32: true
-  adam_beta1: 0.9
-  adam_beta2: 0.95
-  adam_eps: 1.0e-08
-  clip_grad: 1.0
-  learning_rate_scheduler:
-    learning_rate: 0.0003
-    lr_decay_steps: 8
-    lr_decay_style: cosine
-    lr_warmup_steps: 2
-    lr_warmup_style: linear
-    min_decay_lr: 1.0e-05
-  torch_adam_is_fused: true
-  weight_decay: 0.01
-  zero_stage: 0
-parallelism:
-  dp: 2
-  pp: 2
-  pp_engine: 1f1b
-  recompute_granularity: SELECTIVE
-  tp: 2
-  tp_linear_async_communication: true
-  tp_mode: REDUCE_SCATTER
-profiler: null
-tokenizer:
-  tokenizer_max_length: null
-  tokenizer_name_or_path: gpt2
-  tokenizer_revision: null
-tokens:
-  batch_accumulation_per_replica: 1
-  limit_test_batches: 0
-  limit_val_batches: 0
-  micro_batch_size: 2
-  sequence_length: 32
-  train_steps: 10
-  val_check_interval: -1