Update model with 2x training data and more efficient vocabulary

Files changed (14) hide show

README.md CHANGED Viewed

@@ -2,16 +2,22 @@
 library_name: transformers
 license: apache-2.0
 datasets:
-- bigscience-data/roots_zh-tw_wikipedia
-- bigscience-data/roots_en_wikipedia
 language:
 - zh
 ---
 # Model Card for Chinese-OpenELM-270M
-Finetuned from [apple/OpenELM-270M](https://huggingface.co/apple/OpenELM-270M):
-* Extended vocabulary from 32000 to 75873 with sentencepiece bpe trained on [bigscience-data/roots_zh-tw_wikipedia](https://huggingface.co/datasets/bigscience-data/roots_zh-tw_wikipedia) and used average embedding to initialize the new embeddings.
-* Continual pre-trained with a mix of [bigscience-data/roots_zh-tw_wikipedia](https://huggingface.co/datasets/bigscience-data/roots_zh-tw_wikipedia) and [bigscience-data/roots_en_wikipedia](https://huggingface.co/datasets/bigscience-data/roots_en_wikipedia).
-* Evaluation ppl = 1.6644828403646825 (split 3% training data as evaluation set)

 library_name: transformers
 license: apache-2.0
 datasets:
+- liswei/zhtw-news-and-articles-2B
+base_model: apple/OpenELM-270M
 language:
 - zh
 ---
 # Model Card for Chinese-OpenELM-270M
+Continually pre-trained from [apple/OpenELM-270M](https://huggingface.co/apple/OpenELM-270M) with [liswei/zhtw-news-and-articles-2B](https://huggingface.co/datasets/liswei/zhtw-news-and-articles-2B):
+* Extended vocabulary from 32000 to 61758 tokens with additional Traditional Chinese characters.
+  * The tokenizer was trained on [liswei/zhtw-news-and-articles-2B](https://huggingface.co/datasets/liswei/zhtw-news-and-articles-2B) and then pruned from 96000 to 61758 tokens while maintaining 95% coverage of the pre-training dataset.
+  * Additional token embeddings are initialized with the mean vector of existing embeddings.
+* Traditional Chinese perplexity = 1.6871 on a held-out evaluation dataset.
+* Applied [GaLore](https://arxiv.org/abs/2403.03507) for efficient training with the following hyperparameters:
+  * Rank: 1024
+  * Scale: 4.0
+  * Update interval: 200
+  * Layer-wise training: False

all_results.json ADDED Viewed

+{
+    "epoch": 2.999944821497545,
+    "total_flos": 3.6968537365004943e+18,
+    "train_loss": 3.121767396105489,
+    "train_runtime": 856366.0469,
+    "train_samples_per_second": 3.047,
+    "train_steps_per_second": 0.048
+}

config.json CHANGED Viewed

@@ -1,12 +1,13 @@
 {
-  "_name_or_path": "./OpenELM-270M-Llama-2-Chinese-7b-hf-30K",
   "activation_fn_name": "swish",
   "architectures": [
     "OpenELMForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "configuration_openelm.OpenELMConfig",
-    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
   },
   "bos_token_id": 1,
   "eos_token_id": 2,
@@ -84,6 +85,6 @@
   "share_input_output_layers": true,
   "torch_dtype": "float32",
   "transformers_version": "4.40.1",
-  "use_cache": true,
-  "vocab_size": 75873
 }

 {
+  "_name_or_path": "saves/OpenELM-270M/zh-base",
   "activation_fn_name": "swish",
   "architectures": [
     "OpenELMForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "configuration_openelm.OpenELMConfig",
+    "AutoModel": "modeling_openelm.OpenELMForCausalLM",
+    "AutoModelForCausalLM": "apple/OpenELM-270M--modeling_openelm.OpenELMForCausalLM"
   },
   "bos_token_id": 1,
   "eos_token_id": 2,
   "share_input_output_layers": true,
   "torch_dtype": "float32",
   "transformers_version": "4.40.1",
+  "use_cache": false,
+  "vocab_size": 61758
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e893122321808c6295424c777b0347c56822371a2143a5d1c0a38096e799bd17
-size 1310752944

 version https://git-lfs.github.com/spec/v1
+oid sha256:45254a3c27ee9d76e391ca4117b11f8e9bf7fb627abb882c67e421dfbbafbad6
+size 1238484144

runs/May14_00-15-56_coconut/events.out.tfevents.1715616965.coconut.1514196.0 ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:74bc86dfc2d4bd97da6ae6ee4e20f61c7371c346902086624e7ee9d4fe426e99
+size 875943

special_tokens_map.json CHANGED Viewed

@@ -13,6 +13,7 @@
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,

     "rstrip": false,
     "single_word": false
   },
+  "pad_token": "</s>",
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eec12a397f4191b45c4698b454b3c6582ebc0f52552e0291f6875a053e1c8319
-size 1200584

 version https://git-lfs.github.com/spec/v1
+oid sha256:489162c61bf22fed27ac6d11033cb270715cb83b4de4409569e7858c6c56b844
+size 966919

tokenizer_config.json CHANGED Viewed

@@ -29,13 +29,16 @@
     }
   },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "legacy": true,
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": null,
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",
   "use_default_system_prompt": false

     }
   },
   "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content }}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "legacy": true,
   "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
+  "split_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",
   "use_default_system_prompt": false

train_results.json ADDED Viewed

+{
+    "epoch": 2.999944821497545,
+    "total_flos": 3.6968537365004943e+18,
+    "train_loss": 3.121767396105489,
+    "train_runtime": 856366.0469,
+    "train_samples_per_second": 3.047,
+    "train_steps_per_second": 0.048
+}

trainer_log.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:45616b1f1573d940ede26d5d97433d9906cce9b27c89a1c73b999deb5426836b
+size 5176

training_loss.png ADDED Viewed