Upload 10 files

Browse files

Files changed (10) hide show

LMConfig.py +58 -0
README.md +27 -24
README_en.md +96 -52
config.json +31 -0
generation_config.json +4 -0
model.py +420 -0
pytorch_model.bin +3 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +44 -0

LMConfig.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from transformers import PretrainedConfig
+from typing import List
+class LMConfig(PretrainedConfig):  # configuration object holding the model hyperparameters below
+    model_type = "minimind"  # model-type identifier string for this configuration class
+    def __init__(
+            self,
+            dim: int = 512,
+            n_layers: int = 8,
+            n_heads: int = 16,
+            n_kv_heads: int = 8,
+            vocab_size: int = 6400,
+            hidden_dim: int = None,  # NOTE(review): default is None although the annotation is int
+            multiple_of: int = 64,
+            norm_eps: float = 1e-5,
+            max_seq_len: int = 512,
+            dropout: float = 0.0,
+            flash_attn: bool = True,
+            ####################################################
+            # Mixture-of-Experts (MoE) specific settings.
+            # They are only meaningful when use_moe is True.
+            ####################################################
+            use_moe: bool = True,
+            num_experts_per_tok=2,
+            n_routed_experts=4,
+            n_shared_experts: bool = True,  # NOTE(review): annotated bool although the name reads like a count - confirm
+            scoring_func='softmax',
+            aux_loss_alpha=0.01,
+            seq_aux=True,
+            norm_topk_prob=True,
+            **kwargs,
+    ):
+        self.dim = dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+        self.vocab_size = vocab_size
+        self.hidden_dim = hidden_dim
+        self.multiple_of = multiple_of
+        self.norm_eps = norm_eps
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+        self.flash_attn = flash_attn
+        ####################################################
+        # Mixture-of-Experts (MoE) specific settings.
+        # They are only meaningful when use_moe is True.
+        ####################################################
+        self.use_moe = use_moe
+        self.num_experts_per_tok = num_experts_per_tok  # number of experts selected for each token
+        self.n_routed_experts = n_routed_experts  # total number of experts
+        self.n_shared_experts = n_shared_experts  # shared experts
+        self.scoring_func = scoring_func  # scoring function, 'softmax' by default
+        self.aux_loss_alpha = aux_loss_alpha  # alpha parameter of the auxiliary loss
+        self.seq_aux = seq_aux  # whether to compute the auxiliary loss at the sequence level
+        self.norm_topk_prob = norm_topk_prob  # whether to normalize the top-k probabilities
+        super().__init__(**kwargs)  # forward any remaining keyword arguments to PretrainedConfig

README.md CHANGED Viewed

@@ -57,7 +57,7 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055
 | 模型 (大小)                 | tokenizer长度 | 推理占用   | release    | 主观评分（/100） |
 |-------------------------|-------------|--------|------------|------------|
-| minimind-v1-small (26M)  | 6400        | 0.5 GB | 2024.08.28 | 50'        |
 | minimind-v1-moe (4×26M) | 6400        | 1.0 GB | 2024.09.17 | 55'        |
 | minimind-v1 (108M)      | 6400        | 1.0 GB | 2024.09.01 | 60'        |
@@ -320,12 +320,11 @@ MiniMind的整体结构一致，只是在RoPE计算、推理函数和FFN层的
 修改模型配置见[./model/LMConfig.py](./model/LMConfig.py)。
 minimind目前训练的模型版本见下表：
-| Model Name       | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
-|------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
 | minimind-v1-small | 26M    | 6400      | 8        | 512     | 8        | 16      | -           | -    |
-| minimind-v1-moe  | 4×26M  | 6400      | 8        | 512     | 8        | 16      | 2+4         | 2    |
-| minimind-v1      | 108M   | 6400      | 16       | 768     | 8        | 16      | -           | -    |
 # 📌 Experiment
@@ -336,11 +335,11 @@ CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
 环境：python 3.9 + Torch 2.1.2 + DDP多卡训练
 ```
-| Model Name       | params | len_vocab | batch_size | pretrain_time     | sft_single_time   | sft_multi_time      |
-|------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
 | minimind-v1-small | 26M    | 6400      | 64         | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
-| minimind-v1-moe  | 4×26M  | 6400      | 40         | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch)   |
-| minimind-v1      | 108M   | 6400      | 16         | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch)   |
 ---
@@ -382,6 +381,7 @@ CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
     ```bash
     python 5-dpo_train.py
     ```
 ---
 📋关于LLM的参数配置，有一篇很有意思的论文[MobileLLM](https://arxiv.org/pdf/2402.14905)做了详细的研究和实验。
@@ -410,16 +410,18 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 ![gpt3_config.png](./images/gpt3_config.png)
 ---
 ### 训练完成的模型权重
-| Model Name        | params | Config                      | pretrain_model | single_sft_model                                               | multi_sft_model                                                |
-|-------------------|--------|-----------------------------|----------------|----------------------------------------------------------------|----------------------------------------------------------------|
-| minimind-v1-small | 26M    | d_model=512<br/>n_layers=8  | -              | [链接](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [链接](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
-| minimind-v1-moe   | 4×26M  | d_model=512<br/>n_layers=8  | -              | -                                                              | -                                                              |
-| minimind-v1       | 108M   | d_model=768<br/>n_layers=16 | -              | [链接](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [链接](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
----
 # 📌 Eval
@@ -427,7 +429,7 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 > 以下测试于2024-09-17完成，此日期后发布的新模型，无特殊需要时将不加入测试。
 [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
-[B] minimind-v1-moe(0.1B)<br/>
 [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
 [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
 [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
@@ -435,7 +437,7 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 ```text
 [Q]: 你知道长江吗？
 [A]: 是的，长江是中国最长的河流，全长超过3300千米。
-[B]: 是的，长江是中国最长的河流，全长约7100公里。它的流域面积约为3328万平方公里，是中国最大的河流之一，也是世界第三大河流，总长度约为6300千米。
 [C]: 是的，长江是中国最长的河流之一，也是世界第三大河流。长江发源于青藏高原，流经11个省份，最终注入东海。长江流域是中国最大的河流流域之一，拥有着丰富的自然资源和文化遗产，如长江大桥、长江海洋博物馆、长江古城等。
 [D]: 同时也知道，中国是世界上最长的城市之一，拥有超过3400万人口。
 [E]: 是的，长江是中国最长的河流，也是世界上最长的河流之一。它发源于青藏高原，流经九个省区，最终注入东海。长江流域的气候温和，四季分明，是中国重要的农业和工业基地。长江流域的水产资源丰富，是中国的重要渔业资源之一。
@@ -443,7 +445,7 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 [Q]: 第一颗人造卫星是哪个国家发射的？
 [A]: 第一颗人造卫星是斯普特尼克1号。
-[B]: 第一颗人造卫星是美国发射的。
 [C]: 第一颗人造卫星是苏联发射的，它是由苏联于1957年10月发射的。
 [D]: 这颗名为“阿姆斯特朗”的卫星，于1961年11月20日发射，是第一个成功将人类送入太空的卫星。
 [E]: 第一颗人造卫星是苏联于1957年4月4日发射的，它是由苏联宇航员尤里·加加林（Yuri Gagarin）和苏联工程师罗尔德·谢尔盖耶夫（Roald Skohley）共同发射的。在发射过程中，加加林和谢尔盖耶夫使用了一个名为“斯普特尼克1号”的小型卫星，它的直径为4.3米，重量为1,051公斤。这枚卫星被称为“斯普特尼克1号”，是苏联太空竞赛中的一个里程碑，也是人类历史上第一次在太空中进行太空行走。
@@ -451,15 +453,16 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 [Q]: 你知道泰山在哪里吗？
 [A]: 泰山位于中国东部地区，是中国五岳之一。
-[B]: 泰山位于山东省泰安市，是中国五岳之一。
 [C]: 泰山位于中国山东省泰安市，是中国五岳之一。
 [D]: 作为一个人工智能助手，我无法访问互联网，因此无法提供关于你所在位置的信息。不过，你可以通过搜索或搜索相关关键词来获取有关你所在位置的信息，或者查看相关新闻或网站来获取有关你所在位置的信息。
 [E]: 泰山位于中国山东省泰安市境内，是中国五岳之一，也是中国文化和历史的重要象征之一。它是中国五岳之一，也是中国道教和佛教的发源地之一。泰山的海拔高度为1,545米，是中国五岳之一，也是中国最著名的旅游景点之一。它的历史可以追溯到公元前5世纪，是中国古代帝王祭祀的场所。泰山也是中国文化的重要组成部分，被誉为“五岳之首”，是中国文化的重要遗产之一。
 [Q]: 世界上最高的山峰是什么？
 [A]: 世界上最高的山峰是珠穆朗玛峰，位于尼泊尔和中国的边境上，海拔8848米。
-[B]: 珠穆朗玛峰是世界上最高的山峰，位于喜马拉雅山脉，海拔8848米。
 [C]: 珠穆朗玛峰是世界上最高的山峰，海拔8848米。
 [D]: 世界上最高的山峰是珠穆朗玛峰，位于喜马拉雅山脉，海拔8,848.86米（29,031.69英尺）。
 [E]: 世界上最高的山峰是珠穆朗玛峰，也称为萨加玛塔，位于喜马拉雅山脉的尼泊尔和中国边境之间，海拔8,848.86米（29,031.69英尺）。它是世界上最著名的登山目标之一，吸引着来自世界各地的登山者前来挑战。珠穆朗玛峰的名字来自藏语，意思是“女神的额峰”，它是由两个不同的山峰组成的，其中一个是位于尼泊尔的珠穆朗玛峰，另一个则位于中国。它的高度是通过测量海拔高度来确定的，因为它是通过测量地球的平均高度来确定的。尽管珠穆朗玛峰是世界上最高的山峰，但它并不是地球上任何其他地方都能到达的。地球上的其他地方，如南美洲的安第斯山脉，也是世界上最高的山脉，但它并不是地球上最高的山峰。总之，珠穆朗玛峰是世界上最高的山峰，但它并不是地球上任何地方都能到达的。它是地球上最高的山脉，但它并不是地球上任何地方都能到达的。
@@ -467,7 +470,7 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 [Q]: 万有引力是谁提出的？
 [A]: 万有引力是由英国物理学家牛顿提出的。他认为，所有物体都会产生引力，这个引力是由物体的质量和距离决定的。这个理论揭示了质量和距离之间的关系，并证明了万有引力的本质。
-[B]: 万有引力是由牛顿发现的，它是由英国物理学家詹姆斯·瓦特于1879年提出的。
 [C]: 万有引力是英国物理学家艾萨克·牛顿在17世纪提出的。
 [D]: 引力是宇宙中一种力量，它是由质量或能量所组成的。根据牛顿的万有引力定律，一个物体之间的引力大小与它的质量成正比，与它的质量成反比。因此，牛顿的万有引力定律表明，一个物体之间的引力大小与它的质量成正比，与它的质量成反比。这个定律在物理学、天文学和物理学中都有广泛的应用。
 [E]: 万有引力是由英国物理学家艾萨克·牛顿于1687年提出的。他在1687年发表了一篇著名的论文，题为《自然哲学的数学原理》，其中提出了万有引力定律，指出任何两个物体之间都存在着一种引力，这种引力的大小与它们的质量成正比，与它们之间的距离的平方成反比。这个定律被称为万有引力定律，它描述了质量和距离之间的关系。它表明，两个物体之间的引力与它们的质量成正比，与它们之间的距离的平方成反比。这个定律对于解释宇宙中的许多现象和现象之间的相互作用非常重要，例如行星绕太阳运动、恒星之间的引力透镜效应等等。它也被广泛应用于天文学、物理学和工程学等领域，为我们理解宇宙的本质和演化提供了重要的理论基础。
@@ -475,7 +478,7 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 [Q]: 你知道大熊猫的主要食物是什么吗？
 [A]: 大熊猫的主要食物是竹子。它们通常吃竹子，每天需要吃掉大约30公斤的竹子。
-[B]: 大熊猫主要是竹子，它们的主要食物是竹子。大熊猫的消化系统比较特殊，它们每天需要消化大量的竹子才能维持其生存需要。除了竹子外，大熊猫还在野外生存。
 [C]: 大熊猫主要以竹子为食，也会偶尔食用其他的植物和动物。此外，大熊猫也会吃一些野果、小动物和植物，但这些都不属于它们的主要食物来源。
 [D]: 大熊猫的主要食物是竹子。
 [E]: 大熊猫是一种以竹子为主要食物的植食动物，它们的肠道结构适合消化纤维素含量高的竹子，但也会吃一些其他植物，如草本植物和小型哺乳动物。大熊猫的肠道结构也适应了高纤维的竹子，这使得它们能够消化大量的纤维素，从而帮助它们在野外生存。大熊猫的肠道中还含有一种叫做“纤维素酶”的酶类物质，这种酶能够帮助大熊猫分解纤维素，从而帮助它们消化大量的水分和营养物质。此外，大熊猫的肠道还含有一种叫做“纤维素蛋白”的蛋白质，这种蛋白质能够帮助大熊猫维持体内的能量水平，从而帮助它们在野外生存。总的来说，大熊猫的主要食物是竹子，但它们也会吃一些其他植物，如草本植物和小型哺乳动物。
@@ -483,7 +486,7 @@ MobileLLM提出架构的深度比宽度更重要，「深而窄」的「瘦长
 [Q]: 海水为什么是咸的？
 [A]: 海水中含有许多盐分，这些盐分可以导致海水咸度的增加。海水中的盐分主要来自于海水中的盐分，这些盐分可以从海水中的盐分中获得。海水中的氧化物和盐分可以与海水中的盐分形成氢键，从而使海水咸度增加。
-[B]: 海水中含有丰富的钙、磷、钾和磷等元素，这些元素在海洋中的数量和分布非常有限。此外，海水中还含有各种生物和微生物的营养物质，如贝壳和海藻等。
 [C]: 海水含有许多溶解的盐类，其中最主要的是氯化钠。当水中加入盐时，盐的分子会与水分子发生反应，形成氢氧化钠，这是一种盐。
 [D]: 因为水是许多生物的生存所必需的，它能够吸收和释放二氧化碳，释放氧气，维持大气中的水蒸气，以及维持水循环。水中的盐类和化学物质会释放出氧气，使地球上的生物呼吸。此外，水还具有许多其他功能，如保持水循环、维持水循环、调节气候和环境等。水是许多生物生存所必需的，它也是许多自然现象的重要基础。
 [E]: 海水是咸的主要原因是因为其中含有大量的溶解盐类，其中最主要的是氯化钠（NaCl）。氯化钠是一种非常常见的盐类，它存在于地球上的大部分地区，包括海洋、湖泊、河流和地下水。当水从海洋中蒸发时，它会带走一部分盐分，使海水变得咸。此外，海中还含有其他溶解盐类的物质，如硫酸盐、硫酸镁、硫酸钙、硫酸镁盐、硫酸镁钾、硫酸镁钠、硫酸镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钠、硫酸镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁镁钙、硫酸镁镁镁镁

 | 模型 (大小)                 | tokenizer长度 | 推理占用   | release    | 主观评分（/100） |
 |-------------------------|-------------|--------|------------|------------|
+| minimind-v1-small (26M) | 6400        | 0.5 GB | 2024.08.28 | 50'        |
 | minimind-v1-moe (4×26M) | 6400        | 1.0 GB | 2024.09.17 | 55'        |
 | minimind-v1 (108M)      | 6400        | 1.0 GB | 2024.09.01 | 60'        |
 修改模型配置见[./model/LMConfig.py](./model/LMConfig.py)。
 minimind目前训练的模型版本见下表：
+| Model Name        | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
+|-------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
 | minimind-v1-small | 26M    | 6400      | 8        | 512     | 8        | 16      | -           | -    |
+| minimind-v1-moe   | 4×26M  | 6400      | 8        | 512     | 8        | 16      | 2+4         | 2    |
+| minimind-v1       | 108M   | 6400      | 16       | 768     | 8        | 16      | -           | -    |
 # 📌 Experiment
 环境：python 3.9 + Torch 2.1.2 + DDP多卡训练
 ```
+| Model Name        | params | len_vocab | batch_size | pretrain_time     | sft_single_time   | sft_multi_time      |
+|-------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
 | minimind-v1-small | 26M    | 6400      | 64         | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
+| minimind-v1-moe   | 4×26M  | 6400      | 40         | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch)   |
+| minimind-v1       | 108M   | 6400      | 16         | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch)   |
 ---
     ```bash
     python 5-dpo_train.py
     ```
 ---
 📋关于LLM的参数配置，有一篇很有意思的论文[MobileLLM](https://arxiv.org/pdf/2402.14905)做了详细的研究和实验。
 ![gpt3_config.png](./images/gpt3_config.png)
 ---
 ### 训练完成的模型权重
+[百度网盘](https://pan.baidu.com/s/1KUfSzEkSXYbCCBj0Pw-9fA?pwd=6666)
+| Model Name        | params | Config                      | pretrain_model                                                 | single_sft_model                                               | multi_sft_model                                                |
+|-------------------|--------|-----------------------------|----------------------------------------------------------------|----------------------------------------------------------------|----------------------------------------------------------------|
+| minimind-v1-small | 26M    | d_model=512<br/>n_layers=8  | [链接](https://pan.baidu.com/s/1wP_cAIc8cgaJ6CxUmR9ECQ?pwd=6666) | [链接](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [链接](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
+| minimind-v1-moe   | 4×26M  | d_model=512<br/>n_layers=8  | [链接](https://pan.baidu.com/s/1IZdkzPRhbZ_bSsRL8vInjg?pwd=6666) | [链接](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666) | [链接](https://pan.baidu.com/s/1GHJ2T4904EcT1u8l1rVqtg?pwd=6666) |
+| minimind-v1       | 108M   | d_model=768<br/>n_layers=16 | -                                                              | [链接](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [链接](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
+---
 # 📌 Eval
 > 以下测试于2024-09-17完成，此日期后发布的新模型，无特殊需要时将不加入测试。
 [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
+[B] [minimind-v1-moe(0.1B)](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666)<br/>
 [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
 [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
 [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
 ```text
 [Q]: 你知道长江吗？
 [A]: 是的，长江是中国最长的河流，全长超过3300千米。
+[B]: 长江是中国最长的河流，总长度约为6300公里。它发源于青藏高原的青海湖，流经青海、四川、甘肃、宁夏、内蒙古、陕西和山西。长江流域是中国最大的农业区之一，也是中国的文化和经济中心。
 [C]: 是的，长江是中国最长的河流之一，也是世界第三大河流。长江发源于青藏高原，流经11个省份，最终注入东海。长江流域是中国最大的河流流域之一，拥有着丰富的自然资源和文化遗产，如长江大桥、长江海洋博物馆、长江古城等。
 [D]: 同时也知道，中国是世界上最长的城市之一，拥有超过3400万人口。
 [E]: 是的，长江是中国最长的河流，也是世界上最长的河流之一。它发源于青藏高原，流经九个省区，最终注入东海。长江流域的气候温和，四季分明，是中国重要的农业和工业基地。长江流域的水产资源丰富，是中国的重要渔业资源之一。
 [Q]: 第一颗人造卫星是哪个国家发射的？
 [A]: 第一颗人造卫星是斯普特尼克1号。
+[B]: 第一颗人造卫星是俄罗斯的国际卫星，于1957年发射。
 [C]: 第一颗人造卫星是苏联发射的，它是由苏联于1957年10月发射的。
 [D]: 这颗名为“阿姆斯特朗”的卫星，于1961年11月20日发射，是第一个成功将人类送入太空的卫星。
 [E]: 第一颗人造卫星是苏联于1957年4月4日发射的，它是由苏联宇航员尤里·加加林（Yuri Gagarin）和苏联工程师罗尔德·谢尔盖耶夫（Roald Skohley）共同发射的。在发射过程中，加加林和谢尔盖耶夫使用了一个名为“斯普特尼克1号”的小型卫星，它的直径为4.3米，重量为1,051公斤。这枚卫星被称为“斯普特尼克1号”，是苏联太空竞赛中的一个里程碑，也是人类历史上第一次在太空中进行太空行走。
 [Q]: 你知道泰山在哪里吗？
 [A]: 泰山位于中国东部地区，是中国五岳之一。
+[B]: 泰山位于山东省泰安市泰安市。泰安市是中国五岳之一，也是世界文化遗产和中国历史文化名山之一。泰安市有许多著名的景点，如五岳之一、玉皇顶、南天门、玉皇阁等。
 [C]: 泰山位于中国山东省泰安市，是中国五岳之一。
 [D]: 作为一个人工智能助手，我无法访问互联网，因此无法提供关于你所在位置的信息。不过，你可以通过搜索或搜索相关关键词来获取有关你所在位置的信息，或者查看相关新闻或网站来获取有关你所在位置的信息。
 [E]: 泰山位于中国山东省泰安市境内，是中国五岳之一，也是中国文化和历史的重要象征之一。它是中国五岳之一，也是中国道教和佛教的发源地之一。泰山的海拔高度为1,545米，是中国五岳之一，也是中国最著名的旅游景点之一。它的历史可以追溯到公元前5世纪，是中国古代帝王祭祀的场所。泰山也是中国文化的重要组成部分，被誉为“五岳之首”，是中国文化的重要遗产之一。
+[F]: 泰山位于中国山东省中部，是中国的五岳之一，也是世界文化和自然遗产。它不仅以其雄伟的自然景观闻名，还承载着丰富的历史文化，是中国古代帝王封禅的重要场所。泰山是中华民族的象征之一，体现了中国人民对自然和历史的尊重与传承。
 [Q]: 世界上最高的山峰是什么？
 [A]: 世界上最高的山峰是珠穆朗玛峰，位于尼泊尔和中国的边境上，海拔8848米。
+[B]: 世界上最高的山峰是珠穆朗玛峰，位于喜马拉雅山脉，海拔8,848米。
 [C]: 珠穆朗玛峰是世界上最高的山峰，海拔8848米。
 [D]: 世界上最高的山峰是珠穆朗玛峰，位于喜马拉雅山脉，海拔8,848.86米（29,031.69英尺）。
 [E]: 世界上最高的山峰是珠穆朗玛峰，也称为萨加玛塔，位于喜马拉雅山脉的尼泊尔和中国边境之间，海拔8,848.86米（29,031.69英尺）。它是世界上最著名的登山目标之一，吸引着来自世界各地的登山者前来挑战。珠穆朗玛峰的名字来自藏语，意思是“女神的额峰”，它是由两个不同的山峰组成的，其中一个是位于尼泊尔的珠穆朗玛峰，另一个则位于中国。它的高度是通过测量海拔高度来确定的，因为它是通过测量地球的平均高度来确定的。尽管珠穆朗玛峰是世界上最高的山峰，但它并不是地球上任何其他地方都能到达的。地球上的其他地方，如南美洲的安第斯山脉，也是世界上最高的山脉，但它并不是地球上最高的山峰。总之，珠穆朗玛峰是世界上最高的山峰，但它并不是地球上任何地方都能到达的。它是地球上最高的山脉，但它并不是地球上任何地方都能到达的。
 [Q]: 万有引力是谁提出的？
 [A]: 万有引力是由英国物理学家牛顿提出的。他认为，所有物体都会产生引力，这个引力是由物体的质量和距离决定的。这个理论揭示了质量和距离之间的关系，并证明了万有引力的本质。
+[B]: 万有引力是由牛顿在1687年提出的。它是宇宙中一个基本的物理现象，它描述了物体之间的相互作用。这个力被称为万有引力，它在所有物体之间都存在，这种力与物体的质量和距离有关。万有引力是自然界中最基本的力量之一，它不仅影响着物体的运动和变化，还对宇宙的结构和演化产生着重要的影响。
 [C]: 万有引力是英国物理学家艾萨克·牛顿在17世纪提出的。
 [D]: 引力是宇宙中一种力量，它是由质量或能量所组成的。根据牛顿的万有引力定律，一个物体之间的引力大小与它的质量成正比，与它的质量成反比。因此，牛顿的万有引力定律表明，一个物体之间的引力大小与它的质量成正比，与它的质量成反比。这个定律在物理学、天文学和物理学中都有广泛的应用。
 [E]: 万有引力是由英国物理学家艾萨克·牛顿于1687年提出的。他在1687年发表了一篇著名的论文，题为《自然哲学的数学原理》，其中提出了万有引力定律，指出任何两个物体之间都存在着一种引力，这种引力的大小与它们的质量成正比，与它们之间的距离的平方成反比。这个定律被称为万有引力定律，它描述了质量和距离之间的关系。它表明，两个物体之间的引力与它们的质量成正比，与它们之间的距离的平方成反比。这个定律对于解释宇宙中的许多现象和现象之间的相互作用非常重要，例如行星绕太阳运动、恒星之间的引力透镜效应等等。它也被广泛应用于天文学、物理学和工程学等领域，为我们理解宇宙的本质和演化提供了重要的理论基础。
 [Q]: 你知道大熊猫的主要食物是什么吗？
 [A]: 大熊猫的主要食物是竹子。它们通常吃竹子，每天需要吃掉大约30公斤的竹子。
+[B]: 是的，大熊猫主要以竹子为食。它们每天需要吃掉大量的竹子来维持生命，每天需要吃掉40至80公斤的竹子。此外，它们也会吃一些其他的植物，如草、果实、坚果和小型哺乳动物。
 [C]: 大熊猫主要以竹子为食，也会偶尔食用其他的植物和动物。此外，大熊猫也会吃一些野果、小动物和植物，但这些都不属于它们的主要食物来源。
 [D]: 大熊猫的主要食物是竹子。
 [E]: 大熊猫是一种以竹子为主要食物的植食动物，它们的肠道结构适合消化纤维素含量高的竹子，但也会吃一些其他植物，如草本植物和小型哺乳动物。大熊猫的肠道结构也适应了高纤维的竹子，这使得它们能够消化大量的纤维素，从而帮助它们在野外生存。大熊猫的肠道中还含有一种叫做“纤维素酶”的酶类物质，这种酶能够帮助大熊猫分解纤维素，从而帮助它们消化大量的水分和营养物质。此外，大熊猫的肠道还含有一种叫做“纤维素蛋白”的蛋白质，这种蛋白质能够帮助大熊猫维持体内的能量水平，从而帮助它们在野外生存。总的来说，大熊猫的主要食物是竹子，但它们也会吃一些其他植物，如草本植物和小型哺乳动物。
 [Q]: 海水为什么是咸的？
 [A]: 海水中含有许多盐分，这些盐分可以导致海水咸度的增加。海水中的盐分主要来自于海水中的盐分，这些盐分可以从海水中的盐分中获得。海水中的氧化物和盐分可以与海水中的盐分形成氢键，从而使海水咸度增加。
+[B]: 海水是咸的，这是因为它含有大量的盐分，其中包括氯离子和氯离子。这些离子在海洋中形成了大量的盐，这些盐分被称为盐分。海水中含有许多溶解在水中的盐，这些盐分会随着海水的升高而逐渐蒸发，最终导致海水中盐分的蒸发。这些盐分在海水中被完全消耗，然后被排除在海水中。此外，海水中还含有许多其他的盐分，但这些盐分是不同的，因为它们在水中的浮起和下降过程是不同的。海水中的盐分是从海底的岩石中提取出来的，这些盐分在海水中可以被利用。总的来说，海水是咸的，这也是为什么海水中含有许多盐分和盐分的原因。
 [C]: 海水含有许多溶解的盐类，其中最主要的是氯化钠。当水中加入盐时，盐的分子会与水分子发生反应，形成氢氧化钠，这是一种盐。
 [D]: 因为水是许多生物的生存所必需的，它能够吸收和释放二氧化碳，释放氧气，维持大气中的水蒸气，以及维持水循环。水中的盐类和化学物质会释放出氧气，使地球上的生物呼吸。此外，水还具有许多其他功能，如保持水循环、维持水循环、调节气候和环境等。水是许多生物生存所必需的，它也是许多自然现象的重要基础。
 [E]: 海水是咸的主要原因是因为其中含有大量的溶解盐类，其中最主要的是氯化钠（NaCl）。氯化钠是一种非常常见的盐类，它存在于地球上的大部分地区，包括海洋、湖泊、河流和地下水。当水从海洋中蒸发时，它会带走一部分盐分，使海水变得咸。此外，海中还含有其他溶解盐类的物质，如硫酸盐、硫酸镁、硫酸钙、硫酸镁盐、硫酸镁钾、硫酸镁钠、硫酸镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钠、硫酸镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁镁钙、硫酸镁镁镁镁

README_en.md CHANGED Viewed

@@ -59,13 +59,14 @@ Therefore, the goal of this project is to lower the barrier to entry for working
 training an extremely lightweight language model from scratch.
 > [!CAUTION]
-> As of 2024-09-17, MiniMind has trained three model versions, with the smallest model requiring only 26M (0.02B) parameters to achieve smooth conversational abilities!
-| Model (Size)                  | Tokenizer Length | Inference Memory Usage | Release Date | Subjective Rating (/100) |
-|-------------------------------|------------------|------------------------|--------------|--------------------------|
-| minimind-v1-small (26M)        | 6400             | 0.5 GB                 | 2024.08.28   | 50'                      |
-| minimind-v1-moe (4×26M)       | 6400             | 1.0 GB                 | 2024.09.17   | 55'                      |
-| MiniMind-V1 (108M)            | 6400             | 1.0 GB                 | 2024.09.01   | 60'                      |
 > This analysis was run on an RTX 3090 GPU with Torch 2.1.2, CUDA 12.2, and Flash Attention 2.
@@ -84,18 +85,21 @@ The project includes:
 We hope this open-source project helps LLM beginners get started quickly!
 ### 👉**Recent Updates**
 <details close>
 <summary> <b>2024-09-17 (new🎉)</b> </summary>
 - Updated the minimind-v1-moe model
-- To prevent ambiguity, all mistral_tokenizer versions have been removed, and a custom minimind_tokenizer is now used as the tokenizer.
 </details>
 <details close>
 <summary> <b>2024-09-01</b> </summary>
-- Updated the MiniMind-V1 (108M) model, using minimind_tokenizer with 3 pre-training epochs and 10 SFT epochs for more thorough training and improved performance.
 - The project has been deployed to ModelScope's Creative Space and can be experienced on the website:
@@ -167,6 +171,7 @@ The project has been deployed to ModelScope makerspace, where you can experience
 *
     0. Install the required dependencies
 ```bash
   pip install -r requirements.txt
 ```
@@ -196,7 +201,8 @@ git clone https://github.com/jingyaogong/minimind.git
     3. Test model inference performance
     * Ensure that the required trained parameter weights are located in the `./out/` directory.
-    * You can also directly download and use the trained model weights from [Trained Model Weights](#Trained Model Weights).
        ```text
       out
       ├── multi_chat
@@ -261,10 +267,16 @@ git clone https://github.com/jingyaogong/minimind.git
     </table>
   > [!IMPORTANT]
-  > Update on 2024-09-17: To avoid ambiguity from previous versions and control the model size, all Minimind models now use the Minimind_tokenizer for tokenization, and all versions of the Mistral_tokenizer have been deprecated.
-  > Although the Minimind_tokenizer has a small length and its encoding/decoding efficiency is weaker compared to Chinese-friendly tokenizers like Qwen2 and GLM, the Minimind models have opted for their custom-trained Minimind_tokenizer to maintain a lightweight parameter structure and prevent an imbalance between encoding and computation layers. This is because the Minimind vocabulary size is only 6,400.
-  > Moreover, Minimind has not encountered any issues with decoding rare words in practical tests, and the performance has been satisfactory. Due to the custom vocabulary being compressed to 6,400 tokens, the total parameter size of the LLM is minimized to only 26M.
 ---
@@ -346,12 +358,11 @@ and FFN layer code. The structure is illustrated in the figure below (redrawn):
 Model configurations can be found in [./model/LMConfig.py](./model/LMConfig.py). The model types and parameters are
 shown in the table below:
-| Model Name       | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
-|------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
 | minimind-v1-small | 26M    | 6400      | 8        | 512     | 8        | 16      | -           | -    |
-| minimind-v1-moe  | 4×26M  | 6400      | 8        | 512     | 8        | 16      | 2+4         | 2    |
-| minimind-v1      | 108M   | 6400      | 16       | 768     | 8        | 16      | -           | -    |
 # 📌 Experiment
@@ -362,11 +373,11 @@ GPU: NVIDIA GeForce RTX 3090 (24GB) * 2
 Environment: python 3.9 + Torch 2.1.2 + DDP multi-GPU training
 ```
-| Model Name       | params | len_vocab | batch_size | pretrain_time     | sft_single_time   | sft_multi_time      |
-|------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
 | minimind-v1-small | 26M    | 6400      | 64         | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
-| minimind-v1-moe  | 4×26M  | 6400      | 40         | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch)   |
-| minimind-v1      | 108M   | 6400      | 16         | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch)   |
 ---
@@ -428,43 +439,60 @@ Environment: python 3.9 + Torch 2.1.2 + DDP multi-GPU training
     ```bash
     python 5-dpo_train.py
     ```
 ---
-📋 Regarding LLM parameter configuration, an interesting paper [MobileLLM](https://arxiv.org/pdf/2402.14905) provides detailed research and experiments.
-The scaling law exhibits unique patterns in small models. The parameters that significantly influence the scaling of Transformer models are primarily `d_model` and `n_layers`.
 * `d_model`↑ + `n_layers`↓ -> Short and wide models
 * `d_model`↓ + `n_layers`↑ -> Tall and narrow models
-The Scaling Law proposed in 2020 posits that the amount of training data, parameter count, and training iterations are the key factors determining performance, with the influence of model architecture being nearly negligible. However, this law seems not to fully apply to small models.
-MobileLLM suggests that the depth of the architecture is more important than its width. A "deep and narrow" model can learn more abstract concepts compared to a "wide and shallow" model. For instance, when the model parameters are fixed at 125M or 350M, a 30–42 layer "narrow" model significantly outperforms a 12-layer "short and wide" model. This trend is observed across eight benchmark tests, including common sense reasoning, question answering, and reading comprehension.
-This is a fascinating discovery, as previously, few attempts were made to stack more than 12 layers when designing architectures for small models around the 100M parameter range. This aligns with the observations from MiniMind, where adjusting parameters between `d_model` and `n_layers` during training produced similar effects.
-However, "deep and narrow" has its limitations. When `d_model` < 512, the disadvantages of collapsing word embedding dimensions become very pronounced, and increasing layers does not compensate for the shortcomings in `d_head` caused by fixed `q_head`. Conversely, when `d_model` > 1536, increasing layers seems to have a higher priority than `d_model`, providing a better "cost-performance" ratio and effect gain.
-Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model to achieve a balance between "minimal size <-> better performance." For greater performance gains, `d_model = 768` and `n_layers = 16` are set, aligning better with the scaling law for small models.
 > For reference, the configuration details for GPT-3 are shown in the table below:
 ![gpt3_config.png](./images/gpt3_config.png)
 ---
 ### Trained Model Weights
-| Model Name        | params | Config                      | pretrain_model | single_sft_model                                                | multi_sft_model                                                |
-|-------------------|--------|-----------------------------|----------------|-----------------------------------------------------------------|----------------------------------------------------------------|
-| minimind-v1-small | 26M    | d_model=512<br/>n_layers=8  | -              | [URL](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [URL](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
-| minimind-v1-moe   | 4×26M  | d_model=512<br/>n_layers=8  | -              | -                                                               | -                                                              |
-| minimind-v1       | 108M   | d_model=768<br/>n_layers=16 | -              | [URL](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)  | [URL](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
 ---
 # 📌 Eval
 > [!TIP]
-> The following tests were completed on September 17, 2024. New models released after this date will not be included in the tests unless there is a special need.
 [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
-[B] minimind-v1-moe(0.1B)<br/>
 [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
 [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
 [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
@@ -523,26 +551,33 @@ Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model
 > 🙋‍♂️ Pass the answers of the models above directly to GPT-4o and ask it to score them:
 ---
 ### Model Performance Review:
 1. **Model A**:
-    - **Performance**: Model A's responses are usually concise and clear but lack detail and accuracy in some cases. For example, Model A provided incorrect information about the length of the Yangtze River.
     - **Score**: 60
 2. **Model B**:
-    - **Performance**: Model B provides additional information in some cases, but this information can sometimes be inaccurate or excessive. For instance, Model B gave incorrect figures for the length and drainage area of the Yangtze River.
     - **Score**: 65
 3. **Model C**:
-    - **Performance**: Model C typically provides detailed and accurate answers for most questions. For example, responses about the Yangtze River and Mount Tai were accurate.
     - **Score**: 75
 4. **Model D**:
-    - **Performance**: Model D’s responses sometimes appear disorganized and lack accuracy. For example, the answer about Mount Tai was completely off-topic.
     - **Score**: 50
 5. **Model E**:
-    - **Performance**: Model E’s responses are usually very detailed, but they can be overly verbose and contain unnecessary information. For instance, the answer on gravity was overly complex.
     - **Score**: 70
 #### Ranking (from highest to lowest):
@@ -555,13 +590,21 @@ Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model
 ## 👉 Summary of Effects
-* The ranking of the minimind series (ABC) is intuitive, with minimind-v1(0.1B) scoring the highest and providing mostly accurate answers to common knowledge questions.
     * Surprisingly, minimind-v1-small (0.02B) with only 26M parameters performs close to minimind-v1(0.1B).
-    * Despite having less than 2 epochs of training, minimind-v1(0.1B) performed the best. This suggests that a larger model often yields better performance, even with limited training.
-    * minimind-v1-moe (0.1B) performed poorly, likely because it was terminated early to free up resources for smaller models. MoE models require more training epochs, and with only 2 epochs, it was under-trained. Previous experiments with a fully trained MoE model on Yi tokenizer showed visible improvements. Future versions, v2 and v3, will be updated with better training.
-* Model E’s responses appear the most complete, despite some instances of hallucination and overly verbose content. However, GPT-4o and Deepseek's evaluations suggest it is "overly verbose and repetitive, with some hallucinations."
-  This strict evaluation might penalize models with some hallucinations heavily. Due to F models having longer default text lengths and much larger datasets, the quality of responses depends significantly on the data rather than the model size alone.
 > 🙋‍♂️ Personal Subjective Evaluation: E>C>B≈A>D
@@ -604,7 +647,6 @@ answering, so results should be considered as reference only.
 | minimind-v1-small | 	   344	 |      1346      |  25.56%  |
 | minimind-v1       | 	   351	 |      1346      |  26.08%  |
 ### Model Performance Insights from GPT-4o
 ```text
@@ -708,10 +750,12 @@ your model with third-party UIs, such as fastgpt, OpenWebUI, etc.
 </a>
 -->
-<a href="https://github.com/jingyaogong"><img src="https://avatars.githubusercontent.com/u/62287848" width="70px" height="70px"/></a>&nbsp;
-<a href="https://github.com/MuWinds"><img src="https://avatars.githubusercontent.com/u/93832089" width="70px" height="70px"/></a>&nbsp;
-<a href="https://github.com/chuanzhubin"><img src="https://avatars.githubusercontent.com/u/2813798" width="70px" height="70px"/></a>&nbsp;
 ## 😊Thanks for

 training an extremely lightweight language model from scratch.
 > [!CAUTION]
+> As of 2024-09-17, MiniMind has trained three model versions, with the smallest model requiring only 26M (0.02B)
+> parameters to achieve smooth conversational abilities!
+| Model (Size)            | Tokenizer Length | Inference Memory Usage | Release Date | Subjective Rating (/100) |
+|-------------------------|------------------|------------------------|--------------|--------------------------|
+| minimind-v1-small (26M) | 6400             | 0.5 GB                 | 2024.08.28   | 50'                      |
+| minimind-v1-moe (4×26M) | 6400             | 1.0 GB                 | 2024.09.17   | 55'                      |
+| MiniMind-V1 (108M)      | 6400             | 1.0 GB                 | 2024.09.01   | 60'                      |
 > This analysis was run on an RTX 3090 GPU with Torch 2.1.2, CUDA 12.2, and Flash Attention 2.
 We hope this open-source project helps LLM beginners get started quickly!
 ### 👉**Recent Updates**
 <details close>
 <summary> <b>2024-09-17 (new🎉)</b> </summary>
 - Updated the minimind-v1-moe model
+- To prevent ambiguity, all mistral_tokenizer versions have been removed, and a custom minimind_tokenizer is now used as
+  the tokenizer.
 </details>
 <details close>
 <summary> <b>2024-09-01</b> </summary>
+- Updated the MiniMind-V1 (108M) model, using minimind_tokenizer with 3 pre-training epochs and 10 SFT epochs for more
+  thorough training and improved performance.
 - The project has been deployed to ModelScope's Creative Space and can be experienced on the website:
 *
     0. Install the required dependencies
 ```bash
   pip install -r requirements.txt
 ```
     3. Test model inference performance
     * Ensure that the required trained parameter weights are located in the `./out/` directory.
+    * You can also directly download and use the trained model weights
+      from [Trained Model Weights](#trained-model-weights).
        ```text
       out
       ├── multi_chat
     </table>
   > [!IMPORTANT]
+  > Update on 2024-09-17: To avoid ambiguity from previous versions and control the model size, all Minimind models now
+  use the Minimind_tokenizer for tokenization, and all versions of the Mistral_tokenizer have been deprecated.
+  > Although the Minimind_tokenizer has a small vocabulary and its encoding/decoding efficiency is lower than that of
+  Chinese-friendly tokenizers like Qwen2 and GLM, the Minimind models use their own custom-trained
+  Minimind_tokenizer to keep the parameter count lightweight and to prevent an imbalance between the encoding and
+  computation layers; the Minimind vocabulary size is therefore only 6,400.
+  > Moreover, Minimind has not encountered any issues with decoding rare words in practical tests, and the performance
+  has been satisfactory. Due to the custom vocabulary being compressed to 6,400 tokens, the total parameter size of the
+  LLM is minimized to only 26M.
 ---
 Model configurations can be found in [./model/LMConfig.py](./model/LMConfig.py). The model types and parameters are
 shown in the table below:
+| Model Name        | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
+|-------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
 | minimind-v1-small | 26M    | 6400      | 8        | 512     | 8        | 16      | -           | -    |
+| minimind-v1-moe   | 4×26M  | 6400      | 8        | 512     | 8        | 16      | 2+4         | 2    |
+| minimind-v1       | 108M   | 6400      | 16       | 768     | 8        | 16      | -           | -    |
 # 📌 Experiment
 Environment: python 3.9 + Torch 2.1.2 + DDP multi-GPU training
 ```
+| Model Name        | params | len_vocab | batch_size | pretrain_time     | sft_single_time   | sft_multi_time      |
+|-------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
 | minimind-v1-small | 26M    | 6400      | 64         | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
+| minimind-v1-moe   | 4×26M  | 6400      | 40         | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch)   |
+| minimind-v1       | 108M   | 6400      | 16         | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch)   |
 ---
     ```bash
     python 5-dpo_train.py
     ```
 ---
+📋 Regarding LLM parameter configuration, an interesting paper [MobileLLM](https://arxiv.org/pdf/2402.14905) provides
+detailed research and experiments.
+The scaling law exhibits unique patterns in small models. The parameters that significantly influence the scaling of
+Transformer models are primarily `d_model` and `n_layers`.
 * `d_model`↑ + `n_layers`↓ -> Short and wide models
 * `d_model`↓ + `n_layers`↑ -> Tall and narrow models
+The Scaling Law proposed in 2020 posits that the amount of training data, parameter count, and training iterations are
+the key factors determining performance, with the influence of model architecture being nearly negligible. However, this
+law seems not to fully apply to small models.
+MobileLLM suggests that the depth of the architecture is more important than its width. A "deep and narrow" model can
+learn more abstract concepts compared to a "wide and shallow" model. For instance, when the model parameters are fixed
+at 125M or 350M, a 30–42 layer "narrow" model significantly outperforms a 12-layer "short and wide" model. This trend is
+observed across eight benchmark tests, including common sense reasoning, question answering, and reading comprehension.
+This is a fascinating discovery, as previously, few attempts were made to stack more than 12 layers when designing
+architectures for small models around the 100M parameter range. This aligns with the observations from MiniMind, where
+adjusting parameters between `d_model` and `n_layers` during training produced similar effects.
+However, "deep and narrow" has its limitations. When `d_model` < 512, the disadvantages of collapsing word embedding
+dimensions become very pronounced, and increasing layers does not compensate for the shortcomings in `d_head` caused by
+fixed `q_head`. Conversely, when `d_model` > 1536, increasing layers seems to have a higher priority than `d_model`,
+providing a better "cost-performance" ratio and effect gain.
+Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model to achieve a balance between "minimal
+size <-> better performance." For greater performance gains, `d_model = 768` and `n_layers = 16` are set, aligning
+better with the scaling law for small models.
 > For reference, the configuration details for GPT-3 are shown in the table below:
 ![gpt3_config.png](./images/gpt3_config.png)
 ---
 ### Trained Model Weights
+[baidu](https://pan.baidu.com/s/1KUfSzEkSXYbCCBj0Pw-9fA?pwd=6666)
+| Model Name        | params | Config                      | pretrain_model                                                  | single_sft_model                                               | multi_sft_model                                                |
+|-------------------|--------|-----------------------------|-----------------------------------------------------------------|----------------------------------------------------------------|----------------------------------------------------------------|
+| minimind-v1-small | 26M    | d_model=512<br/>n_layers=8  | [URL](https://pan.baidu.com/s/1wP_cAIc8cgaJ6CxUmR9ECQ?pwd=6666) | [URL](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [URL](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
+| minimind-v1-moe   | 4×26M  | d_model=512<br/>n_layers=8  | [URL](https://pan.baidu.com/s/1IZdkzPRhbZ_bSsRL8vInjg?pwd=6666)  | [URL](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666) | [URL](https://pan.baidu.com/s/1GHJ2T4904EcT1u8l1rVqtg?pwd=6666) |
+| minimind-v1       | 108M   | d_model=768<br/>n_layers=16 | -                                                               | [URL](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [URL](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
 ---
 # 📌 Eval
 > [!TIP]
+> The following tests were completed on September 17, 2024. New models released after this date will not be included in
+> the tests unless there is a special need.
 [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
+[B] [minimind-v1-moe(0.1B)](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666)<br/>
 [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
 [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
 [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
 > 🙋‍♂️The answers from the models above were passed directly to GPT-4o, which was asked to score them:
 ---
 ### Model Performance Review:
 1. **Model A**:
+    - **Performance**: Model A's responses are usually concise and clear but lack detail and accuracy in some cases. For
+      example, Model A provided incorrect information about the length of the Yangtze River.
     - **Score**: 60
 2. **Model B**:
+    - **Performance**: Model B provides additional information in some cases, but this information can sometimes be
+      inaccurate or excessive. For instance, Model B gave incorrect figures for the length and drainage area of the
+      Yangtze River.
     - **Score**: 65
 3. **Model C**:
+    - **Performance**: Model C typically provides detailed and accurate answers for most questions. For example,
+      responses about the Yangtze River and Mount Tai were accurate.
     - **Score**: 75
 4. **Model D**:
+    - **Performance**: Model D’s responses sometimes appear disorganized and lack accuracy. For example, the answer
+      about Mount Tai was completely off-topic.
     - **Score**: 50
 5. **Model E**:
+    - **Performance**: Model E’s responses are usually very detailed, but they can be overly verbose and contain
+      unnecessary information. For instance, the answer on gravity was overly complex.
     - **Score**: 70
 #### Ranking (from highest to lowest):
 ## 👉 Summary of Effects
+* The ranking of the minimind series (ABC) is intuitive, with minimind-v1(0.1B) scoring the highest and providing mostly
+  accurate answers to common knowledge questions.
     * Surprisingly, minimind-v1-small (0.02B) with only 26M parameters performs close to minimind-v1(0.1B).
+    * Despite having less than 2 epochs of training, minimind-v1(0.1B) performed the best. This suggests that a larger
+      model often yields better performance, even with limited training.
+    * minimind-v1-moe (0.1B) performed poorly, likely because it was terminated early to free up resources for smaller
+      models. MoE models require more training epochs, and with only 2 epochs, it was under-trained. Previous
+      experiments with a fully trained MoE model on Yi tokenizer showed visible improvements. Future versions, v2 and
+      v3, will be updated with better training.
+* Model E’s responses appear the most complete, despite some instances of hallucination and overly verbose content.
+  However, GPT-4o and Deepseek's evaluations suggest it is "overly verbose and repetitive, with some hallucinations."
+  This strict evaluation may heavily penalize models that hallucinate at all. Because Model E has a longer default
+  text length and a much larger training dataset, the quality of its responses depends significantly on the data
+  rather than on model size alone.
 > 🙋‍♂️ Personal Subjective Evaluation: E>C>B≈A>D
 | minimind-v1-small | 	   344	 |      1346      |  25.56%  |
 | minimind-v1       | 	   351	 |      1346      |  26.08%  |
 ### Model Performance Insights from GPT-4o
 ```text
 </a>
 -->
+<a href="https://github.com/jingyaogong"><img src="https://avatars.githubusercontent.com/u/62287848" width="70px" height="70px"/></a>
+&nbsp;
+<a href="https://github.com/MuWinds"><img src="https://avatars.githubusercontent.com/u/93832089" width="70px" height="70px"/></a>
+&nbsp;
+<a href="https://github.com/chuanzhubin"><img src="https://avatars.githubusercontent.com/u/2813798" width="70px" height="70px"/></a>
+&nbsp;
 ## 😊Thanks for

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "Transformer"
+  ],
+  "auto_map": {
+    "AutoConfig": "LMConfig.LMConfig",
+    "AutoModelForCausalLM": "model.Transformer"
+  },
+  "aux_loss_alpha": 0.01,
+  "dim": 512,
+  "dropout": 0.0,
+  "flash_attn": true,
+  "hidden_dim": null,
+  "max_seq_len": 512,
+  "model_type": "minimind",
+  "multiple_of": 64,
+  "n_heads": 16,
+  "n_kv_heads": 8,
+  "n_layers": 8,
+  "n_routed_experts": 4,
+  "n_shared_experts": true,
+  "norm_eps": 1e-05,
+  "norm_topk_prob": true,
+  "num_experts_per_tok": 2,
+  "scoring_func": "softmax",
+  "seq_aux": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.44.0",
+  "use_moe": true,
+  "vocab_size": 6400
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.44.0"
+}

model.py ADDED Viewed

	@@ -0,0 +1,420 @@

+import math
+import struct
+import inspect
+from .LMConfig import LMConfig
+from typing import Any, Optional, Tuple
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x):
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+def precompute_pos_cis(dim: int, end: int, theta: float = 10000.0):
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device)  # type: ignore
+    freqs = torch.outer(t, freqs).float()  # type: ignore
+    pos_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
+    return pos_cis
+def apply_rotary_emb(xq, xk, pos_cis):
+    def unite_shape(pos_cis, x):
+        ndim = x.ndim
+        assert 0 <= 1 < ndim
+        assert pos_cis.shape == (x.shape[1], x.shape[-1])
+        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+        return pos_cis.view(*shape)
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    pos_cis = unite_shape(pos_cis, xq_)
+    xq_out = torch.view_as_real(xq_ * pos_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * pos_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
+    bs, slen, n_kv_heads, head_dim = x.shape
+    if n_rep == 1:
+        return x
+    return (
+        x[:, :, :, None, :]
+        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+    )
+class Attention(nn.Module):
+    def __init__(self, args: LMConfig):
+        super().__init__()
+        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        assert args.n_heads % self.n_kv_heads == 0
+        self.n_local_heads = args.n_heads
+        self.n_local_kv_heads = self.n_kv_heads
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads
+        self.head_dim = args.dim // args.n_heads
+        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
+        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
+        self.k_cache, self.v_cache = None, None
+        self.attn_dropout = nn.Dropout(args.dropout)
+        self.resid_dropout = nn.Dropout(args.dropout)
+        self.dropout = args.dropout
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and args.flash_attn
+        if not self.flash:
+            # print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
+            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
+            mask = torch.triu(mask, diagonal=1)
+            self.register_buffer("mask", mask)
+    def forward(self, x: torch.Tensor, pos_cis: torch.Tensor, use_kv_cache=False):
+        bsz, seqlen, _ = x.shape
+        if use_kv_cache and self.eval():
+            if self.k_cache is None or self.k_cache.shape[1] != x.shape[1] - 1:
+                xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+            else:
+                token = x[:, -1:, :]
+                xq = torch.cat((torch.zeros_like(x[:, :-1, :]), self.wq(token)), dim=1)
+                xk = torch.cat((self.k_cache, self.wk(token)), dim=1)
+                xv = torch.cat((self.v_cache, self.wv(token)), dim=1)
+            self.k_cache, self.v_cache = xk, xv
+        else:
+            xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+        xq, xk = apply_rotary_emb(xq, xk, pos_cis)
+        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+        xq = xq.transpose(1, 2)
+        xk = xk.transpose(1, 2)
+        xv = xv.transpose(1, 2)
+        if self.flash:
+            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None,
+                                                                      dropout_p=self.dropout if self.training else 0.0,
+                                                                      is_causal=True)
+        else:
+            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
+            assert hasattr(self, 'mask')
+            scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
+            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+            scores = self.attn_dropout(scores)
+            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)
+        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+        output = self.wo(output)
+        output = self.resid_dropout(output)
+        return output
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
+        super().__init__()
+        if hidden_dim is None:
+            hidden_dim = 4 * dim
+            hidden_dim = int(2 * hidden_dim / 3)
+            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x):
+        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))
+class MoEGate(nn.Module):
+    def __init__(self, config: LMConfig):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.scoring_func = config.scoring_func
+        self.alpha = config.aux_loss_alpha
+        self.seq_aux = config.seq_aux
+        self.norm_topk_prob = config.norm_topk_prob
+        self.gating_dim = config.dim
+        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
+        self.reset_parameters()
+    def reset_parameters(self) -> None:
+        import torch.nn.init as init
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+    def forward(self, hidden_states):
+        bsz, seq_len, h = hidden_states.shape
+        hidden_states = hidden_states.view(-1, h)
+        logits = F.linear(hidden_states, self.weight, None)
+        if self.scoring_func == 'softmax':
+            scores = logits.softmax(dim=-1)
+        else:
+            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')
+        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+        if self.top_k > 1 and self.norm_topk_prob:
+            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weight = topk_weight / denominator
+        if self.training and self.alpha > 0.0:
+            scores_for_aux = scores
+            aux_topk = self.top_k
+            topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
+            if self.seq_aux:
+                scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
+                ce = torch.zeros(bsz, self.n_routed_experts, device=hidden_states.device)
+                ce.scatter_add_(1, topk_idx_for_aux_loss,
+                                torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device)).div_(
+                    seq_len * aux_topk / self.n_routed_experts)
+                aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(dim=1).mean() * self.alpha
+            else:
+                mask_ce = F.one_hot(topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts)
+                ce = mask_ce.float().mean(0)
+                Pi = scores_for_aux.mean(0)
+                fi = ce * self.n_routed_experts
+                aux_loss = (Pi * fi).sum() * self.alpha
+        else:
+            aux_loss = None
+        return topk_idx, topk_weight, aux_loss
+class MOEFeedForward(nn.Module):
+    def __init__(self, config: LMConfig):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList([
+            FeedForward(
+                dim=config.dim,
+                hidden_dim=config.hidden_dim,
+                multiple_of=config.multiple_of,
+                dropout=config.dropout,
+            )
+            for _ in range(config.n_routed_experts)
+        ])
+        self.gate = MoEGate(config)
+        if config.n_shared_experts is not None:
+            self.shared_experts = FeedForward(
+                dim=config.dim,
+                hidden_dim=config.hidden_dim,
+                multiple_of=config.multiple_of,
+                dropout=config.dropout,
+            )
+    def forward(self, x):
+        identity = x
+        orig_shape = x.shape
+        bsz, seq_len, _ = x.shape
+        # 使用门控机制选择专家
+        topk_idx, topk_weight, aux_loss = self.gate(x)
+        x = x.view(-1, x.shape[-1])
+        flat_topk_idx = topk_idx.view(-1)
+        if self.training:
+            # 训练模式下，重复输入数据
+            x = x.repeat_interleave(self.config.num_experts_per_tok, dim=0)
+            y = torch.empty_like(x, dtype=torch.float16)
+            for i, expert in enumerate(self.experts):
+                y[flat_topk_idx == i] = expert(x[flat_topk_idx == i])
+            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+            y = y.view(*orig_shape)
+        else:
+            # 推理模式下，只选择最优专家
+            y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
+        if self.config.n_shared_experts is not None:
+            y = y + self.shared_experts(identity)
+        return y
+    @torch.no_grad()
+    def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
+        expert_cache = torch.zeros_like(x)
+        idxs = flat_expert_indices.argsort()
+        tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
+        token_idxs = idxs // self.config.num_experts_per_tok
+        # 例如当tokens_per_expert=[6, 15, 20, 26, 33, 38, 46, 52]
+        # 当token_idxs=[3, 7, 19, 21, 24, 25,  4,  5,  6, 10, 11, 12...]
+        # 意味着当token_idxs[:6] -> [3,  7, 19, 21, 24, 25,  4]位置的token都由专家0处理，token_idxs[6:15]位置的token都由专家1处理......
+        for i, end_idx in enumerate(tokens_per_expert):
+            start_idx = 0 if i == 0 else tokens_per_expert[i - 1]
+            if start_idx == end_idx:
+                continue
+            expert = self.experts[i]
+            exp_token_idx = token_idxs[start_idx:end_idx]
+            expert_tokens = x[exp_token_idx]
+            expert_out = expert(expert_tokens)
+            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
+            # 使用 scatter_add_ 进行 sum 操作
+            expert_cache.scatter_add_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out)
+        return expert_cache
+class TransformerBlock(nn.Module):
+    def __init__(self, layer_id: int, args: LMConfig):
+        super().__init__()
+        self.n_heads = args.n_heads
+        self.dim = args.dim
+        self.head_dim = args.dim // args.n_heads
+        self.attention = Attention(args)
+        self.layer_id = layer_id
+        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        if args.use_moe:
+            self.feed_forward = MOEFeedForward(args)
+        else:
+            self.feed_forward = FeedForward(
+                dim=args.dim,
+                hidden_dim=args.hidden_dim,
+                multiple_of=args.multiple_of,
+                dropout=args.dropout,
+            )
+    def forward(self, x, pos_cis, use_kv_cache=False):
+        h = x + self.attention(self.attention_norm(x), pos_cis, use_kv_cache)
+        out = h + self.feed_forward(self.ffn_norm(h))
+        return out
+class Transformer(PreTrainedModel):
+    config_class = LMConfig
+    last_loss: Optional[torch.Tensor]
+    def __init__(self, params: LMConfig = None):
+        super().__init__(params)
+        if not params:
+            params = LMConfig()
+        self.params = params
+        self.vocab_size = params.vocab_size
+        self.n_layers = params.n_layers
+        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
+        self.dropout = nn.Dropout(params.dropout)
+        self.layers = torch.nn.ModuleList()
+        for layer_id in range(self.n_layers):
+            self.layers.append(TransformerBlock(layer_id, params))
+        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
+        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
+        self.tok_embeddings.weight = self.output.weight
+        pos_cis = precompute_pos_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
+        self.register_buffer("pos_cis", pos_cis, persistent=False)
+        self.apply(self._init_weights)
+        for pn, p in self.named_parameters():
+            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
+                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * params.n_layers))
+        self.last_loss = None
+        self.OUT = CausalLMOutputWithPast()
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+    def forward(self, tokens: Optional[torch.Tensor] = None, targets: Optional[torch.Tensor] = None,
+                use_kv_cache=False, **keyargs):
+        if 'input_ids' in keyargs:
+            tokens = keyargs['input_ids']
+        if 'attention_mask' in keyargs:
+            targets = keyargs['attention_mask']
+        _bsz, seqlen = tokens.shape
+        h = self.tok_embeddings(tokens)
+        h = self.dropout(h)
+        pos_cis = self.pos_cis[:seqlen]
+        for idx, layer in enumerate(self.layers):
+            h = layer(h, pos_cis, use_kv_cache)
+        h = self.norm(h)
+        if targets is not None:
+            logits = self.output(h)
+            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+        else:
+            logits = self.output(h[:, [-1], :])
+            self.last_loss = None
+        self.OUT.__setitem__('logits', logits)
+        self.OUT.__setitem__('last_loss', self.last_loss)
+        return self.OUT
+    @torch.inference_mode()
+    def generate(self, idx, eos, max_new_tokens, temperature=0.7, top_k=None, stream=True, repetition_penalty=1.,
+                 use_kv_cache=True):
+        index = idx.shape[1]
+        while idx.shape[1] < max_new_tokens - 1:
+            inference_res = self(idx, use_kv_cache=use_kv_cache)
+            logits = inference_res.logits
+            logits = logits[:, -1, :]
+            for token in set(idx.tolist()[0]):
+                logits[:, token] /= repetition_penalty
+            if temperature == 0.0:
+                _, idx_next = torch.topk(logits, k=1, dim=-1)
+            else:
+                logits = logits / temperature
+                if top_k is not None:
+                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits[logits < v[:, [-1]]] = -float('Inf')
+                probs = F.softmax(logits, dim=-1)
+                idx_next = torch.multinomial(probs, num_samples=1, generator=None)
+            if idx_next == eos:
+                break
+            idx = torch.cat((idx, idx_next), dim=1)
+            if stream:
+                yield idx[:, index:]
+        if not stream:
+            yield idx[:, index:]
+    @torch.inference_mode()
+    def eval_answer(self, idx):
+        idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
+        inference_res = self(idx_cond)
+        logits = inference_res.logits
+        logits = logits[:, -1, :]
+        return logits

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80f6d97a9e2f2adac9d378e29027bfc5672fe6321d26e52d467588ead5f41e7f
+size 384461330

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<s>user\\n' + content + '</s>\\n<s>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}