Commit e443128 by RobbiePasquale (0 parents)

Initial commit of GPT-MoE-MCTS model

__init__.py ADDED
File without changes
configuration_gpt_moe_mcts.py ADDED
@@ -0,0 +1,32 @@
+ from transformers import PretrainedConfig
+
+ class GPTMoEMCTSConfig(PretrainedConfig):
+     model_type = "gpt_moe_mcts"
+
+     def __init__(
+         self,
+         vocab_size=50257,
+         block_size=512,
+         n_layer=6,
+         n_head=4,
+         n_embd=256,
+         dropout=0.2,
+         num_experts=3,
+         expert_layers=3,
+         block_size_q=32,
+         block_size_kv=32,
+         num_blocks_kv=4,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.vocab_size = vocab_size
+         self.block_size = block_size
+         self.n_layer = n_layer
+         self.n_head = n_head
+         self.n_embd = n_embd
+         self.dropout = dropout
+         self.num_experts = num_experts
+         self.expert_layers = expert_layers
+         self.block_size_q = block_size_q
+         self.block_size_kv = block_size_kv
+         self.num_blocks_kv = num_blocks_kv
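
Since the README below advertises Hugging Face compatibility, it may help to show how this configuration and model could be wired into the Auto classes. This is a sketch under assumptions, not code from the commit: it presumes both modules are importable from the working directory and that the causal-LM mapping is the intended one; `AutoConfig.register`, `AutoModelForCausalLM.register`, and `AutoConfig.for_model` are standard Transformers APIs.

```python
from transformers import AutoConfig, AutoModelForCausalLM

from configuration_gpt_moe_mcts import GPTMoEMCTSConfig
from modeling_gpt_moe_mcts import GPTMoEMCTSModel

# Register the custom config/model under the "gpt_moe_mcts" model_type so the
# Auto classes can resolve them (assumed wiring; not part of this commit).
AutoConfig.register("gpt_moe_mcts", GPTMoEMCTSConfig)
AutoModelForCausalLM.register(GPTMoEMCTSConfig, GPTMoEMCTSModel)

config = AutoConfig.for_model("gpt_moe_mcts")    # default hyperparameters from the config class
model = AutoModelForCausalLM.from_config(config)
```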
example_usage.py ADDED
@@ -0,0 +1,29 @@
+ from transformers import GPT2Tokenizer
+ from modeling_gpt_moe_mcts import GPTMoEMCTSModel
+ from configuration_gpt_moe_mcts import GPTMoEMCTSConfig
+
+ # Initialize configuration
+ config = GPTMoEMCTSConfig()
+
+ # Initialize model
+ model = GPTMoEMCTSModel(config)
+
+ # Initialize tokenizer (using GPT2Tokenizer as a base)
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+
+ # Prepare input
+ text = "Hello, how are you?"
+ inputs = tokenizer(text, return_tensors="pt")
+
+ # Forward pass
+ outputs = model(**inputs)
+
+ # Get the predicted next token
+ next_token_logits = outputs.logits[0, -1, :]
+ next_token = next_token_logits.argmax()
+
+ # Decode the predicted token
+ predicted_text = tokenizer.decode(next_token)
+
+ print(f"Input: {text}")
+ print(f"Predicted next token: {predicted_text}")
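
The example above stops at a single next-token prediction. For reference, a minimal greedy-decoding loop built on the same objects (`model`, `tokenizer`, `config`, `inputs`) might look like the sketch below; `max_new_tokens` is an arbitrary assumed budget, not something defined in the repository.

```python
import torch

max_new_tokens = 20  # assumed budget, not defined in the repo
input_ids = inputs["input_ids"]

with torch.no_grad():
    for _ in range(max_new_tokens):
        # Keep the context within the model's maximum sequence length.
        idx_cond = input_ids[:, -config.block_size:]
        logits = model(input_ids=idx_cond).logits
        next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_id], dim=1)

print(tokenizer.decode(input_ids[0]))
```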
modeling_gpt_moe_mcts.py ADDED
@@ -0,0 +1,208 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from transformers import PreTrainedModel
+ from transformers.modeling_outputs import CausalLMOutput
+
+ try:
+     from .configuration_gpt_moe_mcts import GPTMoEMCTSConfig
+ except ImportError:
+     # Fall back to an absolute import so the file also works outside a package
+     # (e.g. when running example_usage.py directly from the repository root).
+     from configuration_gpt_moe_mcts import GPTMoEMCTSConfig
+
+
+ class FlashAttention3(nn.Module):
+     """Block-wise multi-head attention with an online-softmax accumulation.
+
+     Work buffers are created on the input's device, so no explicit device argument
+     is needed. Note: scores are not scaled by 1/sqrt(head_dim) and no causal mask
+     is applied in this implementation.
+     """
+
+     def __init__(self, d_model, n_heads, block_size_q, block_size_kv, num_blocks_kv):
+         super().__init__()
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.block_size_q = block_size_q
+         self.block_size_kv = block_size_kv
+         self.num_blocks_kv = num_blocks_kv
+
+         self.q_proj = nn.Linear(d_model, d_model)
+         self.k_proj = nn.Linear(d_model, d_model)
+         self.v_proj = nn.Linear(d_model, d_model)
+         self.out_proj = nn.Linear(d_model, d_model)
+
+     def forward(self, x):
+         B, T, C = x.size()
+         head_dim = C // self.n_heads
+         Q = self.q_proj(x).view(B, T, self.n_heads, head_dim).transpose(1, 2)
+         K = self.k_proj(x).view(B, T, self.n_heads, head_dim).transpose(1, 2)
+         V = self.v_proj(x).view(B, T, self.n_heads, head_dim).transpose(1, 2)
+
+         O = torch.zeros(B, self.n_heads, T, head_dim, device=x.device, dtype=x.dtype)
+
+         for i in range(0, T, self.block_size_q):
+             Q_block = Q[:, :, i:i + self.block_size_q]
+             # Running output, normalizer (L) and row-max (M) for this query block.
+             O_block = torch.zeros_like(Q_block)
+             L_block = torch.zeros(B, self.n_heads, Q_block.size(2), device=x.device, dtype=x.dtype)
+             M_block = torch.full((B, self.n_heads, Q_block.size(2)), -float('inf'), device=x.device, dtype=x.dtype)
+
+             for j in range(0, T, self.block_size_kv):
+                 K_block = K[:, :, j:j + self.block_size_kv]
+                 V_block = V[:, :, j:j + self.block_size_kv]
+
+                 S_block = torch.matmul(Q_block, K_block.transpose(-2, -1))
+                 M_block_old = M_block
+                 M_block = torch.max(M_block, S_block.max(dim=-1).values)
+
+                 exp_S_block = torch.exp(S_block - M_block.unsqueeze(-1))
+                 scale = torch.exp(M_block_old - M_block)
+                 L_block = scale * L_block + exp_S_block.sum(dim=-1)
+                 # Rescale the partial output before adding the new block; without this
+                 # the online softmax does not match a full softmax over all keys.
+                 O_block = scale.unsqueeze(-1) * O_block + torch.matmul(exp_S_block, V_block)
+
+             O_block = O_block / L_block.unsqueeze(-1)
+             O[:, :, i:i + self.block_size_q] = O_block
+
+         O = O.transpose(1, 2).contiguous().view(B, T, C)
+         return self.out_proj(O)
+
+
+ class MLP(nn.Module):
+     """GPT-style feed-forward block: Linear -> GELU -> Linear -> Dropout."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+         self.gelu = nn.GELU(approximate='tanh')
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+         self.dropout = nn.Dropout(config.dropout)
+         self.c_proj.scale_init = 1
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         x = self.dropout(x)
+         return x
+
+
+ class MixtureOfExperts(nn.Module):
+     """Soft mixture of experts: every expert processes the input and a learned gate blends the outputs."""
+
+     def __init__(self, config, num_experts, expert_layers):
+         super().__init__()
+         self.num_experts = num_experts
+         self.expert_layers = expert_layers
+
+         self.experts = nn.ModuleList([self._create_expert(config) for _ in range(num_experts)])
+         self.gate = nn.Linear(config.n_embd, num_experts)
+
+     def _create_expert(self, config):
+         layers = []
+         for _ in range(self.expert_layers):
+             layers.append(FlashAttention3(
+                 d_model=config.n_embd,
+                 n_heads=config.n_head,
+                 block_size_q=config.block_size_q,
+                 block_size_kv=config.block_size_kv,
+                 num_blocks_kv=config.num_blocks_kv,
+             ))
+             layers.append(nn.LayerNorm(config.n_embd))
+             layers.append(MLP(config))
+         return nn.Sequential(*layers)
+
+     def forward(self, x):
+         B, T, C = x.size()
+
+         # Per-token gating distribution over experts: (B, T, num_experts).
+         gate_scores = self.gate(x)
+         gate_probs = F.softmax(gate_scores, dim=-1)
+
+         # Run every expert on the full input: (B, num_experts, T, C).
+         expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)
+
+         # Align gate probabilities to (B, num_experts, T, 1) and blend the expert outputs.
+         gate_probs = gate_probs.unsqueeze(-1)
+         gate_probs = gate_probs.permute(0, 2, 1, 3)
+
+         output = torch.sum(gate_probs * expert_outputs, dim=1)
+
+         return output
+
+
+ class BlockWithMoE(nn.Module):
+     """Transformer block with attention, mixture-of-experts and MLP sub-layers (pre-norm residuals)."""
+
+     def __init__(self, config, num_experts=4, expert_layers=2, block_size_q=32, block_size_kv=32, num_blocks_kv=4):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd)
+         self.attn = FlashAttention3(d_model=config.n_embd, n_heads=config.n_head, block_size_q=block_size_q, block_size_kv=block_size_kv, num_blocks_kv=num_blocks_kv)
+         self.dropout1 = nn.Dropout(config.dropout)
+         self.ln_2 = nn.LayerNorm(config.n_embd)
+         self.moe = MixtureOfExperts(config, num_experts, expert_layers)
+         self.dropout2 = nn.Dropout(config.dropout)
+         self.ln_3 = nn.LayerNorm(config.n_embd)
+         self.mlp = MLP(config)
+         self.dropout3 = nn.Dropout(config.dropout)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = self.dropout1(x)
+         x = x + self.moe(self.ln_2(x))
+         x = self.dropout2(x)
+         x = x + self.mlp(self.ln_3(x))
+         x = self.dropout3(x)
+         return x
+
+
+ class GPTMoEMCTSPreTrainedModel(PreTrainedModel):
+     config_class = GPTMoEMCTSConfig
+     base_model_prefix = "gpt_moe_mcts"
+
+     def __init__(self, *inputs, **kwargs):
+         super().__init__(*inputs, **kwargs)
+
+
+ class GPTMoEMCTSModel(GPTMoEMCTSPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+
+         self.transformer = nn.ModuleDict(dict(
+             wte=nn.Embedding(config.vocab_size, config.n_embd),
+             wpe=nn.Embedding(config.block_size, config.n_embd),
+             h=nn.ModuleList([
+                 BlockWithMoE(
+                     config,
+                     num_experts=config.num_experts,
+                     expert_layers=config.expert_layers,
+                     block_size_q=config.block_size_q,
+                     block_size_kv=config.block_size_kv,
+                     num_blocks_kv=config.num_blocks_kv,
+                 )
+                 for _ in range(config.n_layer)
+             ]),
+             ln_f=nn.LayerNorm(config.n_embd),
+         ))
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         # Weight tying between the token embedding and the output projection.
+         self.transformer.wte.weight = self.lm_head.weight
+
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         token_type_ids=None,
+         position_ids=None,
+         head_mask=None,
+         inputs_embeds=None,
+         labels=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         B, T = input_ids.size()
+         assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+
+         # Token and positional embeddings.
+         pos = torch.arange(0, T, dtype=torch.long, device=input_ids.device)
+         pos_emb = self.transformer.wpe(pos)
+         tok_emb = self.transformer.wte(input_ids)
+         x = tok_emb + pos_emb
+
+         for block in self.transformer.h:
+             x = block(x)
+
+         x = self.transformer.ln_f(x)
+         logits = self.lm_head(x)
+
+         loss = None
+         if labels is not None:
+             # Labels are expected to be aligned with input_ids; no shift is applied here.
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
+
+         if not return_dict:
+             output = (logits,)
+             return ((loss,) + output) if loss is not None else output
+
+         return CausalLMOutput(loss=loss, logits=logits)
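
Because the block-wise online-softmax accumulation in `FlashAttention3` is the most error-prone part of this file, a quick numerical comparison against a plain dense softmax attention can be a useful check. This is a sketch, assuming the module is importable from the repository root; the tolerance is arbitrary.

```python
import torch
from torch.nn import functional as F

from modeling_gpt_moe_mcts import FlashAttention3

torch.manual_seed(0)
attn = FlashAttention3(d_model=64, n_heads=4, block_size_q=8, block_size_kv=8, num_blocks_kv=4)

x = torch.randn(2, 32, 64)
with torch.no_grad():
    out_blockwise = attn(x)

    # Reference: the same projections, but one dense softmax over all keys
    # (unscaled and unmasked, to match the block-wise implementation).
    B, T, C = x.shape
    h, d = attn.n_heads, C // attn.n_heads
    Q = attn.q_proj(x).view(B, T, h, d).transpose(1, 2)
    K = attn.k_proj(x).view(B, T, h, d).transpose(1, 2)
    V = attn.v_proj(x).view(B, T, h, d).transpose(1, 2)
    ref = F.softmax(Q @ K.transpose(-2, -1), dim=-1) @ V
    ref = attn.out_proj(ref.transpose(1, 2).contiguous().view(B, T, C))

print(torch.allclose(out_blockwise, ref, atol=1e-5))  # expected: True
```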
readme.md ADDED
@@ -0,0 +1,141 @@
+ # GPT-MoE-MCTS: GPT with Mixture of Experts and Monte Carlo Tree Search
+
+ ## Table of Contents
+ 1. [Introduction](#introduction)
+ 2. [Key Features](#key-features)
+ 3. [Model Architecture](#model-architecture)
+ 4. [Installation](#installation)
+ 5. [Usage](#usage)
+ 6. [Training](#training)
+ 7. [Evaluation](#evaluation)
+ 8. [MCTS Decoding](#mcts-decoding)
+ 9. [Contributing](#contributing)
+ 10. [License](#license)
+
+ ## Introduction
+
+ GPT-MoE-MCTS is a language model that combines a GPT (Generative Pre-trained Transformer) backbone with Mixture of Experts (MoE) layers and Monte Carlo Tree Search (MCTS) decoding. The goal is to improve generation quality while keeping training and inference efficient.
+
+ ## Key Features
+
+ - **GPT-based Architecture**: A decoder-only GPT backbone for language modeling.
+ - **Mixture of Experts**: A learned gating network blends the outputs of several expert sub-networks, letting different parts of the model specialize on different inputs.
+ - **FlashAttention3**: A block-wise attention implementation with an online softmax for memory-efficient computation.
+ - **Monte Carlo Tree Search Decoding**: Uses MCTS during inference for higher-quality text generation.
+ - **Hugging Face Compatible**: Built on `PretrainedConfig` and `PreTrainedModel`, so it integrates with the Hugging Face Transformers library.
+
+ ## Model Architecture
+
+ The GPT-MoE-MCTS model consists of the following key components:
+
+ 1. **Token and Positional Embeddings**: Convert input tokens into embeddings and add positional information.
+ 2. **Transformer Blocks with MoE**: Multiple transformer blocks, each incorporating:
+    - FlashAttention3: a block-wise attention mechanism.
+    - Mixture of Experts layer: a gated ensemble of expert sub-networks.
+    - Feed-Forward Network: a standard MLP for additional processing.
+ 3. **Output Layer**: Final layer normalization and a projection to vocabulary logits (weight-tied with the token embeddings).
+
+ ## Installation
+
+ To install GPT-MoE-MCTS, follow these steps:
+
+ ```bash
+ git clone https://github.com/yourusername/gpt-moe-mcts.git
+ cd gpt-moe-mcts
+ pip install -r requirements.txt
+ ```
+
+ ## Usage
+
+ Here is a basic example of how to use the model:
+
+ ```python
+ from transformers import GPT2Tokenizer
+ from modeling_gpt_moe_mcts import GPTMoEMCTSModel
+ from configuration_gpt_moe_mcts import GPTMoEMCTSConfig
+
+ # Initialize configuration and model
+ config = GPTMoEMCTSConfig()
+ model = GPTMoEMCTSModel(config)
+
+ # Initialize tokenizer (using GPT2Tokenizer as a base)
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+
+ # Prepare input
+ text = "Hello, how are you?"
+ inputs = tokenizer(text, return_tensors="pt")
+
+ # Forward pass
+ outputs = model(**inputs)
+
+ # Get the predicted next token
+ next_token_logits = outputs.logits[0, -1, :]
+ next_token = next_token_logits.argmax()
+
+ # Decode the predicted token
+ predicted_text = tokenizer.decode(next_token)
+
+ print(f"Input: {text}")
+ print(f"Predicted next token: {predicted_text}")
+ ```
+
+ ## Training
+
+ To train GPT-MoE-MCTS on your own data:
+
+ 1. Prepare your dataset as tokenized `.npy` files.
+ 2. Adjust the hyperparameters in the `train_model()` function in `train.py`.
+ 3. Run the training script:
+
+ ```bash
+ python train.py
+ ```
+
+ The script automatically saves checkpoints and displays training progress.
+
+ ## Evaluation
+
+ To evaluate the model's performance:
+
+ ```python
+ from eval_utils import evaluate_model
+
+ perplexity, accuracy = evaluate_model(model, eval_dataloader)
+ print(f"Perplexity: {perplexity}, Accuracy: {accuracy}")
+ ```
+
+ ## MCTS Decoding
+
+ GPT-MoE-MCTS uses Monte Carlo Tree Search for decoding during inference. To use MCTS decoding:
+
+ ```python
+ from mcts_decode import mcts_decode
+
+ generated_text = mcts_decode(model, input_text, max_length=50, num_simulations=100)
+ print(f"Generated text: {generated_text}")
+ ```
+
+ ## Contributing
+
+ We welcome contributions to the GPT-MoE-MCTS project! Please see our [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to contribute.
+
+ ## License
+
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+
+ ---
+
+ For more detailed information about the model architecture, training process, and advanced usage, please refer to our [documentation](docs/index.md).
+
+ If you use GPT-MoE-MCTS in your research, please cite:
+
+ ```
+ @misc{GPT-MoE-MCTS,
+   author = {Your Name},
+   title = {GPT-MoE-MCTS: GPT with Mixture of Experts and Monte Carlo Tree Search},
+   year = {2023},
+   publisher = {GitHub},
+   journal = {GitHub repository},
+   howpublished = {\url{https://github.com/yourusername/gpt-moe-mcts}}
+ }
+ ```
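
Note that `train.py`, `eval_utils.py`, and `mcts_decode.py` referenced in the README are not included in this commit. As a rough, hedged sketch of what the evaluation step could look like using only the model's own loss output (the dataloader yielding `(input_ids, labels)` batches and the helper name are assumptions), perplexity is simply the exponential of the mean cross-entropy:

```python
import math
import torch

def evaluate_perplexity(model, dataloader):
    """Average the model's cross-entropy loss over a dataloader and exponentiate it (hypothetical helper)."""
    model.eval()
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for input_ids, labels in dataloader:
            out = model(input_ids=input_ids, labels=labels)
            total_loss += out.loss.item()
            num_batches += 1
    return math.exp(total_loss / max(num_batches, 1))
```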
tokenizer_gpt_moe_mcts.py ADDED
@@ -0,0 +1,14 @@
+ from transformers import GPT2Tokenizer
+
+ class GPTMoEMCTSTokenizer(GPT2Tokenizer):
+     def __init__(
+         self,
+         vocab_file,
+         merges_file,
+         **kwargs
+     ):
+         super().__init__(
+             vocab_file,
+             merges_file,
+             **kwargs
+         )
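
The subclass does not add behaviour beyond `GPT2Tokenizer` yet, so it can be instantiated straight from the upstream GPT-2 vocabulary files via the inherited `from_pretrained` machinery. A brief sketch (loading from the `gpt2` checkpoint is an assumption, not something this commit configures):

```python
from tokenizer_gpt_moe_mcts import GPTMoEMCTSTokenizer

# Reuses the GPT-2 vocab.json / merges.txt through the inherited from_pretrained logic.
tokenizer = GPTMoEMCTSTokenizer.from_pretrained("gpt2")
print(tokenizer("Hello, how are you?")["input_ids"])
```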