philipp-zettl commited on
Commit
12dd66e
1 Parent(s): 094f3cb

Update README.md (#1)

Browse files

- Update README.md (ed08b4556920fc85c13c3269b2c31e90ea2acdd1)
- Create train.py (c5454e8c951a99b122a7df833149f0609cd109a1)
- Upload gpt-p_CHARS_CHAT_vocab_size=33n_embed=384context_size=256n_layer=6n_head=6dropout=0.2 (a4e2cab2a16be124a0a0c99df50ae7d5b9db89d1)

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ gpt-p_CHARS_CHAT_vocab_size=33n_embed=384context_size=256n_layer=6n_head=6dropout=0.2 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,32 @@
1
- ---
2
- license: cc0-1.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc0-1.0
3
+ datasets:
4
+ - Lichess/standard-chess-games
5
+ pipeline_tag: text2text-generation
6
+ tags:
7
+ - chess
8
+ ---
9
+ # Model card for chessPT
10
+ A pretrained decoder-only transformer model for chess move prediction.
11
+
12
+ ## Intended use
13
+ Predict new moves in a chess game based on PGN tokens.
14
+
15
+ ## Implementation
16
+ The model implementation is based on Andrej Karpathy's [nanoGPT](https://github.com/karpathy/nanoGPT), following the web series "Zero to Hero" on [YouTube](https://www.youtube.com/playlist?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ).
17
+
18
+ ## Training
19
+ You can find the training script in the repository's files under `train.py`.
20
+ It also contains the hyperparameters that were used:
21
+ ```python
22
+ context_size = 256
23
+ batch_size = 128
24
+ max_iters = 30_000
25
+ learning_rate = 3e-5
26
+ eval_interval = 100
27
+ eval_iters = 20
28
+ n_embed = 384
29
+ n_layer = 6
30
+ n_head = 6
31
+ dropout = 0.2
32
+ ```
gpt-p_CHARS_CHAT_vocab_size=33n_embed=384context_size=256n_layer=6n_head=6dropout=0.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfda5c18a6a7dcc83b73857034e48b26b420ca8be96c03c7f50097622d78a298
3
+ size 52603542
train.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import functional as F
5
+ from gpt_p.model import DecoderTransformer
6
+ from datasets import load_dataset
7
+
8
+
9
# Seed the torch RNG so batch sampling and weight init are repeatable.
torch.manual_seed(420)  # 1337

# --- Run configuration / hyperparameters ---
base_name = 'gpt-p_CHARS_CHAT_'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
context_size = 256  # how many tokens to consider while generating the next
batch_size = 128  # how many independent sequences will we process in parallel
max_iters = 30_000
learning_rate = 3e-5
eval_interval = 100
eval_iters = 20  # number evaluation iterations
n_embed = 384  # embedding size
n_layer = 6  # number of transformer layers
n_head = 6
dropout = 0.2  # dropout factor

dataset = load_dataset('Lichess/standard-chess-games', split='train')
# Keep only movetext entries that do not contain the substring 'eval',
# and concatenate them with newlines into one training corpus.
content = '\n'.join(text for text in dataset['movetext'] if 'eval' not in text)

## BUILD DATA SET ##
book = content
# Character-level vocabulary: every distinct character of the corpus, sorted.
characters = sorted(set(book))
vocab_size = len(characters)

# Lookup tables between characters and their integer ids.
stoi = {ch: idx for idx, ch in enumerate(characters)}
itos = dict(enumerate(characters))


def encode(s):
    """Return the list of integer ids for the characters of `s`."""
    return [stoi[c] for c in s]


def decode(i):
    """Return the string spelled by the iterable of integer ids `i`."""
    return ''.join(itos[x] for x in i)


data = torch.tensor(encode(book), dtype=torch.long)
# First 90% of the encoded corpus is used for training, the rest for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
44
+
45
+
46
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    `split` selects `train_data` when it equals 'train'; any other value
    selects `val_data`. Returns a pair `(x, y)` of tensors, each stacked from
    `batch_size` windows of `context_size` tokens and moved to `device`.
    `y` is `x` shifted one position to the right (the next-token targets).
    """
    source = val_data if split != 'train' else train_data
    # The exclusive upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - context_size, (batch_size,))
    inputs = torch.stack([source[s:s + context_size] for s in starts])
    targets = torch.stack([source[s + 1:s + context_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)
52
+
53
+ ## END BUILD DATA SET ##
54
+ ## MODEL DEFINITION ##
55
+
56
def print_sample(input_value=None):
    """Generate a continuation with the global `model` and print it.

    `input_value` is the tensor of prompt token ids passed to
    `model.generate`; when omitted, a single token with id 0 is used as the
    prompt. Up to 250 new tokens are requested. If the decoded text contains
    the marker '<E>', the output is cut right after its first occurrence.
    """
    if input_value is None:
        input_value = torch.zeros((1, 1), dtype=torch.long, device=device)
    print('Validation sample:')
    generated = model.generate(input_value, max_new_tokens=250, context_size=context_size)
    sample = decode(generated[0].tolist())
    end_pos = sample.find('<E>')
    if end_pos != -1:
        # Keep the marker itself (3 characters) and drop everything after it.
        sample = sample[:end_pos + 3]
    print(sample)
64
+
65
+
66
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Puts the global `model` in eval mode, averages the loss over `eval_iters`
    random batches per split, prints a generated sample for the fixed prompt
    '1. e4 g6', and switches the model back to train mode.

    Returns a dict with keys 'train' and 'val' mapping to the mean loss of
    each split (as a scalar tensor).
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        # model(X, Y) returns (logits, loss); only the loss value is kept.
        batch_losses = [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
        out[split] = torch.tensor(batch_losses, dtype=torch.float32).mean()

    input_string = '1. e4 g6'
    prompt_ids = torch.tensor(encode(input_string), dtype=torch.long, device=device)
    # Reshape to a single-row batch before handing the prompt to the sampler.
    print_sample(prompt_ids.view((1, len(input_string))))
    model.train()
    return out
82
+
83
+
84
if __name__ == "__main__":
    # Script entry point: parse flags, build (and optionally restore) the
    # model, train it for `max_iters` steps, then save the weights and append
    # a line to train.log.
    import os  # local import: only needed here to create the checkpoint directory

    parser = argparse.ArgumentParser()
    parser.add_argument('--load', '-l', action='store_true', default=False, help='Load model state.')
    parser.add_argument('--inference', '-i', action='store_true', default=False, help='Run only inference')
    args = parser.parse_args()

    params = {'vocab_size': vocab_size, 'n_embed': n_embed, 'context_size': context_size, 'n_layer': n_layer, 'n_head': n_head, 'dropout': dropout}
    # The checkpoint file name encodes every hyperparameter; build it once so
    # the load path and the save path cannot drift apart.
    model_path = f'./models/{base_name}' + ''.join(f'{key}={v}' for key, v in params.items())

    m = DecoderTransformer(vocab_size, n_embed, context_size, n_layer, n_head, dropout)
    if args.load:
        # map_location lets a checkpoint saved on a GPU be restored on a
        # CPU-only machine (and vice versa).
        m.load_state_dict(torch.load(model_path, map_location=device))
    model = m.to(device)

    if args.inference:
        # Same effect as the previous bare exit(), without relying on the
        # `site`-provided builtin.
        raise SystemExit(0)
    ## END MODEL ##
    ## START TRAINING ##
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for step in range(max_iters):
        # Periodically report the estimated train/val loss (also prints a sample).
        if step % eval_interval == 0:
            losses = estimate_loss()
            print(f'step {step:4d}: train loss {losses["train"]:.4f}, val loss: {losses["val"]:.4f}')

        xb, yb = get_batch('train')

        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    print()
    print('Loss:')
    print(loss.item())

    ## END TRAINING ##
    ## START VALIDATION ##

    ## END VALIDATION ##

    # save model weights
    # Make sure the target directory exists so a finished training run is not
    # lost to a missing ./models folder.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(model.state_dict(), model_path)
    with open('train.log', 'a') as f:
        f.write(f'{max_iters},{learning_rate}\n')