migueldeguzmandev committed
Commit
2ad9bb3
1 Parent(s): 9dd2ba5

Upload 13 files

awakening.text ADDED
The diff for this file is too large to render.
 
cached_lm_GPT2TokenizerFast_128_awakening.text ADDED
Binary file (815 kB).
 
cached_lm_GPT2TokenizerFast_128_awakening.text.lock ADDED
File without changes
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_name_or_path": "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/base_model/",
+   "alibi": true,
+   "apply_residual_connection_post_layernorm": false,
+   "architectures": [
+     "FalconForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_falcon.FalconConfig",
+     "AutoModel": "modeling_falcon.FalconModel",
+     "AutoModelForCausalLM": "modeling_falcon.FalconForCausalLM",
+     "AutoModelForQuestionAnswering": "modeling_falcon.FalconForQuestionAnswering",
+     "AutoModelForSequenceClassification": "modeling_falcon.FalconForSequenceClassification",
+     "AutoModelForTokenClassification": "modeling_falcon.FalconForTokenClassification"
+   },
+   "bias": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_dropout": 0.0,
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "falcon",
+   "multi_query": false,
+   "new_decoder_architecture": false,
+   "num_attention_heads": 32,
+   "num_hidden_layers": 24,
+   "num_kv_heads": 32,
+   "parallel_attn": false,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.3",
+   "use_cache": true,
+   "vocab_size": 50304
+ }
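
Editor's note: the config above describes a 24-layer, 2048-wide Falcon model with 32 attention heads, ALiBi positions, and a GPT-2 tokenizer vocabulary padded to 50304 entries. A minimal loading sketch follows (not part of the commit); the path is a placeholder for wherever this snapshot is stored, and since model_type "falcon" has native support in transformers 4.33+ (the version recorded in the config), no trust_remote_code is needed unless you want the auto_map custom classes and their modules are present.

from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "/path/to/falcon-1b/v1"  # hypothetical local copy of this upload

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

print(model.config.hidden_size, model.config.num_hidden_layers)  # 2048, 24 per this config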
generate.py ADDED
@@ -0,0 +1,34 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ def main():
+     # Load the fine-tuned model and tokenizer
+     model_output_dir = "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/v1/"  # Replace with your fine-tuned model directory
+     tokenizer = AutoTokenizer.from_pretrained(model_output_dir)
+     model = AutoModelForCausalLM.from_pretrained(model_output_dir)
+
+     while True:
+         # User input for text generation prompt
+         prompt = input("Enter a prompt for text generation (or type 'exit' to quit): ")
+
+         if prompt.lower() == 'exit':
+             break
+
+         # Encode the prompt and generate text
+         input_ids = tokenizer.encode(prompt, return_tensors="pt")
+         output = model.generate(
+             input_ids,
+             max_length=1024,
+             num_return_sequences=1,
+             no_repeat_ngram_size=2,
+             top_k=50,
+             top_p=0.95,
+             temperature=0.001
+         )
+
+         # Decode and print the generated text
+         generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+         print("Generated Text:")
+         print(generated_text)
+
+ if __name__ == "__main__":
+     main()
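
Editor's note: as committed, generate() is called with do_sample left at its default of False, so decoding is greedy and the top_k, top_p, and temperature arguments have no effect (recent transformers versions warn about exactly this). If stochastic sampling is actually wanted, a drop-in variant of that call might look like the sketch below; the parameter values are illustrative, not from the commit.

output = model.generate(
    input_ids,
    max_new_tokens=512,       # bound newly generated tokens rather than total length
    do_sample=True,           # required for top_k/top_p/temperature to take effect
    top_k=50,
    top_p=0.95,
    temperature=0.7,          # illustrative; 0.001 effectively reproduces greedy decoding
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 tokenizer has no pad token
)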
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.33.3"
+ }
merges.txt ADDED
The diff for this file is too large to render.
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09d6ec8a034f6196ce41b03c113cfe32ea1a96e0a2d60d962bcf669d2b0cb6c1
+ size 5246593815
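
Editor's note: the committed pytorch_model.bin is a Git LFS pointer file, not the weights themselves; the actual float32 checkpoint (about 5.2 GB, matching the size field above) lives in LFS storage and is resolved when the repo is downloaded. A sketch of pulling the full snapshot with huggingface_hub; the repo id is a placeholder, not taken from this commit.

from huggingface_hub import snapshot_download

# repo_id below is hypothetical; substitute the actual model repo.
local_dir = snapshot_download(repo_id="your-namespace/falcon-1b-v1")
print(local_dir)  # contains config.json, the resolved pytorch_model.bin, tokenizer files, ...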
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 1024,
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
train.py ADDED
@@ -0,0 +1,75 @@
+ import os
+ import sys
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+
+ class GPTAssistant:
+     def __init__(self, model_name="/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/base_model/"):  # Replace with your Falcon base model path
+         try:
+             # Load the tokenizer and model from the specified model path
+             self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+             self.model = AutoModelForCausalLM.from_pretrained(model_name)
+         except Exception as e:
+             print(f"Error initializing the model or tokenizer: {e}")
+             sys.exit(1)
+
+     def fine_tune(self, answer_file_path, model_output_dir, epochs=1.0):
+         # Load dataset for training
+         try:
+             train_dataset = TextDataset(
+                 tokenizer=self.tokenizer,
+                 file_path=answer_file_path,
+                 block_size=128
+             )
+         except Exception as e:
+             print(f"Error loading training dataset: {e}")
+             sys.exit(1)  # Exit the script if dataset loading fails
+
+         # Prepare data collator for language modeling
+         data_collator = DataCollatorForLanguageModeling(
+             tokenizer=self.tokenizer,
+             mlm=False
+         )
+
+         total_steps = len(train_dataset) * epochs
+         warmup_steps = 0.1 * total_steps
+
+         # Set training arguments
+         training_args = TrainingArguments(
+             output_dir=model_output_dir,
+             overwrite_output_dir=True,
+             num_train_epochs=epochs,
+             per_device_train_batch_size=4,
+             save_steps=10_000,
+             save_total_limit=2,
+             weight_decay=0.001,
+             gradient_accumulation_steps=8,
+             learning_rate=3e-6,  # previously 15e-6, then 1e-6, then 7e-6
+             lr_scheduler_type='cosine',
+             warmup_steps=warmup_steps
+         )
+
+         # Initialize Trainer
+         trainer = Trainer(
+             model=self.model,
+             args=training_args,
+             data_collator=data_collator,
+             train_dataset=train_dataset
+         )
+
+         # Train and save the model
+         trainer.train()
+         self.model.save_pretrained(model_output_dir)
+         self.tokenizer.save_pretrained(model_output_dir)
+
+ def main():
+     # Specify the file path for training data and output directory
+     text_file_path = "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/v1/awakening.text"  # Replace with your training data file path
+     model_output_dir = "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/v1/"  # Replace with your desired output directory
+
+     # Initialize GPTAssistant and fine-tune the model
+     assistant = GPTAssistant()
+     assistant.fine_tune(text_file_path, model_output_dir)
+
+ if __name__ == "__main__":
+     main()
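
Editor's note: in the script above, total_steps is computed as len(train_dataset) * epochs, which counts 128-token training blocks rather than optimizer updates. With per_device_train_batch_size=4 and gradient_accumulation_steps=8, each optimizer step consumes roughly 32 blocks, so the computed warmup_steps (10% of the block count) can exceed the number of updates actually taken and the cosine decay may never begin. A step-accurate sketch, assuming single-device training with the hyperparameters used above:

import math

# Assumed to run where train_dataset and the hyperparameters above are in scope.
per_device_batch_size = 4
grad_accum_steps = 8
epochs = 1.0

steps_per_epoch = math.ceil(len(train_dataset) / (per_device_batch_size * grad_accum_steps))
total_optimizer_steps = int(steps_per_epoch * epochs)
warmup_steps = max(1, int(0.1 * total_optimizer_steps))  # ~10% of the real optimizer steps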
vocab.json ADDED
The diff for this file is too large to render.