fine
- CustomBERTModel.py +33 -0
- Untitled.ipynb +0 -0
- __pycache__/metrics.cpython-312.pyc +0 -0
- __pycache__/recalibration.cpython-312.pyc +0 -0
- __pycache__/visualization.cpython-312.pyc +0 -0
- app.py +48 -0
- data_preprocessor.py +170 -0
- hint_fine_tuning.py +382 -0
- main.py +322 -0
- metrics.py +149 -0
- new_fine_tuning/README.md +197 -0
- new_fine_tuning/__pycache__/metrics.cpython-312.pyc +0 -0
- new_fine_tuning/__pycache__/recalibration.cpython-312.pyc +0 -0
- new_fine_tuning/__pycache__/visualization.cpython-312.pyc +0 -0
- new_hint_fine_tuned.py +131 -0
- new_test_saved_finetuned_model.py +613 -0
- plot.png +0 -0
- prepare_pretraining_input_vocab_file.py +0 -0
- ratio_proportion_change3_2223/sch_largest_100-coded/pretraining/vocab.txt +34 -0
- recalibration.py +82 -0
- src/__pycache__/attention.cpython-312.pyc +0 -0
- src/__pycache__/bert.cpython-312.pyc +0 -0
- src/__pycache__/classifier_model.cpython-312.pyc +0 -0
- src/__pycache__/dataset.cpython-312.pyc +0 -0
- src/__pycache__/embedding.cpython-312.pyc +0 -0
- src/__pycache__/seq_model.cpython-312.pyc +0 -0
- src/__pycache__/transformer.cpython-312.pyc +0 -0
- src/__pycache__/transformer_component.cpython-312.pyc +0 -0
- src/__pycache__/vocab.cpython-312.pyc +0 -0
- src/attention.py +21 -1
- src/bert.py +35 -0
- src/classifier_model.py +52 -1
- src/dataset.py +385 -0
- src/pretrainer.py +713 -0
- src/reference_code/bert_reference_code.py +1622 -0
- src/reference_code/evaluate_embeddings.py +136 -0
- src/reference_code/metrics.py +149 -0
- src/reference_code/pretrainer-old.py +696 -0
- src/reference_code/test.py +493 -0
- src/reference_code/utils.py +369 -0
- src/reference_code/visualization.py +78 -0
- src/seq_model.py +15 -0
- src/transformer.py +11 -0
- src/vocab.py +17 -0
- test.py +8 -0
- test.txt +0 -0
- test_hint_fine_tuned.py +45 -0
- test_saved_model.py +234 -0
- visualization.py +78 -0
CustomBERTModel.py
ADDED
@@ -0,0 +1,33 @@

import torch
import torch.nn as nn
from src.bert import BERT

class CustomBERTModel(nn.Module):
    def __init__(self, vocab_size, output_dim, pre_trained_model_path):
        super(CustomBERTModel, self).__init__()
        hidden_size = 768
        self.bert = BERT(vocab_size=vocab_size, hidden=hidden_size, n_layers=4, attn_heads=8, dropout=0.1)

        # Load the pre-trained model's state_dict
        checkpoint = torch.load(pre_trained_model_path, map_location=torch.device('cpu'))
        if isinstance(checkpoint, dict):
            self.bert.load_state_dict(checkpoint)
        else:
            raise TypeError(f"Expected state_dict, got {type(checkpoint)} instead.")

        # Fully connected layer with input size 768 (matching BERT hidden size)
        self.fc = nn.Linear(hidden_size, output_dim)

    def forward(self, sequence, segment_info):
        sequence = sequence.to(next(self.parameters()).device)
        segment_info = segment_info.to(sequence.device)

        x = self.bert(sequence, segment_info)
        print(f"BERT output shape: {x.shape}")

        cls_embeddings = x[:, 0]  # Extract CLS token embeddings
        print(f"CLS Embeddings shape: {cls_embeddings.shape}")

        logits = self.fc(cls_embeddings)  # Pass tensor of size (batch_size, 768) to the fully connected layer

        return logits
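A minimal sketch of how this wrapper might be exercised, assuming a hypothetical checkpoint path and a 1,000-token vocabulary; the checkpoint must come from a BERT with the same hidden=768, n_layers=4, attn_heads=8 configuration, and the random tensors only illustrate the expected input and output shapes.

import torch
from CustomBERTModel import CustomBERTModel

# Hypothetical checkpoint path and sizes; adjust to the actual workspace.
model = CustomBERTModel(
    vocab_size=1000,
    output_dim=2,
    pre_trained_model_path="output/bert_trained.seq_encoder.model.ep68",
)
model.eval()

sequence = torch.randint(0, 1000, (4, 50))   # (batch_size, seq_len) token ids
segment_info = torch.zeros_like(sequence)    # single-segment input

with torch.no_grad():
    logits = model(sequence, segment_info)   # shape (4, 2): one logit per class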
Untitled.ipynb
ADDED
The diff for this file is too large to render.

__pycache__/metrics.cpython-312.pyc
ADDED
Binary file (9.14 kB).

__pycache__/recalibration.cpython-312.pyc
ADDED
Binary file (5.49 kB).

__pycache__/visualization.cpython-312.pyc
ADDED
Binary file (5.27 kB).
app.py
CHANGED
@@ -101,15 +101,48 @@ import shutil
 import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, auc
 # Define the function to process the input file and model selection
+<<<<<<< HEAD
+def process_file(file,label,info, model_name):
+=======
 def process_file(file,label, model_name):
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
     with open(file.name, 'r') as f:
         content = f.read()
     saved_test_dataset = "train.txt"
     saved_test_label = "train_label.txt"
+<<<<<<< HEAD
+    saved_train_info="train_info.txt"
+=======
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896

     # Save the uploaded file content to a specified location
     shutil.copyfile(file.name, saved_test_dataset)
     shutil.copyfile(label.name, saved_test_label)
+<<<<<<< HEAD
+    shutil.copyfile(info.name, saved_train_info)
+    # For demonstration purposes, we'll just return the content with the selected model name
+    # if(model_name=="highGRschool10"):
+    #     checkpoint="ratio_proportion_change3/output/FS/bert_fine_tuned.model.ep32"
+    # elif(model_name=="lowGRschoolAll"):
+    #     checkpoint="ratio_proportion_change3/output/IS/bert_fine_tuned.model.ep14"
+    # elif(model_name=="fullTest"):
+    #     checkpoint="ratio_proportion_change3/output/correctness/bert_fine_tuned.model.ep48"
+    # else:
+    #     checkpoint=None
+
+    # print(checkpoint)
+    subprocess.run([
+        "python", "new_test_saved_finetuned_model.py",
+        "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
+        "-finetune_task", model_name,
+        "-test_dataset_path","../../../../train.txt",
+        # "-test_label_path","../../../../train_label.txt",
+        "-finetuned_bert_classifier_checkpoint",
+        "ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
+        "-e",str(1),
+        "-b",str(5)
+    ], shell=True)
+=======
     # For demonstration purposes, we'll just return the content with the selected model name
     if(model_name=="FS"):
         checkpoint="ratio_proportion_change3/output/FS/bert_fine_tuned.model.ep32"
@@ -126,6 +159,7 @@ def process_file(file,label, model_name):
     subprocess.run(["python", "src/test_saved_model.py",
                     "--finetuned_bert_checkpoint",checkpoint
                     ])
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
     result = {}
     with open("result.txt", 'r') as file:
         for line in file:
@@ -160,7 +194,11 @@ def process_file(file,label, model_name):
     return text_output,plot_path

 # List of models for the dropdown menu
+<<<<<<< HEAD
+models = ["highGRschool10", "lowGRschoolAll", "fullTest"]
+=======
 models = ["FS", "IS", "CORRECTNESS","EFFECTIVENESS"]
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896

 # Create the Gradio interface
 with gr.Blocks(css="""
@@ -350,15 +388,25 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
     with gr.Row():
         file_input = gr.File(label="Upload a test file", file_types=['.txt'], elem_classes="file-box")
         label_input = gr.File(label="Upload test labels", file_types=['.txt'], elem_classes="file-box")
+<<<<<<< HEAD
+        info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
+
+        model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task", elem_classes="dropdown-menu")
+=======

     model_dropdown = gr.Dropdown(choices=models, label="Select Model", elem_classes="dropdown-menu")
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896

     with gr.Row():
         output_text = gr.Textbox(label="Output Text")
         output_image = gr.Image(label="Output Plot")

     btn = gr.Button("Submit")
+<<<<<<< HEAD
+    btn.click(fn=process_file, inputs=[file_input,label_input,info_input, model_dropdown], outputs=[output_text,output_image])
+=======
     btn.click(fn=process_file, inputs=[file_input,label_input, model_dropdown], outputs=[output_text,output_image])
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896

     # Launch the app
     demo.launch()
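Note that both conflict blocks are committed verbatim, so the file as added will not parse until they are resolved. Purely as an illustration, here is roughly what the HEAD side of process_file reduces to once the markers are stripped; every line is taken from the diff above, and the comments flag assumptions rather than observed behaviour.

def process_file(file, label, info, model_name):
    with open(file.name, 'r') as f:
        content = f.read()
    saved_test_dataset = "train.txt"
    saved_test_label = "train_label.txt"
    saved_train_info = "train_info.txt"

    # Copy the uploaded files to the fixed locations read by the test script.
    shutil.copyfile(file.name, saved_test_dataset)
    shutil.copyfile(label.name, saved_test_label)
    shutil.copyfile(info.name, saved_train_info)

    subprocess.run([
        "python", "new_test_saved_finetuned_model.py",
        "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
        "-finetune_task", model_name,
        "-test_dataset_path", "../../../../train.txt",
        "-finetuned_bert_classifier_checkpoint",
        "ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
        "-e", str(1),
        "-b", str(5),
    ])  # the diff passes shell=True with an argument list; on POSIX that would drop the arguments
    # ... the remainder of process_file (reading result.txt, plotting) continues unchanged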
data_preprocessor.py
ADDED
@@ -0,0 +1,170 @@

import time
import pandas as pd

import sys

class DataPreprocessor:
    def __init__(self, input_file_path):
        self.input_file_path = input_file_path
        self.unique_students = None
        self.unique_problems = None
        self.unique_prob_hierarchy = None
        self.unique_steps = None
        self.unique_kcs = None

    def analyze_dataset(self):
        file_iterator = self.load_file_iterator()

        start_time = time.time()
        self.unique_students = {"st"}
        self.unique_problems = {"pr"}
        self.unique_prob_hierarchy = {"ph"}
        self.unique_kcs = {"kc"}
        for chunk_data in file_iterator:
            for student_id, std_groups in chunk_data.groupby('Anon Student Id'):
                self.unique_students.update({student_id})
                prob_hierarchy = std_groups.groupby('Level (Workspace Id)')
                for hierarchy, hierarchy_groups in prob_hierarchy:
                    self.unique_prob_hierarchy.update({hierarchy})
                    prob_name = hierarchy_groups.groupby('Problem Name')
                    for problem_name, prob_name_groups in prob_name:
                        self.unique_problems.update({problem_name})
                        sub_skills = prob_name_groups['KC Model(MATHia)']
                        for a in sub_skills:
                            if str(a) != "nan":
                                temp = a.split("~~")
                                for kc in temp:
                                    self.unique_kcs.update({kc})
        self.unique_students.remove("st")
        self.unique_problems.remove("pr")
        self.unique_prob_hierarchy.remove("ph")
        self.unique_kcs.remove("kc")
        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of Unique Knowledge components ->", len(self.unique_kcs))

    def analyze_dataset_by_section(self, workspace_name):
        file_iterator = self.load_file_iterator()

        start_time = time.time()
        self.unique_students = {"st"}
        self.unique_problems = {"pr"}
        self.unique_prob_hierarchy = {"ph"}
        self.unique_steps = {"s"}
        self.unique_kcs = {"kc"}
        # with open("workspace_info.txt", 'a') as f:
        #     sys.stdout = f
        for chunk_data in file_iterator:
            for student_id, std_groups in chunk_data.groupby('Anon Student Id'):
                prob_hierarchy = std_groups.groupby('Level (Workspace Id)')
                for hierarchy, hierarchy_groups in prob_hierarchy:
                    if workspace_name == hierarchy:
                        # print("Workspace : ", hierarchy)
                        self.unique_students.update({student_id})
                        self.unique_prob_hierarchy.update({hierarchy})
                        prob_name = hierarchy_groups.groupby('Problem Name')
                        for problem_name, prob_name_groups in prob_name:
                            self.unique_problems.update({problem_name})
                            step_names = prob_name_groups['Step Name']
                            sub_skills = prob_name_groups['KC Model(MATHia)']
                            for step in step_names:
                                if str(step) != "nan":
                                    self.unique_steps.update({step})
                            for a in sub_skills:
                                if str(a) != "nan":
                                    temp = a.split("~~")
                                    for kc in temp:
                                        self.unique_kcs.update({kc})
        self.unique_problems.remove("pr")
        self.unique_prob_hierarchy.remove("ph")
        self.unique_steps.remove("s")
        self.unique_kcs.remove("kc")
        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Workspace-> ", workspace_name)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of unique step names ->", len(self.unique_steps))
        print("Length of unique knowledge components ->", len(self.unique_kcs))
        # f.close()
        # sys.stdout = sys.__stdout__

    def analyze_dataset_by_school(self, workspace_name, school_id=None):
        file_iterator = self.load_file_iterator(sep=",")

        start_time = time.time()
        self.unique_schools = set()
        self.unique_class = set()
        self.unique_students = set()
        self.unique_problems = set()
        self.unique_steps = set()
        self.unique_kcs = set()
        self.unique_actions = set()
        self.unique_outcomes = set()
        self.unique_new_steps_w_action_attempt = set()
        self.unique_new_steps_w_kcs = set()
        self.unique_new_steps_w_action_attempt_kcs = set()

        for chunk_data in file_iterator:
            for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
                # if school and school == school_id:
                self.unique_schools.add(school)
                for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
                    self.unique_class.add(class_id)
                    for student_id, std_group in class_group.groupby('Anon Student Id'):
                        self.unique_students.add(student_id)
                        for prob, prob_group in std_group.groupby('Problem Name'):
                            self.unique_problems.add(prob)

                            step_names = set(prob_group['Step Name'])
                            sub_skills = set(prob_group['KC Model(MATHia)'])
                            actions = set(prob_group['Action'])
                            outcomes = set(prob_group['Outcome'])

                            self.unique_steps.update(step_names)
                            self.unique_kcs.update(sub_skills)
                            self.unique_actions.update(actions)
                            self.unique_outcomes.update(outcomes)

                            for step in step_names:
                                if pd.isna(step):
                                    step_group = prob_group[pd.isna(prob_group['Step Name'])]
                                else:
                                    step_group = prob_group[prob_group['Step Name']==step]

                                for kc in set(step_group['KC Model(MATHia)']):
                                    new_step = f"{step}:{kc}"
                                    self.unique_new_steps_w_kcs.add(new_step)

                                for action, action_group in step_group.groupby('Action'):
                                    for attempt, attempt_group in action_group.groupby('Attempt At Step'):
                                        new_step = f"{step}:{action}:{attempt}"
                                        self.unique_new_steps_w_action_attempt.add(new_step)

                                        for kc in set(attempt_group["KC Model(MATHia)"]):
                                            new_step = f"{step}:{action}:{attempt}:{kc}"
                                            self.unique_new_steps_w_action_attempt_kcs.add(new_step)

        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Workspace-> ", workspace_name)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique classes->", len(self.unique_class))
        print("Length of unique step names ->", len(self.unique_steps))
        print("Length of unique knowledge components ->", len(self.unique_kcs))
        print("Length of unique actions ->", len(self.unique_actions))
        print("Length of unique outcomes ->", len(self.unique_outcomes))
        print("Length of unique new step names with actions and attempts ->", len(self.unique_new_steps_w_action_attempt))
        print("Length of unique new step names with actions, attempts and kcs ->", len(self.unique_new_steps_w_action_attempt_kcs))
        print("Length of unique new step names with kcs ->", len(self.unique_new_steps_w_kcs))

    def load_file_iterator(self, sep="\t"):
        chunk_iterator = pd.read_csv(self.input_file_path, sep=sep, header=0, iterator=True, chunksize=1000000)
        return chunk_iterator
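A short usage sketch, assuming a tab-separated MATHia transaction export at a hypothetical path containing the columns referenced above ('Anon Student Id', 'Level (Workspace Id)', 'Problem Name', 'Step Name', 'KC Model(MATHia)'), plus a comma-separated, school-coded export for the school-level variant.

from data_preprocessor import DataPreprocessor

# Hypothetical export path; adjust to the real dataset location.
preprocessor = DataPreprocessor("data/mathia_transactions.tsv")

# Corpus-wide counts of students, problems, workspaces and knowledge components.
preprocessor.analyze_dataset()

# The same counts restricted to a single workspace.
preprocessor.analyze_dataset_by_section("ratio_proportion_change3_2223")

# The comma-separated export with 'CF (Anon School Id)' / 'CF (Anon Class Id)' columns
# goes through the school-level variant instead.
school_prep = DataPreprocessor("data/mathia_transactions_by_school.csv")
school_prep.analyze_dataset_by_school("ratio_proportion_change3_2223")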
hint_fine_tuning.py
ADDED
@@ -0,0 +1,382 @@

import argparse
import os
import sys
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, TensorDataset
from src.dataset import TokenizerDataset
from src.bert import BERT
from src.pretrainer import BERTFineTuneTrainer1
from src.vocab import Vocab
import pandas as pd


# class CustomBERTModel(nn.Module):
#     def __init__(self, vocab_size, output_dim, pre_trained_model_path):
#         super(CustomBERTModel, self).__init__()
#         hidden_size = 768
#         self.bert = BERT(vocab_size=vocab_size, hidden=hidden_size, n_layers=12, attn_heads=12, dropout=0.1)
#         checkpoint = torch.load(pre_trained_model_path, map_location=torch.device('cpu'))
#         if isinstance(checkpoint, dict):
#             self.bert.load_state_dict(checkpoint)
#         elif isinstance(checkpoint, BERT):
#             self.bert = checkpoint
#         else:
#             raise TypeError(f"Expected state_dict or BERT instance, got {type(checkpoint)} instead.")
#         self.fc = nn.Linear(hidden_size, output_dim)

#     def forward(self, sequence, segment_info):
#         sequence = sequence.to(next(self.parameters()).device)
#         segment_info = segment_info.to(sequence.device)

#         if sequence.size(0) == 0 or sequence.size(1) == 0:
#             raise ValueError("Input sequence tensor has 0 elements. Check data preprocessing.")

#         x = self.bert(sequence, segment_info)
#         print(f"BERT output shape: {x.shape}")

#         if x.size(0) == 0 or x.size(1) == 0:
#             raise ValueError("BERT output tensor has 0 elements. Check input dimensions.")

#         cls_embeddings = x[:, 0]
#         logits = self.fc(cls_embeddings)
#         return logits

# class CustomBERTModel(nn.Module):
#     def __init__(self, vocab_size, output_dim, pre_trained_model_path):
#         super(CustomBERTModel, self).__init__()
#         hidden_size = 764 # Ensure this is 768
#         self.bert = BERT(vocab_size=vocab_size, hidden=hidden_size, n_layers=12, attn_heads=12, dropout=0.1)

#         # Load the pre-trained model's state_dict
#         checkpoint = torch.load(pre_trained_model_path, map_location=torch.device('cpu'))
#         if isinstance(checkpoint, dict):
#             self.bert.load_state_dict(checkpoint)
#         else:
#             raise TypeError(f"Expected state_dict, got {type(checkpoint)} instead.")

#         # Fully connected layer with input size 768
#         self.fc = nn.Linear(hidden_size, output_dim)

#     def forward(self, sequence, segment_info):
#         sequence = sequence.to(next(self.parameters()).device)
#         segment_info = segment_info.to(sequence.device)

#         x = self.bert(sequence, segment_info)
#         print(f"BERT output shape: {x.shape}")  # Should output (batch_size, seq_len, 768)

#         cls_embeddings = x[:, 0]  # Extract CLS token embeddings
#         print(f"CLS Embeddings shape: {cls_embeddings.shape}")  # Should output (batch_size, 768)

#         logits = self.fc(cls_embeddings)  # Should now pass a tensor of size (batch_size, 768) to `fc`

#         return logits


# for test
class CustomBERTModel(nn.Module):
    def __init__(self, vocab_size, output_dim, pre_trained_model_path):
        super(CustomBERTModel, self).__init__()
        self.hidden = 764  # Ensure this is defined correctly
        self.bert = BERT(vocab_size=vocab_size, hidden=self.hidden, n_layers=12, attn_heads=12, dropout=0.1)

        # Load the pre-trained model's state_dict
        checkpoint = torch.load(pre_trained_model_path, map_location=torch.device('cpu'))
        if isinstance(checkpoint, dict):
            self.bert.load_state_dict(checkpoint)
        else:
            raise TypeError(f"Expected state_dict, got {type(checkpoint)} instead.")

        self.fc = nn.Linear(self.hidden, output_dim)

    def forward(self, sequence, segment_info):
        x = self.bert(sequence, segment_info)
        cls_embeddings = x[:, 0]  # Extract CLS token embeddings
        logits = self.fc(cls_embeddings)  # Pass to fully connected layer
        return logits

def preprocess_labels(label_csv_path):
    try:
        labels_df = pd.read_csv(label_csv_path)
        labels = labels_df['last_hint_class'].values.astype(int)
        return torch.tensor(labels, dtype=torch.long)
    except Exception as e:
        print(f"Error reading dataset file: {e}")
        return None


def preprocess_data(data_path, vocab, max_length=128):
    try:
        with open(data_path, 'r') as f:
            sequences = f.readlines()
    except Exception as e:
        print(f"Error reading data file: {e}")
        return None, None

    if len(sequences) == 0:
        raise ValueError(f"No sequences found in data file {data_path}. Check the file content.")

    tokenized_sequences = []

    for sequence in sequences:
        sequence = sequence.strip()
        if sequence:
            encoded = vocab.to_seq(sequence, seq_len=max_length)
            encoded = encoded[:max_length] + [vocab.vocab.get('[PAD]', 0)] * (max_length - len(encoded))
            segment_label = [0] * max_length

            tokenized_sequences.append({
                'input_ids': torch.tensor(encoded),
                'segment_label': torch.tensor(segment_label)
            })

    if not tokenized_sequences:
        raise ValueError("Tokenization resulted in an empty list. Check the sequences and tokenization logic.")

    tokenized_sequences = [t for t in tokenized_sequences if len(t['input_ids']) == max_length]

    if not tokenized_sequences:
        raise ValueError("All tokenized sequences are of unexpected length. This suggests an issue with the tokenization logic.")

    input_ids = torch.cat([t['input_ids'].unsqueeze(0) for t in tokenized_sequences], dim=0)
    segment_labels = torch.cat([t['segment_label'].unsqueeze(0) for t in tokenized_sequences], dim=0)

    print(f"Input IDs shape: {input_ids.shape}")
    print(f"Segment labels shape: {segment_labels.shape}")

    return input_ids, segment_labels


def collate_fn(batch):
    inputs = []
    labels = []
    segment_labels = []

    for item in batch:
        if item is None:
            continue

        if isinstance(item, dict):
            inputs.append(item['input_ids'].unsqueeze(0))
            labels.append(item['label'].unsqueeze(0))
            segment_labels.append(item['segment_label'].unsqueeze(0))

    if len(inputs) == 0 or len(segment_labels) == 0:
        print("Empty batch encountered. Returning None to skip this batch.")
        return None

    try:
        inputs = torch.cat(inputs, dim=0)
        labels = torch.cat(labels, dim=0)
        segment_labels = torch.cat(segment_labels, dim=0)
    except Exception as e:
        print(f"Error concatenating tensors: {e}")
        return None

    return {
        'input': inputs,
        'label': labels,
        'segment_label': segment_labels
    }

def custom_collate_fn(batch):
    processed_batch = collate_fn(batch)

    if processed_batch is None or len(processed_batch['input']) == 0:
        # Return a valid batch with at least one element instead of an empty one
        return {
            'input': torch.zeros((1, 128), dtype=torch.long),
            'label': torch.zeros((1,), dtype=torch.long),
            'segment_label': torch.zeros((1, 128), dtype=torch.long)
        }

    return processed_batch


def train_without_progress_status(trainer, epoch, shuffle):
    for epoch_idx in range(epoch):
        print(f"EP_train:{epoch_idx}:")
        for batch in trainer.train_data:
            if batch is None:
                continue

            # Check if batch is a string (indicating an issue)
            if isinstance(batch, str):
                print(f"Error: Received a string instead of a dictionary in batch: {batch}")
                raise ValueError(f"Unexpected string in batch: {batch}")

            # Validate the batch structure before passing to iteration
            if isinstance(batch, dict):
                # Verify that all expected keys are present and that the values are tensors
                if all(key in batch for key in ['input_ids', 'segment_label', 'labels']):
                    if all(isinstance(batch[key], torch.Tensor) for key in batch):
                        try:
                            print(f"Batch Structure: {batch}")  # Debugging batch before iteration
                            trainer.iteration(epoch_idx, batch)
                        except Exception as e:
                            print(f"Error during batch processing: {e}")
                            sys.stdout.flush()
                            raise e  # Propagate the exception for better debugging
                    else:
                        print(f"Error: Expected all values in batch to be tensors, but got: {batch}")
                        raise ValueError("Batch contains non-tensor values.")
                else:
                    print(f"Error: Batch missing expected keys. Batch keys: {batch.keys()}")
                    raise ValueError("Batch does not contain expected keys.")
            else:
                print(f"Error: Expected batch to be a dictionary but got {type(batch)} instead.")
                raise ValueError(f"Invalid batch structure: {batch}")

# def main(opt):
#     # device = torch.device("cpu")
#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#     vocab = Vocab(opt.vocab_file)
#     vocab.load_vocab()

#     input_ids, segment_labels = preprocess_data(opt.data_path, vocab, max_length=128)
#     labels = preprocess_labels(opt.dataset)

#     if input_ids is None or segment_labels is None or labels is None:
#         print("Error in preprocessing data. Exiting.")
#         return

#     dataset = TensorDataset(input_ids, segment_labels, torch.tensor(labels, dtype=torch.long))
#     val_size = len(dataset) - int(0.8 * len(dataset))
#     val_dataset, train_dataset = random_split(dataset, [val_size, len(dataset) - val_size])

#     train_dataloader = DataLoader(
#         train_dataset,
#         batch_size=32,
#         shuffle=True,
#         collate_fn=custom_collate_fn
#     )
#     val_dataloader = DataLoader(
#         val_dataset,
#         batch_size=32,
#         shuffle=False,
#         collate_fn=custom_collate_fn
#     )

#     custom_model = CustomBERTModel(
#         vocab_size=len(vocab.vocab),
#         output_dim=2,
#         pre_trained_model_path=opt.pre_trained_model_path
#     ).to(device)

#     trainer = BERTFineTuneTrainer1(
#         bert=custom_model.bert,
#         vocab_size=len(vocab.vocab),
#         train_dataloader=train_dataloader,
#         test_dataloader=val_dataloader,
#         lr=5e-5,
#         num_labels=2,
#         with_cuda=torch.cuda.is_available(),
#         log_freq=10,
#         workspace_name=opt.output_dir,
#         log_folder_path=opt.log_folder_path
#     )

#     trainer.train(epoch=20)

#     # os.makedirs(opt.output_dir, exist_ok=True)
#     # output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model.pth')
#     # torch.save(custom_model.state_dict(), output_model_file)
#     # print(f'Model saved to {output_model_file}')

#     os.makedirs(opt.output_dir, exist_ok=True)
#     output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model_2.pth')
#     torch.save(custom_model, output_model_file)
#     print(f'Model saved to {output_model_file}')


def main(opt):
    # Set device to GPU if available, otherwise use CPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(torch.cuda.is_available())  # Should return True if GPU is available
    print(torch.cuda.device_count())

    # Load vocabulary
    vocab = Vocab(opt.vocab_file)
    vocab.load_vocab()

    # Preprocess data and labels
    input_ids, segment_labels = preprocess_data(opt.data_path, vocab, max_length=128)
    labels = preprocess_labels(opt.dataset)

    if input_ids is None or segment_labels is None or labels is None:
        print("Error in preprocessing data. Exiting.")
        return

    # Transfer tensors to the correct device (GPU/CPU)
    input_ids = input_ids.to(device)
    segment_labels = segment_labels.to(device)
    labels = torch.tensor(labels, dtype=torch.long).to(device)

    # Create TensorDataset and split into train and validation sets
    dataset = TensorDataset(input_ids, segment_labels, labels)
    val_size = len(dataset) - int(0.8 * len(dataset))
    val_dataset, train_dataset = random_split(dataset, [val_size, len(dataset) - val_size])

    # Create DataLoaders for training and validation
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=custom_collate_fn
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=custom_collate_fn
    )

    # Initialize custom BERT model and move it to the device
    custom_model = CustomBERTModel(
        vocab_size=len(vocab.vocab),
        output_dim=2,
        pre_trained_model_path=opt.pre_trained_model_path
    ).to(device)

    # Initialize the fine-tuning trainer
    trainer = BERTFineTuneTrainer1(
        bert=custom_model.bert,
        vocab_size=len(vocab.vocab),
        train_dataloader=train_dataloader,
        test_dataloader=val_dataloader,
        lr=5e-5,
        num_labels=2,
        with_cuda=torch.cuda.is_available(),
        log_freq=10,
        workspace_name=opt.output_dir,
        log_folder_path=opt.log_folder_path
    )

    # Train the model
    trainer.train(epoch=20)

    # Save the model to the specified output directory
    # os.makedirs(opt.output_dir, exist_ok=True)
    # output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model_2.pth')
    # torch.save(custom_model.state_dict(), output_model_file)
    # print(f'Model saved to {output_model_file}')
    os.makedirs(opt.output_dir, exist_ok=True)
    output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model_2.pth')
    torch.save(custom_model, output_model_file)
    print(f'Model saved to {output_model_file}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fine-tune BERT model.')
    parser.add_argument('--dataset', type=str, default='/home/jupyter/bert/dataset/hint_based/ratio_proportion_change_3/er/er_train.csv', help='Path to the dataset file.')
    parser.add_argument('--data_path', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/gt/er.txt', help='Path to the input sequence file.')
    parser.add_argument('--output_dir', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/output/hint_classification', help='Directory to save the fine-tuned model.')
    parser.add_argument('--pre_trained_model_path', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/output/pretrain:1800ms:64hs:4l:8a:50s:64b:1000e:-5lr/bert_trained.seq_encoder.model.ep68', help='Path to the pre-trained BERT model.')
    parser.add_argument('--vocab_file', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/pretraining/vocab.txt', help='Path to the vocabulary file.')
    parser.add_argument('--log_folder_path', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/logs/oct_logs', help='Path to the folder for saving logs.')

    opt = parser.parse_args()
    main(opt)
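Because the script saves the entire module object (torch.save(custom_model, ...)) rather than a state_dict, reloading it needs the CustomBERTModel class and the src package available in the unpickling environment. A minimal sketch, assuming the default --output_dir above; on recent PyTorch versions a full-module pickle may additionally require weights_only=False.

import torch
import hint_fine_tuning  # makes CustomBERTModel and its dependencies importable for unpickling

# Path follows the --output_dir default above; adjust to the actual run.
model = torch.load(
    "/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/output/hint_classification/fine_tuned_model_2.pth",
    map_location="cpu",
)
model.eval()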
main.py
ADDED
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
from torch.utils.data import DataLoader
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
|
7 |
+
from src.bert import BERT
|
8 |
+
from src.pretrainer import BERTTrainer, BERTFineTuneTrainer, BERTAttention
|
9 |
+
from src.dataset import PretrainerDataset, TokenizerDataset
|
10 |
+
from src.vocab import Vocab
|
11 |
+
|
12 |
+
import time
|
13 |
+
import os
|
14 |
+
import tqdm
|
15 |
+
import pickle
|
16 |
+
|
17 |
+
def train():
|
18 |
+
parser = argparse.ArgumentParser()
|
19 |
+
|
20 |
+
parser.add_argument('-workspace_name', type=str, default=None)
|
21 |
+
parser.add_argument('-code', type=str, default=None, help="folder for pretraining outputs and logs")
|
22 |
+
parser.add_argument('-finetune_task', type=str, default=None, help="folder inside finetuning")
|
23 |
+
parser.add_argument("-attention", type=bool, default=False, help="analyse attention scores")
|
24 |
+
parser.add_argument("-diff_test_folder", type=bool, default=False, help="use for different test folder")
|
25 |
+
parser.add_argument("-embeddings", type=bool, default=False, help="get and analyse embeddings")
|
26 |
+
parser.add_argument('-embeddings_file_name', type=str, default=None, help="file name of embeddings")
|
27 |
+
parser.add_argument("-pretrain", type=bool, default=False, help="pretraining: true, or false")
|
28 |
+
# parser.add_argument('-opts', nargs='+', type=str, default=None, help='List of optional steps')
|
29 |
+
parser.add_argument("-max_mask", type=int, default=0.15, help="% of input tokens selected for masking")
|
30 |
+
# parser.add_argument("-p", "--pretrain_dataset", type=str, default="pretraining/pretrain.txt", help="pretraining dataset for bert")
|
31 |
+
# parser.add_argument("-pv", "--pretrain_val_dataset", type=str, default="pretraining/test.txt", help="pretraining validation dataset for bert")
|
32 |
+
# default="finetuning/test.txt",
|
33 |
+
parser.add_argument("-vocab_path", type=str, default="pretraining/vocab.txt", help="built vocab model path with bert-vocab")
|
34 |
+
|
35 |
+
parser.add_argument("-train_dataset_path", type=str, default="train.txt", help="fine tune train dataset for progress classifier")
|
36 |
+
parser.add_argument("-val_dataset_path", type=str, default="val.txt", help="test set for evaluate fine tune train set")
|
37 |
+
parser.add_argument("-test_dataset_path", type=str, default="test.txt", help="test set for evaluate fine tune train set")
|
38 |
+
parser.add_argument("-num_labels", type=int, default=2, help="Number of labels")
|
39 |
+
parser.add_argument("-train_label_path", type=str, default="train_label.txt", help="fine tune train dataset for progress classifier")
|
40 |
+
parser.add_argument("-val_label_path", type=str, default="val_label.txt", help="test set for evaluate fine tune train set")
|
41 |
+
parser.add_argument("-test_label_path", type=str, default="test_label.txt", help="test set for evaluate fine tune train set")
|
42 |
+
##### change Checkpoint for finetuning
|
43 |
+
parser.add_argument("-pretrained_bert_checkpoint", type=str, default=None, help="checkpoint of saved pretrained bert model") #."output_feb09/bert_trained.model.ep40"
|
44 |
+
parser.add_argument('-check_epoch', type=int, default=None)
|
45 |
+
|
46 |
+
parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
|
47 |
+
parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
|
48 |
+
parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
|
49 |
+
parser.add_argument("-s", "--seq_len", type=int, default=50, help="maximum sequence length")
|
50 |
+
|
51 |
+
parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
|
52 |
+
parser.add_argument("-e", "--epochs", type=int, default=50)#1501, help="number of epochs") #501
|
53 |
+
# Use 50 for pretrain, and 10 for fine tune
|
54 |
+
parser.add_argument("-w", "--num_workers", type=int, default=4, help="dataloader worker size")
|
55 |
+
|
56 |
+
# Later run with cuda
|
57 |
+
parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
|
58 |
+
parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
|
59 |
+
# parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
|
60 |
+
parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
|
61 |
+
# parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
|
62 |
+
|
63 |
+
parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
|
64 |
+
parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
|
65 |
+
parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
|
66 |
+
parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
|
67 |
+
parser.add_argument("--adam_beta2", type=float, default=0.98, help="adam first beta value") #0.999
|
68 |
+
|
69 |
+
parser.add_argument("-o", "--output_path", type=str, default="bert_trained.seq_encoder.model", help="ex)output/bert.model")
|
70 |
+
# parser.add_argument("-o", "--output_path", type=str, default="output/bert_fine_tuned.model", help="ex)output/bert.model")
|
71 |
+
|
72 |
+
args = parser.parse_args()
|
73 |
+
for k,v in vars(args).items():
|
74 |
+
if 'path' in k:
|
75 |
+
if v:
|
76 |
+
if k == "output_path":
|
77 |
+
if args.code:
|
78 |
+
setattr(args, f"{k}", args.workspace_name+f"/output/{args.code}/"+v)
|
79 |
+
elif args.finetune_task:
|
80 |
+
setattr(args, f"{k}", args.workspace_name+f"/output/{args.finetune_task}/"+v)
|
81 |
+
else:
|
82 |
+
setattr(args, f"{k}", args.workspace_name+"/output/"+v)
|
83 |
+
elif k != "vocab_path":
|
84 |
+
if args.pretrain:
|
85 |
+
setattr(args, f"{k}", args.workspace_name+"/pretraining/"+v)
|
86 |
+
else:
|
87 |
+
if args.code:
|
88 |
+
setattr(args, f"{k}", args.workspace_name+f"/{args.code}/"+v)
|
89 |
+
elif args.finetune_task:
|
90 |
+
if args.diff_test_folder and "test" in k:
|
91 |
+
setattr(args, f"{k}", args.workspace_name+f"/finetuning/"+v)
|
92 |
+
else:
|
93 |
+
setattr(args, f"{k}", args.workspace_name+f"/finetuning/{args.finetune_task}/"+v)
|
94 |
+
else:
|
95 |
+
setattr(args, f"{k}", args.workspace_name+"/finetuning/"+v)
|
96 |
+
else:
|
97 |
+
setattr(args, f"{k}", args.workspace_name+"/"+v)
|
98 |
+
|
99 |
+
print(f"args.{k} : {getattr(args, f'{k}')}")
|
100 |
+
|
101 |
+
print("Loading Vocab", args.vocab_path)
|
102 |
+
vocab_obj = Vocab(args.vocab_path)
|
103 |
+
vocab_obj.load_vocab()
|
104 |
+
print("Vocab Size: ", len(vocab_obj.vocab))
|
105 |
+
|
106 |
+
if args.attention:
|
107 |
+
print(f"Attention aggregate...... code: {args.code}, dataset: {args.finetune_task}")
|
108 |
+
if args.code:
|
109 |
+
new_folder = f"{args.workspace_name}/plots/{args.code}/"
|
110 |
+
if not os.path.exists(new_folder):
|
111 |
+
os.makedirs(new_folder)
|
112 |
+
|
113 |
+
train_dataset = TokenizerDataset(args.train_dataset_path, None, vocab_obj, seq_len=args.seq_len)
|
114 |
+
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
|
115 |
+
print("Load Pre-trained BERT model")
|
116 |
+
cuda_condition = torch.cuda.is_available() and args.with_cuda
|
117 |
+
device = torch.device("cuda:0" if cuda_condition else "cpu")
|
118 |
+
bert = torch.load(args.pretrained_bert_checkpoint, map_location=device)
|
119 |
+
trainer = BERTAttention(bert, vocab_obj, train_dataloader = train_data_loader, workspace_name = args.workspace_name, code=args.code, finetune_task = args.finetune_task)
|
120 |
+
trainer.getAttention()
|
121 |
+
|
122 |
+
elif args.embeddings:
|
123 |
+
print("Get embeddings... and cluster... ")
|
124 |
+
train_dataset = TokenizerDataset(args.test_dataset_path, None, vocab_obj, seq_len=args.seq_len)
|
125 |
+
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
|
126 |
+
print("Load Pre-trained BERT model")
|
127 |
+
cuda_condition = torch.cuda.is_available() and args.with_cuda
|
128 |
+
device = torch.device("cuda:0" if cuda_condition else "cpu")
|
129 |
+
bert = torch.load(args.pretrained_bert_checkpoint).to(device)
|
130 |
+
available_gpus = list(range(torch.cuda.device_count()))
|
131 |
+
if torch.cuda.device_count() > 1:
|
132 |
+
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
133 |
+
bert = nn.DataParallel(bert, device_ids=available_gpus)
|
134 |
+
|
135 |
+
data_iter = tqdm.tqdm(enumerate(train_data_loader),
|
136 |
+
desc="Model: %s" % (args.pretrained_bert_checkpoint.split("/")[-1]),
|
137 |
+
total=len(train_data_loader), bar_format="{l_bar}{r_bar}")
|
138 |
+
all_embeddings = []
|
139 |
+
for i, data in data_iter:
|
140 |
+
data = {key: value.to(device) for key, value in data.items()}
|
141 |
+
embedding = bert(data["input"], data["segment_label"])
|
142 |
+
# print(embedding.shape, embedding[:, 0].shape)
|
143 |
+
embeddings = [h for h in embedding[:,0].cpu().detach().numpy()]
|
144 |
+
all_embeddings.extend(embeddings)
|
145 |
+
|
146 |
+
new_emb_folder = f"{args.workspace_name}/embeddings"
|
147 |
+
if not os.path.exists(new_emb_folder):
|
148 |
+
os.makedirs(new_emb_folder)
|
149 |
+
pickle.dump(all_embeddings, open(f"{new_emb_folder}/{args.embeddings_file_name}.pkl", "wb"))
|
150 |
+
else:
|
151 |
+
if args.pretrain:
|
152 |
+
print("Pre-training......")
|
153 |
+
print("Loading Pretraining Train Dataset", args.train_dataset_path)
|
154 |
+
print(f"Workspace: {args.workspace_name}")
|
155 |
+
pretrain_dataset = PretrainerDataset(args.train_dataset_path, vocab_obj, seq_len=args.seq_len, max_mask = args.max_mask)
|
156 |
+
|
157 |
+
print("Loading Pretraining Validation Dataset", args.val_dataset_path)
|
158 |
+
pretrain_valid_dataset = PretrainerDataset(args.val_dataset_path, vocab_obj, seq_len=args.seq_len, max_mask = args.max_mask) \
|
159 |
+
if args.val_dataset_path is not None else None
|
160 |
+
|
161 |
+
print("Loading Pretraining Test Dataset", args.test_dataset_path)
|
162 |
+
pretrain_test_dataset = PretrainerDataset(args.test_dataset_path, vocab_obj, seq_len=args.seq_len, max_mask = args.max_mask) \
|
163 |
+
if args.test_dataset_path is not None else None
|
164 |
+
|
165 |
+
print("Creating Dataloader")
|
166 |
+
pretrain_data_loader = DataLoader(pretrain_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
|
167 |
+
pretrain_val_data_loader = DataLoader(pretrain_valid_dataset, batch_size=args.batch_size, num_workers=args.num_workers)\
|
168 |
+
if pretrain_valid_dataset is not None else None
|
169 |
+
pretrain_test_data_loader = DataLoader(pretrain_test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)\
|
170 |
+
if pretrain_test_dataset is not None else None
|
171 |
+
|
172 |
+
print("Building BERT model")
|
173 |
+
bert = BERT(len(vocab_obj.vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads, dropout=args.dropout)
|
174 |
+
|
175 |
+
if args.pretrained_bert_checkpoint:
|
176 |
+
print(f"BERT model : {args.pretrained_bert_checkpoint}")
|
177 |
+
bert = torch.load(args.pretrained_bert_checkpoint)
|
178 |
+
|
179 |
+
new_log_folder = f"{args.workspace_name}/logs"
|
180 |
+
new_output_folder = f"{args.workspace_name}/output"
|
181 |
+
if args.code: # is sent almost all the time
|
182 |
+
new_log_folder = f"{args.workspace_name}/logs/{args.code}"
|
183 |
+
new_output_folder = f"{args.workspace_name}/output/{args.code}"
|
184 |
+
|
185 |
+
if not os.path.exists(new_log_folder):
|
186 |
+
os.makedirs(new_log_folder)
|
187 |
+
if not os.path.exists(new_output_folder):
|
188 |
+
os.makedirs(new_output_folder)
|
189 |
+
|
190 |
+
print(f"Creating BERT Trainer .... masking: True, max_mask: {args.max_mask}")
|
191 |
+
trainer = BERTTrainer(bert, len(vocab_obj.vocab), train_dataloader=pretrain_data_loader,
|
192 |
+
val_dataloader=pretrain_val_data_loader, test_dataloader=pretrain_test_data_loader,
|
193 |
+
lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
|
194 |
+
with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq,
|
195 |
+
log_folder_path=new_log_folder)
|
196 |
+
|
197 |
+
start_time = time.time()
|
198 |
+
print(f'Pretraining Starts, Time: {time.strftime("%D %T", time.localtime(start_time))}')
|
199 |
+
# if need to pretrain from a check-point, need :check_epoch
|
200 |
+
repoch = range(args.check_epoch, args.epochs) if args.check_epoch else range(args.epochs)
|
201 |
+
counter = 0
|
202 |
+
patience = 20
|
203 |
+
for epoch in repoch:
|
204 |
+
print(f'Training Epoch {epoch} Starts, Time: {time.strftime("%D %T", time.localtime(time.time()))}')
|
205 |
+
trainer.train(epoch)
|
206 |
+
print(f'Training Epoch {epoch} Ends, Time: {time.strftime("%D %T", time.localtime(time.time()))} \n')
|
207 |
+
|
208 |
+
if pretrain_val_data_loader is not None:
|
209 |
+
print(f'Validation Epoch {epoch} Starts, Time: {time.strftime("%D %T", time.localtime(time.time()))}')
|
210 |
+
trainer.val(epoch)
|
211 |
+
print(f'Validation Epoch {epoch} Ends, Time: {time.strftime("%D %T", time.localtime(time.time()))} \n')
|
212 |
+
|
213 |
+
if trainer.save_model: # or epoch%10 == 0 and epoch > 4
|
214 |
+
trainer.save(epoch, args.output_path)
|
215 |
+
counter = 0
|
216 |
+
if pretrain_test_data_loader is not None:
|
217 |
+
print(f'Test Epoch {epoch} Starts, Time: {time.strftime("%D %T", time.localtime(time.time()))}')
|
218 |
+
trainer.test(epoch)
|
219 |
+
print(f'Test Epoch {epoch} Ends, Time: {time.strftime("%D %T", time.localtime(time.time()))} \n')
|
220 |
+
else:
|
221 |
+
counter +=1
|
222 |
+
if counter >= patience:
|
223 |
+
print(f"Early stopping at epoch {epoch}")
|
224 |
+
break
|
225 |
+
|
226 |
+
end_time = time.time()
|
227 |
+
print("Time Taken to pretrain model = ", end_time - start_time)
|
228 |
+
print(f'Pretraining Ends, Time: {time.strftime("%D %T", time.localtime(end_time))}')
|
229 |
+
else:
|
230 |
+
print("Fine Tuning......")
|
231 |
+
print("Loading Train Dataset", args.train_dataset_path)
|
232 |
+
train_dataset = TokenizerDataset(args.train_dataset_path, args.train_label_path, vocab_obj, seq_len=args.seq_len)
|
233 |
+
|
234 |
+
# print("Loading Validation Dataset", args.val_dataset_path)
|
235 |
+
    # val_dataset = TokenizerDataset(args.val_dataset_path, args.val_label_path, vocab_obj, seq_len=args.seq_len) \
    #     if args.val_dataset_path is not None else None

    print("Loading Test Dataset", args.test_dataset_path)
    test_dataset = TokenizerDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len) \
        if args.test_dataset_path is not None else None

    print("Creating Dataloader...")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    # val_data_loader = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
    #     if val_dataset is not None else None
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Load Pre-trained BERT model")
    # bert = BERT(len(vocab_obj.vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)
    cuda_condition = torch.cuda.is_available() and args.with_cuda
    device = torch.device("cuda:0" if cuda_condition else "cpu")
    bert = torch.load(args.pretrained_bert_checkpoint, map_location=device)

    # if args.finetune_task == "SL":
    #     if args.workspace_name == "ratio_proportion_change4":
    #         num_labels = 9
    #     elif args.workspace_name == "ratio_proportion_change3":
    #         num_labels = 9
    #     elif args.workspace_name == "scale_drawings_3":
    #         num_labels = 9
    #     elif args.workspace_name == "sales_tax_discounts_two_rates":
    #         num_labels = 3
    #     else:
    #         num_labels = 2
    # # num_labels = 1
    # print(f"Number of Labels : {args.num_labels}")
    new_log_folder = f"{args.workspace_name}/logs"
    new_output_folder = f"{args.workspace_name}/output"
    if args.finetune_task:  # is sent almost all the time
        new_log_folder = f"{args.workspace_name}/logs/{args.finetune_task}"
        new_output_folder = f"{args.workspace_name}/output/{args.finetune_task}"

    if not os.path.exists(new_log_folder):
        os.makedirs(new_log_folder)
    if not os.path.exists(new_output_folder):
        os.makedirs(new_output_folder)

    print("Creating BERT Fine Tune Trainer")
    trainer = BERTFineTuneTrainer(bert, len(vocab_obj.vocab),
                                  train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                                  lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
                                  with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq,
                                  workspace_name=args.workspace_name, num_labels=args.num_labels, log_folder_path=new_log_folder)

    print("Fine-tune training Start....")
    start_time = time.time()
    repoch = range(args.check_epoch, args.epochs) if args.check_epoch else range(args.epochs)
    counter = 0
    patience = 10
    for epoch in repoch:
        print(f'Training Epoch {epoch} Starts, Time: {time.strftime("%D %T", time.localtime(time.time()))}')
        trainer.train(epoch)
        print(f'Training Epoch {epoch} Ends, Time: {time.strftime("%D %T", time.localtime(time.time()))} \n')

        if test_data_loader is not None:
            print(f'Test Epoch {epoch} Starts, Time: {time.strftime("%D %T", time.localtime(time.time()))}')
            trainer.test(epoch)
            # pickle.dump(trainer.probability_list, open(f"{args.workspace_name}/output/aaai/change4_mid_prob_{epoch}.pkl", "wb"))
            print(f'Test Epoch {epoch} Ends, Time: {time.strftime("%D %T", time.localtime(time.time()))} \n')

        # if val_data_loader is not None:
        #     print(f'Validation Epoch {epoch} Starts, Time: {time.strftime("%D %T", time.localtime(time.time()))}')
        #     trainer.val(epoch)
        #     print(f'Validation Epoch {epoch} Ends, Time: {time.strftime("%D %T", time.localtime(time.time()))} \n')

        if trainer.save_model:  # or epoch % 10 == 0
            trainer.save(epoch, args.output_path)
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    end_time = time.time()
    print("Time Taken to fine-tune model = ", end_time - start_time)
    print(f'Fine-tuning Ends, Time: {time.strftime("%D %T", time.localtime(end_time))}')


if __name__ == "__main__":
    train()
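The loop above couples checkpointing with patience-based early stopping: the counter resets whenever the trainer reports an improved (saved) model and training stops after 10 stagnant epochs. A minimal stand-alone sketch of that pattern, with hypothetical run_epoch and improved callables standing in for the trainer, looks like this:

    # Minimal sketch of the patience-based early stopping used above.
    # run_epoch and improved are hypothetical stand-ins for trainer.train / trainer.save_model.
    def fit_with_early_stopping(run_epoch, improved, max_epochs=51, patience=10):
        counter = 0
        for epoch in range(max_epochs):
            run_epoch(epoch)
            if improved():           # analogous to trainer.save_model being True
                counter = 0          # reset patience on improvement
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break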
metrics.py
ADDED
@@ -0,0 +1,149 @@
import numpy as np
from scipy.special import softmax


class CELoss(object):

    def compute_bin_boundaries(self, probabilities=np.array([])):
        # uniform bin spacing
        if probabilities.size == 0:
            bin_boundaries = np.linspace(0, 1, self.n_bins + 1)
            self.bin_lowers = bin_boundaries[:-1]
            self.bin_uppers = bin_boundaries[1:]
        else:
            # size of bins
            bin_n = int(self.n_data / self.n_bins)

            bin_boundaries = np.array([])

            probabilities_sort = np.sort(probabilities)

            for i in range(0, self.n_bins):
                bin_boundaries = np.append(bin_boundaries, probabilities_sort[i * bin_n])
            bin_boundaries = np.append(bin_boundaries, 1.0)

            self.bin_lowers = bin_boundaries[:-1]
            self.bin_uppers = bin_boundaries[1:]

    def get_probabilities(self, output, labels, logits):
        # If not probabilities apply softmax!
        if logits:
            self.probabilities = softmax(output, axis=1)
        else:
            self.probabilities = output

        self.labels = labels
        self.confidences = np.max(self.probabilities, axis=1)
        self.predictions = np.argmax(self.probabilities, axis=1)
        self.accuracies = np.equal(self.predictions, labels)

    def binary_matrices(self):
        idx = np.arange(self.n_data)
        # make matrices of zeros
        pred_matrix = np.zeros([self.n_data, self.n_class])
        label_matrix = np.zeros([self.n_data, self.n_class])
        # self.acc_matrix = np.zeros([self.n_data, self.n_class])
        pred_matrix[idx, self.predictions] = 1
        label_matrix[idx, self.labels] = 1

        self.acc_matrix = np.equal(pred_matrix, label_matrix)

    def compute_bins(self, index=None):
        self.bin_prop = np.zeros(self.n_bins)
        self.bin_acc = np.zeros(self.n_bins)
        self.bin_conf = np.zeros(self.n_bins)
        self.bin_score = np.zeros(self.n_bins)

        if index is None:
            confidences = self.confidences
            accuracies = self.accuracies
        else:
            confidences = self.probabilities[:, index]
            accuracies = self.acc_matrix[:, index]

        for i, (bin_lower, bin_upper) in enumerate(zip(self.bin_lowers, self.bin_uppers)):
            # Calculated |confidence - accuracy| in each bin
            in_bin = np.greater(confidences, bin_lower.item()) * np.less_equal(confidences, bin_upper.item())
            self.bin_prop[i] = np.mean(in_bin)

            if self.bin_prop[i].item() > 0:
                self.bin_acc[i] = np.mean(accuracies[in_bin])
                self.bin_conf[i] = np.mean(confidences[in_bin])
                self.bin_score[i] = np.abs(self.bin_conf[i] - self.bin_acc[i])


class MaxProbCELoss(CELoss):
    def loss(self, output, labels, n_bins=15, logits=True):
        self.n_bins = n_bins
        super().compute_bin_boundaries()
        super().get_probabilities(output, labels, logits)
        super().compute_bins()


# http://people.cs.pitt.edu/~milos/research/AAAI_Calibration.pdf
class ECELoss(MaxProbCELoss):

    def loss(self, output, labels, n_bins=15, logits=True):
        super().loss(output, labels, n_bins, logits)
        return np.dot(self.bin_prop, self.bin_score)


class MCELoss(MaxProbCELoss):

    def loss(self, output, labels, n_bins=15, logits=True):
        super().loss(output, labels, n_bins, logits)
        return np.max(self.bin_score)


# https://arxiv.org/abs/1905.11001
# Overconfidence Loss (Good in high risk applications where confident but wrong predictions can be especially harmful)
class OELoss(MaxProbCELoss):

    def loss(self, output, labels, n_bins=15, logits=True):
        super().loss(output, labels, n_bins, logits)
        return np.dot(self.bin_prop, self.bin_conf * np.maximum(self.bin_conf - self.bin_acc, np.zeros(self.n_bins)))


# https://arxiv.org/abs/1904.01685
class SCELoss(CELoss):

    def loss(self, output, labels, n_bins=15, logits=True):
        sce = 0.0
        self.n_bins = n_bins
        self.n_data = len(output)
        self.n_class = len(output[0])

        super().compute_bin_boundaries()
        super().get_probabilities(output, labels, logits)
        super().binary_matrices()

        for i in range(self.n_class):
            super().compute_bins(i)
            sce += np.dot(self.bin_prop, self.bin_score)

        return sce / self.n_class


class TACELoss(CELoss):

    def loss(self, output, labels, threshold=0.01, n_bins=15, logits=True):
        tace = 0.0
        self.n_bins = n_bins
        self.n_data = len(output)
        self.n_class = len(output[0])

        super().get_probabilities(output, labels, logits)
        self.probabilities[self.probabilities < threshold] = 0
        super().binary_matrices()

        for i in range(self.n_class):
            super().compute_bin_boundaries(self.probabilities[:, i])
            super().compute_bins(i)
            tace += np.dot(self.bin_prop, self.bin_score)

        return tace / self.n_class


# create TACELoss with threshold fixed at 0
class ACELoss(TACELoss):

    def loss(self, output, labels, n_bins=15, logits=True):
        return super().loss(output, labels, 0.0, n_bins, logits)
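For reference, a minimal usage sketch of the calibration metrics defined above, on synthetic logits and labels (the data is made up purely to show the call signatures; only ECELoss and SCELoss come from the code):

    import numpy as np

    rng = np.random.default_rng(0)
    logits = rng.normal(size=(100, 2))       # (n_samples, n_classes) raw scores
    labels = rng.integers(0, 2, size=100)    # integer class labels

    ece = ECELoss().loss(logits, labels, n_bins=15, logits=True)  # expected calibration error
    sce = SCELoss().loss(logits, labels, n_bins=15, logits=True)  # classwise calibration error
    print(f"ECE: {ece:.4f}  SCE: {sce:.4f}")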
new_fine_tuning/README.md
ADDED
@@ -0,0 +1,197 @@
1 |
+
## Pre-training Data
|
2 |
+
|
3 |
+
### ratio_proportion_change3 : Calculating Percent Change and Final Amounts
|
4 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -analyze_dataset_by_section True -workspace_name ratio_proportion_change3 -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -pretrain True -train_file_path pretraining/pretrain1000.txt -train_info_path pretraining/pretrain1000_info.txt -test_file_path pretraining/test1000.txt -test_info_path pretraining/test1000_info.txt
|
5 |
+
|
6 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change3 -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -pretrain True -train_file_path pretraining/pretrain2000.txt -train_info_path pretraining/pretrain2000_info.txt -test_file_path pretraining/test2000.txt -test_info_path pretraining/test2000_info.txt
|
7 |
+
|
8 |
+
#### Test simple
|
9 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change3 -code full -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path full.txt -train_info_path full_info.txt
|
10 |
+
|
11 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change3 -code gt -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path er.txt -train_info_path er_info.txt -test_file_path me.txt -test_info_path me_info.txt
|
12 |
+
|
13 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change3 -code correct -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path correct.txt -train_info_path correct_info.txt -test_file_path incorrect.txt -test_info_path incorrect_info.txt -final_step FinalAnswer
|
14 |
+
|
15 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change3 -code progress -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path graduated.txt -train_info_path graduated_info.txt -test_file_path promoted.txt -test_info_path promoted_info.txt
|
16 |
+
|
17 |
+
### ratio_proportion_change4 : Using Percents and Percent Change
|
18 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -analyze_dataset_by_section True -workspace_name ratio_proportion_change4 -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor NumeratorLabel1 DenominatorLabel1 -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -pretrain True -train_file_path pretraining/pretrain1000.txt -train_info_path pretraining/pretrain1000_info.txt -test_file_path pretraining/test1000.txt -test_info_path pretraining/test1000_info.txt
|
19 |
+
|
20 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change4 -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor NumeratorLabel1 DenominatorLabel1 -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -pretrain True -train_file_path pretraining/pretrain2000.txt -train_info_path pretraining/pretrain2000_info.txt -test_file_path pretraining/test2000.txt -test_info_path pretraining/test2000_info.txt
|
21 |
+
|
22 |
+
#### Test simple
|
23 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change4 -code full -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path full.txt -train_info_path full_info.txt
|
24 |
+
|
25 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change4 -code gt -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path er.txt -train_info_path er_info.txt -test_file_path me.txt -test_info_path me_info.txt
|
26 |
+
|
27 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change4 -code correct -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path correct.txt -train_info_path correct_info.txt -test_file_path incorrect.txt -test_info_path incorrect_info.txt -final_step FinalAnswer
|
28 |
+
|
29 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change4 -code progress -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -train_file_path graduated.txt -train_info_path graduated_info.txt -test_file_path promoted.txt -test_info_path promoted_info.txt
|
30 |
+
|
31 |
+
## Pretraining
|
32 |
+
|
33 |
+
### ratio_proportion_change3 : Calculating Percent Change and Final Amounts
|
34 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3_1920 -code pretrain1000 --pretrain_dataset pretraining/pretrain1000.txt --pretrain_val_dataset pretraining/test1000.txt
|
35 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000 --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt
|
36 |
+
|
37 |
+
#### Test simple models
|
38 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 1 --attn_heads 1
|
39 |
+
|
40 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 1 --attn_heads 2
|
41 |
+
|
42 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 2 --attn_heads 2
|
43 |
+
|
44 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 2 --attn_heads 4
|
45 |
+
|
46 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 4 --attn_heads 4
|
47 |
+
|
48 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 4 --attn_heads 8
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
### ratio_proportion_change4 : Using Percents and Percent Change
|
53 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain1000 --pretrain_dataset pretraining/pretrain1000.txt --pretrain_val_dataset pretraining/test1000.txt
|
54 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain2000 --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt
|
55 |
+
|
56 |
+
#### Test simple models
|
57 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain2000_1l1h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 1 --attn_heads 1
|
58 |
+
|
59 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain2000_1l2h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 1 --attn_heads 2
|
60 |
+
|
61 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain2000_2l2h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 2 --attn_heads 2
|
62 |
+
|
63 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain2000_2l4h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 2 --attn_heads 4
|
64 |
+
|
65 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain2000_4l4h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 4 --attn_heads 4
|
66 |
+
|
67 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -code pretrain2000_4l8h-5lr --pretrain_dataset pretraining/pretrain2000.txt --pretrain_val_dataset pretraining/test2000.txt --layers 4 --attn_heads 8
|
68 |
+
|
69 |
+
|
70 |
+
## Preparing Fine Tuning Data
|
71 |
+
|
72 |
+
### ratio_proportion_change3 : Calculating Percent Change and Final Amounts
|
73 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change3 -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -final_step FinalAnswer
|
74 |
+
|
75 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -finetune_task check2 --train_dataset finetuning/check2/train.txt --test_dataset finetuning/check2/test.txt --train_label finetuning/check2/train_label.txt --test_label finetuning/check2/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/bert_trained.seq_encoder.model.ep279 --epochs 51
|
76 |
+
|
77 |
+
#### Attention Head Check
|
78 |
+
<!-- > PercentChange NumeratorQuantity2 NumeratorQuantity1 DenominatorQuantity1 OptionalTask_1 EquationAnswer NumeratorFactor EquationAnswer NumeratorFactor EquationAnswer NumeratorFactor DenominatorFactor NumeratorFactor DenominatorFactor NumeratorFactor DenominatorFactor FirstRow1:2 FirstRow1:1 FirstRow2:1 FirstRow2:2 FirstRow2:1 SecondRow ThirdRow FinalAnswerDirection ThirdRow FinalAnswer -->
|
79 |
+
|
80 |
+
|
81 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset full/full.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep598 --attention True -finetune_task full;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset gt/er.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep598 --attention True -finetune_task er ;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset gt/me.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep598 --attention True -finetune_task me;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset correct/correct.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep598 --attention True -finetune_task correct ;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset correct/incorrect.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep598 --attention True -finetune_task incorrect;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset progress/graduated.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep598 --attention True -finetune_task graduated;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset progress/promoted.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep598 --attention True -finetune_task promoted
|
82 |
+
|
83 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset full/full.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep823 --attention True -finetune_task full;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset gt/er.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep823 --attention True -finetune_task er;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset gt/me.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep823 --attention True -finetune_task me;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset correct/correct.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep823 --attention True -finetune_task correct;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset correct/incorrect.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep823 --attention True -finetune_task incorrect;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset progress/graduated.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep823 --attention True -finetune_task graduated;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset progress/promoted.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep823 --attention True -finetune_task promoted
|
84 |
+
|
85 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --train_dataset full/full.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l2h-5lr/bert_trained.seq_encoder.model.ep1045 --attention True -finetune_task full;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --train_dataset gt/er.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l2h-5lr/bert_trained.seq_encoder.model.ep1045 --attention True -finetune_task er;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --train_dataset gt/me.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l2h-5lr/bert_trained.seq_encoder.model.ep1045 --attention True -finetune_task me;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --train_dataset correct/correct.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l2h-5lr/bert_trained.seq_encoder.model.ep1045 --attention True -finetune_task correct;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --train_dataset correct/incorrect.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l2h-5lr/bert_trained.seq_encoder.model.ep1045 --attention True -finetune_task incorrect;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --train_dataset progress/graduated.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l2h-5lr/bert_trained.seq_encoder.model.ep1045 --attention True -finetune_task graduated;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l2h-5lr --train_dataset progress/promoted.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l2h-5lr/bert_trained.seq_encoder.model.ep1045 --attention True -finetune_task promoted
|
86 |
+
|
87 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --train_dataset full/full.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l4h-5lr/bert_trained.seq_encoder.model.ep1336 --attention True -finetune_task full;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --train_dataset gt/er.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l4h-5lr/bert_trained.seq_encoder.model.ep1336 --attention True -finetune_task er;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --train_dataset gt/me.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l4h-5lr/bert_trained.seq_encoder.model.ep1336 --attention True -finetune_task me;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --train_dataset correct/correct.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l4h-5lr/bert_trained.seq_encoder.model.ep1336 --attention True -finetune_task correct;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --train_dataset correct/incorrect.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l4h-5lr/bert_trained.seq_encoder.model.ep1336 --attention True -finetune_task incorrect;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --train_dataset progress/graduated.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l4h-5lr/bert_trained.seq_encoder.model.ep1336 --attention True -finetune_task graduated;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_2l4h-5lr --train_dataset progress/promoted.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_2l4h-5lr/bert_trained.seq_encoder.model.ep1336 --attention True -finetune_task promoted
|
88 |
+
|
89 |
+
<!-- > clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset full/full.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep923 --attention True -->
|
90 |
+
|
91 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset full/full.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep871 --attention True -finetune_task full;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset gt/er.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep871 --attention True -finetune_task er;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset gt/me.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep871 --attention True -finetune_task me;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset correct/correct.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep871 --attention True -finetune_task correct;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset correct/incorrect.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep871 --attention True -finetune_task incorrect;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset progress/graduated.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep871 --attention True -finetune_task graduated;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l4h-5lr --train_dataset progress/promoted.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l4h-5lr/bert_trained.seq_encoder.model.ep871 --attention True -finetune_task promoted
|
92 |
+
|
93 |
+
clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset full/full_attn.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task full
|
94 |
+
|
95 |
+
|
96 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset full/full.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task full;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset gt/er.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task er;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset gt/me.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task me;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset correct/correct.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task correct;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset correct/incorrect.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task incorrect;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset progress/graduated.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task graduated;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_4l8h-5lr --train_dataset progress/promoted.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_4l8h-5lr/bert_trained.seq_encoder.model.ep1349 --attention True -finetune_task promoted
|
97 |
+
|
98 |
+
|
99 |
+
<!-- PercentChange NumeratorQuantity2 NumeratorQuantity1 DenominatorQuantity1 OptionalTask_2 FirstRow2:1 FirstRow2:2 FirstRow1:1 SecondRow ThirdRow FinalAnswer FinalAnswerDirection --> me
|
100 |
+
|
101 |
+
<!-- PercentChange NumeratorQuantity2 NumeratorQuantity1 DenominatorQuantity1 OptionalTask_1 DenominatorFactor NumeratorFactor OptionalTask_2 EquationAnswer FirstRow1:1 FirstRow1:2 FirstRow2:2 FirstRow2:1 FirstRow1:2 SecondRow ThirdRow FinalAnswer --> er
|
102 |
+
|
103 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l1h-5lr --train_dataset pretraining/attention_train.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l1h-5lr/bert_trained.seq_encoder.model.ep273 --attention True
|
104 |
+
|
105 |
+
<!-- PercentChange NumeratorQuantity2 NumeratorQuantity1 DenominatorQuantity1 OptionalTask_1 DenominatorFactor NumeratorFactor OptionalTask_2 EquationAnswer FirstRow1:1 FirstRow1:2 FirstRow2:2 FirstRow2:1 FirstRow1:2 SecondRow ThirdRow FinalAnswer -->
|
106 |
+
|
107 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -code pretrain2000_1l2h-5lr --train_dataset pretraining/attention_train.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000_1l2h-5lr/bert_trained.seq_encoder.model.ep1021 --attention True
|
108 |
+
|
109 |
+
|
110 |
+
|
111 |
+
### ratio_proportion_change4 : Using Percents and Percent Change
|
112 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name ratio_proportion_change4 -opt_step1 OptionalTask_1 EquationAnswer NumeratorFactor DenominatorFactor NumeratorLabel1 DenominatorLabel1 -opt_step2 OptionalTask_2 FirstRow1:1 FirstRow1:2 FirstRow2:1 FirstRow2:2 SecondRow ThirdRow -final_step FinalAnswer
|
113 |
+
|
114 |
+
### scale_drawings_3 : Calculating Measurements Using a Scale
|
115 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name scale_drawings_3 -opt_step1 opt1-check opt1-ratio-L-n opt1-ratio-L-d opt1-ratio-R-n opt1-ratio-R-d opt1-me2-top-3 opt1-me2-top-4 opt1-me2-top-2 opt1-me2-top-1 opt1-me2-middle-1 opt1-me2-bottom-1 -opt_step2 opt2-check opt2-ratio-L-n opt2-ratio-L-d opt2-ratio-R-n opt2-ratio-R-d opt2-me2-top-3 opt2-me2-top-4 opt2-me2-top-1 opt2-me2-top-2 opt2-me2-middle-1 opt2-me2-bottom-1 -final_step unk-value1 unk-value2
|
116 |
+
|
117 |
+
### sales_tax_discounts_two_rates : Solving Problems with Both Sales Tax and Discounts
|
118 |
+
> clear;python3 prepare_pretraining_input_vocab_file.py -workspace_name sales_tax_discounts_two_rates -opt_step1 optionalTaskGn salestaxFactor2 discountFactor2 multiplyOrderStatementGn -final_step totalCost1
|
119 |
+
|
120 |
+
|
121 |
+
# Fine Tuning Pre-trained model
|
122 |
+
|
123 |
+
## ratio_proportion_change3 : Calculating Percent Change and Final Amounts
|
124 |
+
> Selected Pretrained model: **ratio_proportion_change3/output/bert_trained.seq_encoder.model.ep279**
|
125 |
+
> New **bert/ratio_proportion_change3/output/pretrain2000/bert_trained.seq_encoder.model.ep731**
|
126 |
+
|
127 |
+
### 10per
|
128 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -finetune_task 10per --train_dataset finetuning/10per/train.txt --test_dataset finetuning/10per/test.txt --train_label finetuning/10per/train_label.txt --test_label finetuning/10per/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000/bert_trained.seq_encoder.model.ep731 --epochs 51
|
129 |
+
|
130 |
+
### IS
|
131 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -finetune_task IS --train_dataset finetuning/IS/train.txt --test_dataset finetuning/FS/train.txt --train_label finetuning/IS/train_label.txt --test_label finetuning/FS/train_label.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000/bert_trained.seq_encoder.model.ep731 --epochs 51
|
132 |
+
|
133 |
+
### FS
|
134 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -finetune_task FS --train_dataset finetuning/FS/train.txt --test_dataset finetuning/IS/train.txt --train_label finetuning/FS/train_label.txt --test_label finetuning/IS/train_label.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/pretrain2000/bert_trained.seq_encoder.model.ep731 --epochs 51
|
135 |
+
|
136 |
+
### correctness
|
137 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -finetune_task correctness --train_dataset finetuning/correctness/train.txt --test_dataset finetuning/correctness/test.txt --train_label finetuning/correctness/train_label.txt --test_label finetuning/correctness/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/bert_trained.seq_encoder.model.ep279 --epochs 51
|
138 |
+
|
139 |
+
### SL
|
140 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -finetune_task SL --train_dataset finetuning/SL/train.txt --test_dataset finetuning/SL/test.txt --train_label finetuning/SL/train_label.txt --test_label finetuning/SL/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/bert_trained.seq_encoder.model.ep279 --epochs 51
|
141 |
+
|
142 |
+
### effectiveness
|
143 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change3 -finetune_task effectiveness --train_dataset finetuning/effectiveness/train.txt --test_dataset finetuning/effectiveness/test.txt --train_label finetuning/effectiveness/train_label.txt --test_label finetuning/effectiveness/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change3/output/bert_trained.seq_encoder.model.ep279 --epochs 51
|
144 |
+
|
145 |
+
|
146 |
+
## ratio_proportion_change4 : Using Percents and Percent Change
|
147 |
+
> Selected Pretrained model: **ratio_proportion_change4/output/bert_trained.seq_encoder.model.ep287**
|
148 |
+
### 10per
|
149 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -finetune_task 10per --train_dataset finetuning/10per/train.txt --test_dataset finetuning/10per/test.txt --train_label finetuning/10per/train_label.txt --test_label finetuning/10per/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change4/output/bert_trained.seq_encoder.model.ep287 --epochs 51
|
150 |
+
|
151 |
+
### IS
|
152 |
+
|
153 |
+
### FS
|
154 |
+
|
155 |
+
### correctness
|
156 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -finetune_task correctness --train_dataset finetuning/correctness/train.txt --test_dataset finetuning/correctness/test.txt --train_label finetuning/correctness/train_label.txt --test_label finetuning/correctness/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change4/output/bert_trained.seq_encoder.model.ep287 --epochs 51
|
157 |
+
|
158 |
+
### SL
|
159 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -finetune_task SL --train_dataset finetuning/SL/train.txt --test_dataset finetuning/SL/test.txt --train_label finetuning/SL/train_label.txt --test_label finetuning/SL/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change4/output/bert_trained.seq_encoder.model.ep287 --epochs 51
|
160 |
+
|
161 |
+
### effectiveness
|
162 |
+
> clear;python3 src/main.py -workspace_name ratio_proportion_change4 -finetune_task effectiveness --train_dataset finetuning/effectiveness/train.txt --test_dataset finetuning/effectiveness/test.txt --train_label finetuning/effectiveness/train_label.txt --test_label finetuning/effectiveness/test_label.txt --pretrained_bert_checkpoint ratio_proportion_change4/output/bert_trained.seq_encoder.model.ep287 --epochs 51
|
163 |
+
|
164 |
+
|
165 |
+
## scale_drawings_3 : Calculating Measurements Using a Scale
|
166 |
+
> Selected Pretrained model: **scale_drawings_3/output/bert_trained.seq_encoder.model.ep252**
|
167 |
+
### 10per
|
168 |
+
> clear;python3 src/main.py -workspace_name scale_drawings_3 -finetune_task 10per --train_dataset finetuning/10per/train.txt --test_dataset finetuning/10per/test.txt --train_label finetuning/10per/train_label.txt --test_label finetuning/10per/test_label.txt --pretrained_bert_checkpoint scale_drawings_3/output/bert_trained.seq_encoder.model.ep252 --epochs 51
|
169 |
+
|
170 |
+
### IS
|
171 |
+
|
172 |
+
### FS
|
173 |
+
|
174 |
+
### correctness
|
175 |
+
> clear;python3 src/main.py -workspace_name scale_drawings_3 -finetune_task correctness --train_dataset finetuning/correctness/train.txt --test_dataset finetuning/correctness/test.txt --train_label finetuning/correctness/train_label.txt --test_label finetuning/correctness/test_label.txt --pretrained_bert_checkpoint scale_drawings_3/output/bert_trained.seq_encoder.model.ep252 --epochs 51
|
176 |
+
|
177 |
+
### SL
|
178 |
+
> clear;python3 src/main.py -workspace_name scale_drawings_3 -finetune_task SL --train_dataset finetuning/SL/train.txt --test_dataset finetuning/SL/test.txt --train_label finetuning/SL/train_label.txt --test_label finetuning/SL/test_label.txt --pretrained_bert_checkpoint scale_drawings_3/output/bert_trained.seq_encoder.model.ep252 --epochs 51
|
179 |
+
|
180 |
+
### effectiveness
|
181 |
+
|
182 |
+
## sales_tax_discounts_two_rates : Solving Problems with Both Sales Tax and Discounts
|
183 |
+
> Selected Pretrained model: **sales_tax_discounts_two_rates/output/bert_trained.seq_encoder.model.ep255**
|
184 |
+
|
185 |
+
### 10per
|
186 |
+
> clear;python3 src/main.py -workspace_name sales_tax_discounts_two_rates -finetune_task 10per --train_dataset finetuning/10per/train.txt --test_dataset finetuning/10per/test.txt --train_label finetuning/10per/train_label.txt --test_label finetuning/10per/test_label.txt --pretrained_bert_checkpoint sales_tax_discounts_two_rates/output/bert_trained.seq_encoder.model.ep255 --epochs 51
|
187 |
+
|
188 |
+
### IS
|
189 |
+
|
190 |
+
### FS
|
191 |
+
|
192 |
+
### correctness
|
193 |
+
> clear;python3 src/main.py -workspace_name sales_tax_discounts_two_rates -finetune_task correctness --train_dataset finetuning/correctness/train.txt --test_dataset finetuning/correctness/test.txt --train_label finetuning/correctness/train_label.txt --test_label finetuning/correctness/test_label.txt --pretrained_bert_checkpoint sales_tax_discounts_two_rates/output/bert_trained.seq_encoder.model.ep255 --epochs 51
|
194 |
+
|
195 |
+
### SL
|
196 |
+
|
197 |
+
### effectiveness
|
new_fine_tuning/__pycache__/metrics.cpython-312.pyc
ADDED
Binary file (9.16 kB). View file
new_fine_tuning/__pycache__/recalibration.cpython-312.pyc
ADDED
Binary file (5.51 kB). View file
new_fine_tuning/__pycache__/visualization.cpython-312.pyc
ADDED
Binary file (5.28 kB). View file
new_hint_fine_tuned.py
ADDED
@@ -0,0 +1,131 @@
import argparse
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, TensorDataset
from src.dataset import TokenizerDataset
from src.bert import BERT
from src.pretrainer import BERTFineTuneTrainer1
from src.vocab import Vocab
from CustomBERTModel import CustomBERTModel  # defined in CustomBERTModel.py (added in this commit); needed below
import pandas as pd


def preprocess_labels(label_csv_path):
    try:
        labels_df = pd.read_csv(label_csv_path)
        labels = labels_df['last_hint_class'].values.astype(int)
        return torch.tensor(labels, dtype=torch.long)
    except Exception as e:
        print(f"Error reading dataset file: {e}")
        return None


def preprocess_data(data_path, vocab, max_length=128):
    try:
        with open(data_path, 'r') as f:
            sequences = f.readlines()
    except Exception as e:
        print(f"Error reading data file: {e}")
        return None, None

    tokenized_sequences = []
    for sequence in sequences:
        sequence = sequence.strip()
        if sequence:
            encoded = vocab.to_seq(sequence, seq_len=max_length)
            encoded = encoded[:max_length] + [vocab.vocab.get('[PAD]', 0)] * (max_length - len(encoded))
            segment_label = [0] * max_length

            tokenized_sequences.append({
                'input_ids': torch.tensor(encoded),
                'segment_label': torch.tensor(segment_label)
            })

    input_ids = torch.cat([t['input_ids'].unsqueeze(0) for t in tokenized_sequences], dim=0)
    segment_labels = torch.cat([t['segment_label'].unsqueeze(0) for t in tokenized_sequences], dim=0)

    print(f"Input IDs shape: {input_ids.shape}")
    print(f"Segment labels shape: {segment_labels.shape}")

    return input_ids, segment_labels


def custom_collate_fn(batch):
    # NOTE: expects dict-style items with 'input_ids', 'label' and 'segment_label' keys;
    # the TensorDataset built in main() yields plain tuples, so one of the two must be adapted before use.
    inputs = [item['input_ids'].unsqueeze(0) for item in batch]
    labels = [item['label'].unsqueeze(0) for item in batch]
    segment_labels = [item['segment_label'].unsqueeze(0) for item in batch]

    inputs = torch.cat(inputs, dim=0)
    labels = torch.cat(labels, dim=0)
    segment_labels = torch.cat(segment_labels, dim=0)

    return {
        'input': inputs,
        'label': labels,
        'segment_label': segment_labels
    }


def main(opt):
    # Set device to GPU if available, otherwise use CPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load vocabulary
    vocab = Vocab(opt.vocab_file)
    vocab.load_vocab()

    # Preprocess data and labels
    input_ids, segment_labels = preprocess_data(opt.data_path, vocab, max_length=50)  # Using sequence length 50
    labels = preprocess_labels(opt.dataset)

    if input_ids is None or segment_labels is None or labels is None:
        print("Error in preprocessing data. Exiting.")
        return

    # Create TensorDataset and split into train and validation sets
    dataset = TensorDataset(input_ids, segment_labels, labels)
    val_size = len(dataset) - int(0.8 * len(dataset))
    val_dataset, train_dataset = random_split(dataset, [val_size, len(dataset) - val_size])

    # Create DataLoaders for training and validation
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)
    val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)

    # Initialize custom BERT model and move it to the device
    custom_model = CustomBERTModel(
        vocab_size=len(vocab.vocab),
        output_dim=2,
        pre_trained_model_path=opt.pre_trained_model_path
    ).to(device)

    # Initialize the fine-tuning trainer
    trainer = BERTFineTuneTrainer1(
        bert=custom_model,
        vocab_size=len(vocab.vocab),
        train_dataloader=train_dataloader,
        test_dataloader=val_dataloader,
        lr=1e-5,  # Using learning rate 10^-5 as specified
        num_labels=2,
        with_cuda=torch.cuda.is_available(),
        log_freq=10,
        workspace_name=opt.output_dir,
        log_folder_path=opt.log_folder_path
    )

    # Train the model
    trainer.train(epoch=20)

    # Save the model
    os.makedirs(opt.output_dir, exist_ok=True)
    output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model_3.pth')
    torch.save(custom_model, output_model_file)
    print(f'Model saved to {output_model_file}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fine-tune BERT model.')
    parser.add_argument('--dataset', type=str, default='/home/jupyter/bert/dataset/hint_based/ratio_proportion_change_3/er/er_train.csv', help='Path to the dataset file.')
    parser.add_argument('--data_path', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/gt/er.txt', help='Path to the input sequence file.')
    parser.add_argument('--output_dir', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/output/hint_classification', help='Directory to save the fine-tuned model.')
    parser.add_argument('--pre_trained_model_path', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/output/pretrain:1800ms:64hs:4l:8a:50s:64b:1000e:-5lr/bert_trained.seq_encoder.model.ep68', help='Path to the pre-trained BERT model.')
    parser.add_argument('--vocab_file', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/pretraining/vocab.txt', help='Path to the vocabulary file.')
    parser.add_argument('--log_folder_path', type=str, default='/home/jupyter/bert/ratio_proportion_change3_1920/logs/oct', help='Path to the folder for saving logs.')

    opt = parser.parse_args()
    main(opt)
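A small sanity check of the dict format custom_collate_fn expects (the commented import path is hypothetical, and the tensors are dummies; see the NOTE in the function above about the TensorDataset mismatch):

    import torch
    # from new_hint_fine_tuned import custom_collate_fn   # hypothetical import path

    # Each item mirrors the dict shape produced by preprocess_data plus a scalar label.
    batch = [
        {"input_ids": torch.zeros(50, dtype=torch.long),
         "segment_label": torch.zeros(50, dtype=torch.long),
         "label": torch.tensor(1)},
        {"input_ids": torch.ones(50, dtype=torch.long),
         "segment_label": torch.zeros(50, dtype=torch.long),
         "label": torch.tensor(0)},
    ]
    out = custom_collate_fn(batch)
    print(out["input"].shape, out["segment_label"].shape, out["label"].shape)
    # -> torch.Size([2, 50]) torch.Size([2, 50]) torch.Size([2])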
new_test_saved_finetuned_model.py
ADDED
@@ -0,0 +1,613 @@
1 |
+
import argparse
|
2 |
+
import os
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
from torch.optim import Adam
|
6 |
+
from torch.utils.data import DataLoader
|
7 |
+
import pickle
|
8 |
+
print("here1",os.getcwd())
|
9 |
+
from src.dataset import TokenizerDataset, TokenizerDatasetForCalibration
|
10 |
+
from src.vocab import Vocab
|
11 |
+
print("here3",os.getcwd())
|
12 |
+
from src.bert import BERT
|
13 |
+
from src.seq_model import BERTSM
|
14 |
+
from src.classifier_model import BERTForClassification, BERTForClassificationWithFeats
|
15 |
+
# from src.new_finetuning.optim_schedule import ScheduledOptim
|
16 |
+
import metrics, recalibration, visualization
|
17 |
+
from recalibration import ModelWithTemperature
|
18 |
+
import tqdm
|
19 |
+
import sys
|
20 |
+
import time
|
21 |
+
import numpy as np
|
22 |
+
|
23 |
+
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
|
24 |
+
import matplotlib.pyplot as plt
|
25 |
+
import seaborn as sns
|
26 |
+
import pandas as pd
|
27 |
+
from collections import defaultdict
|
28 |
+
print("here3",os.getcwd())
|
29 |
+
class BERTFineTuneTrainer:
|
30 |
+
|
31 |
+
def __init__(self, bertFinetunedClassifierwithFeats: BERT, #BERTForClassificationWithFeats
|
32 |
+
vocab_size: int, test_dataloader: DataLoader = None,
|
33 |
+
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
34 |
+
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None,
|
35 |
+
num_labels=2, log_folder_path: str = None):
|
36 |
+
"""
|
37 |
+
:param bert: BERT model which you want to train
|
38 |
+
:param vocab_size: total word vocab size
|
39 |
+
:param test_dataloader: test dataset data loader [can be None]
|
40 |
+
:param lr: learning rate of optimizer
|
41 |
+
:param betas: Adam optimizer betas
|
42 |
+
:param weight_decay: Adam optimizer weight decay param
|
43 |
+
:param with_cuda: traning with cuda
|
44 |
+
:param log_freq: logging frequency of the batch iteration
|
45 |
+
"""
|
46 |
+
|
47 |
+
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
48 |
+
# cuda_condition = torch.cuda.is_available() and with_cuda
|
49 |
+
# self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
50 |
+
self.device = torch.device("cpu") #torch.device("cuda:0" if cuda_condition else "cpu")
|
51 |
+
# print(cuda_condition, " Device used = ", self.device)
|
52 |
+
print(" Device used = ", self.device)
|
53 |
+
|
54 |
+
# available_gpus = list(range(torch.cuda.device_count()))
|
55 |
+
|
56 |
+
# This BERT model will be saved every epoch
|
57 |
+
self.model = bertFinetunedClassifierwithFeats.to("cpu")
|
58 |
+
print(self.model.parameters())
|
59 |
+
for param in self.model.parameters():
|
60 |
+
param.requires_grad = False
|
61 |
+
# Initialize the BERT Language Model, with BERT model
|
62 |
+
# self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
|
63 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 8).to(self.device)
|
64 |
+
# self.model = bertFinetunedClassifierwithFeats
|
65 |
+
# print(self.model.bert.parameters())
|
66 |
+
# for param in self.model.bert.parameters():
|
67 |
+
# param.requires_grad = False
|
68 |
+
# BERTForClassificationWithFeats(self.bert, num_labels, 18).to(self.device)
|
69 |
+
|
70 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 1).to(self.device)
|
71 |
+
# Distributed GPU training if CUDA can detect more than 1 GPU
|
72 |
+
# if with_cuda and torch.cuda.device_count() > 1:
|
73 |
+
# print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
74 |
+
# self.model = nn.DataParallel(self.model, device_ids=available_gpus)
|
75 |
+
|
76 |
+
# Setting the train, validation and test data loader
|
77 |
+
# self.train_data = train_dataloader
|
78 |
+
# self.val_data = val_dataloader
|
79 |
+
self.test_data = test_dataloader
|
80 |
+
|
81 |
+
# self.optim = Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay) #, eps=1e-9
|
82 |
+
self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
|
83 |
+
# self.optim_schedule = ScheduledOptim(self.optim, self.model.bert.hidden, n_warmup_steps=warmup_steps)
|
84 |
+
# self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
|
85 |
+
self.criterion = nn.CrossEntropyLoss()
|
86 |
+
|
87 |
+
# if num_labels == 1:
|
88 |
+
# self.criterion = nn.MSELoss()
|
89 |
+
# elif num_labels == 2:
|
90 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
91 |
+
# # self.criterion = nn.CrossEntropyLoss()
|
92 |
+
# elif num_labels > 2:
|
93 |
+
# self.criterion = nn.CrossEntropyLoss()
|
94 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
95 |
+
|
96 |
+
|
97 |
+
self.log_freq = log_freq
|
98 |
+
self.log_folder_path = log_folder_path
|
99 |
+
# self.workspace_name = workspace_name
|
100 |
+
# self.finetune_task = finetune_task
|
101 |
+
# self.save_model = False
|
102 |
+
# self.avg_loss = 10000
|
103 |
+
self.start_time = time.time()
|
104 |
+
# self.probability_list = []
|
105 |
+
for fi in ['test']: #'val',
|
106 |
+
f = open(self.log_folder_path+f"/log_{fi}_finetuned.txt", 'w')
|
107 |
+
f.close()
|
108 |
+
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
109 |
+
|
110 |
+
# def train(self, epoch):
|
111 |
+
# self.iteration(epoch, self.train_data)
|
112 |
+
|
113 |
+
# def val(self, epoch):
|
114 |
+
# self.iteration(epoch, self.val_data, phase="val")
|
115 |
+
|
116 |
+
def test(self, epoch):
|
117 |
+
# if epoch == 0:
|
118 |
+
# self.avg_loss = 10000
|
119 |
+
self.iteration(epoch, self.test_data, phase="test")
|
120 |
+
|
121 |
+
def iteration(self, epoch, data_loader, phase="train"):
|
122 |
+
"""
|
123 |
+
loop over the data_loader for training or testing
|
124 |
+
if in the train phase, the backward pass and optimizer step are activated
|
125 |
+
and the model is also auto-saved every epoch
|
126 |
+
|
127 |
+
:param epoch: current epoch index
|
128 |
+
:param data_loader: torch.utils.data.DataLoader for iteration
|
129 |
+
:param phase: one of "train", "val" or "test"
|
130 |
+
:return: None
|
131 |
+
"""
|
132 |
+
|
133 |
+
# Setting the tqdm progress bar
|
134 |
+
data_iter = tqdm.tqdm(enumerate(data_loader),
|
135 |
+
desc="EP_%s:%d" % (phase, epoch),
|
136 |
+
total=len(data_loader),
|
137 |
+
bar_format="{l_bar}{r_bar}")
|
138 |
+
|
139 |
+
avg_loss = 0.0
|
140 |
+
total_correct = 0
|
141 |
+
total_element = 0
|
142 |
+
plabels = []
|
143 |
+
tlabels = []
|
144 |
+
probabs = []
|
145 |
+
|
146 |
+
if phase == "train":
|
147 |
+
self.model.train()
|
148 |
+
else:
|
149 |
+
self.model.eval()
|
150 |
+
# self.probability_list = []
|
151 |
+
|
152 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned.txt", 'a') as f:
|
153 |
+
sys.stdout = f
|
154 |
+
for i, data in data_iter:
|
155 |
+
# 0. batch_data will be sent into the device(GPU or cpu)
|
156 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
157 |
+
if phase == "train":
|
158 |
+
logits = self.model.forward(data["input"], data["segment_label"], data["feat"])
|
159 |
+
else:
|
160 |
+
with torch.no_grad():
|
161 |
+
logits = self.model.forward(data["input"].cpu(), data["segment_label"].cpu(), data["feat"].cpu())
|
162 |
+
|
163 |
+
logits = logits.cpu()
|
164 |
+
loss = self.criterion(logits, data["label"])
|
165 |
+
# if torch.cuda.device_count() > 1:
|
166 |
+
# loss = loss.mean()
|
167 |
+
|
168 |
+
# 3. backward and optimization only in train
|
169 |
+
# if phase == "train":
|
170 |
+
# self.optim_schedule.zero_grad()
|
171 |
+
# loss.backward()
|
172 |
+
# self.optim_schedule.step_and_update_lr()
|
173 |
+
|
174 |
+
# prediction accuracy
|
175 |
+
probs = nn.Softmax(dim=-1)(logits) # Probabilities
|
176 |
+
probabs.extend(probs.detach().cpu().numpy().tolist())
|
177 |
+
predicted_labels = torch.argmax(probs, dim=-1) #correct
|
178 |
+
# self.probability_list.append(probs)
|
179 |
+
# true_labels = torch.argmax(data["label"], dim=-1)
|
180 |
+
plabels.extend(predicted_labels.cpu().numpy())
|
181 |
+
tlabels.extend(data['label'].cpu().numpy())
|
182 |
+
|
183 |
+
# Compare predicted labels to true labels and calculate accuracy
|
184 |
+
correct = (data['label'] == predicted_labels).sum().item()
|
185 |
+
|
186 |
+
avg_loss += loss.item()
|
187 |
+
total_correct += correct
|
188 |
+
# total_element += true_labels.nelement()
|
189 |
+
total_element += data["label"].nelement()
|
190 |
+
# print(">>>>>>>>>>>>>>", predicted_labels, true_labels, correct, total_correct, total_element)
|
191 |
+
|
192 |
+
post_fix = {
|
193 |
+
"epoch": epoch,
|
194 |
+
"iter": i,
|
195 |
+
"avg_loss": avg_loss / (i + 1),
|
196 |
+
"avg_acc": total_correct / total_element * 100 if total_element != 0 else 0,
|
197 |
+
"loss": loss.item()
|
198 |
+
}
|
199 |
+
if i % self.log_freq == 0:
|
200 |
+
data_iter.write(str(post_fix))
|
201 |
+
|
202 |
+
precisions = precision_score(tlabels, plabels, average="weighted", zero_division=0)
|
203 |
+
recalls = recall_score(tlabels, plabels, average="weighted")
|
204 |
+
f1_scores = f1_score(tlabels, plabels, average="weighted")
|
205 |
+
cmatrix = confusion_matrix(tlabels, plabels)
|
206 |
+
end_time = time.time()
|
207 |
+
auc_score = roc_auc_score(tlabels, plabels)
|
208 |
+
final_msg = {
|
209 |
+
"epoch": f"EP{epoch}_{phase}",
|
210 |
+
"avg_loss": avg_loss / len(data_iter),
|
211 |
+
"total_acc": total_correct * 100.0 / total_element,
|
212 |
+
"precisions": precisions,
|
213 |
+
"recalls": recalls,
|
214 |
+
"f1_scores": f1_scores,
|
215 |
+
# "confusion_matrix": f"{cmatrix}",
|
216 |
+
# "true_labels": f"{tlabels}",
|
217 |
+
# "predicted_labels": f"{plabels}",
|
218 |
+
"time_taken_from_start": end_time - self.start_time,
|
219 |
+
"auc_score":auc_score
|
220 |
+
}
|
221 |
+
with open("result.txt", 'w') as file:
|
222 |
+
for key, value in final_msg.items():
|
223 |
+
file.write(f"{key}: {value}\n")
|
224 |
+
print(final_msg)
|
225 |
+
fpr, tpr, thresholds = roc_curve(tlabels, plabels)
|
226 |
+
with open("roc_data.pkl", "wb") as f:
|
227 |
+
pickle.dump((fpr, tpr, thresholds), f)
|
228 |
+
print(final_msg)
|
229 |
+
f.close()
|
230 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned_info.txt", 'a') as f1:
|
231 |
+
sys.stdout = f1
|
232 |
+
final_msg = {
|
233 |
+
"epoch": f"EP{epoch}_{phase}",
|
234 |
+
"confusion_matrix": f"{cmatrix}",
|
235 |
+
"true_labels": f"{tlabels if epoch == 0 else ''}",
|
236 |
+
"predicted_labels": f"{plabels}",
|
237 |
+
"probabilities": f"{probabs}",
|
238 |
+
"time_taken_from_start": end_time - self.start_time
|
239 |
+
}
|
240 |
+
print(final_msg)
|
241 |
+
f1.close()
|
242 |
+
sys.stdout = sys.__stdout__
|
243 |
+
sys.stdout = sys.__stdout__
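Note: the AUC above is computed from hard predicted labels (roc_auc_score(tlabels, plabels)), which collapses the ROC curve to a single operating point. A minimal sketch of the probability-based alternative, reusing the probabs and tlabels lists built in this method and the sklearn functions this module already imports (assumes binary labels; the calibrated trainer below uses the same idea):

    # Sketch: threshold-free AUC/ROC from positive-class probabilities instead of argmax labels.
    positive_class_probs = [p[1] for p in probabs]              # P(label == 1) per example
    auc_score = roc_auc_score(tlabels, positive_class_probs)    # ranking-based AUC
    fpr, tpr, thresholds = roc_curve(tlabels, positive_class_probs)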
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
+
class BERTFineTuneCalibratedTrainer:
|
248 |
+
|
249 |
+
def __init__(self, bertFinetunedClassifierwithFeats: BERT, #BERTForClassificationWithFeats
|
250 |
+
vocab_size: int, test_dataloader: DataLoader = None,
|
251 |
+
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
252 |
+
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None,
|
253 |
+
num_labels=2, log_folder_path: str = None):
|
254 |
+
"""
|
255 |
+
:param bert: BERT model which you want to train
|
256 |
+
:param vocab_size: total word vocab size
|
257 |
+
:param test_dataloader: test dataset data loader [can be None]
|
258 |
+
:param lr: learning rate of optimizer
|
259 |
+
:param betas: Adam optimizer betas
|
260 |
+
:param weight_decay: Adam optimizer weight decay param
|
261 |
+
:param with_cuda: training with CUDA
|
262 |
+
:param log_freq: logging frequency of the batch iteration
|
263 |
+
"""
|
264 |
+
|
265 |
+
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
266 |
+
cuda_condition = torch.cuda.is_available() and with_cuda
|
267 |
+
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
268 |
+
print(cuda_condition, " Device used = ", self.device)
|
269 |
+
|
270 |
+
# available_gpus = list(range(torch.cuda.device_count()))
|
271 |
+
|
272 |
+
# This BERT model will be saved every epoch
|
273 |
+
self.model = bertFinetunedClassifierwithFeats
|
274 |
+
print(self.model.parameters())
|
275 |
+
for param in self.model.parameters():
|
276 |
+
param.requires_grad = False
|
277 |
+
# Initialize the BERT Language Model, with BERT model
|
278 |
+
# self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
|
279 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 8).to(self.device)
|
280 |
+
# self.model = bertFinetunedClassifierwithFeats
|
281 |
+
# print(self.model.bert.parameters())
|
282 |
+
# for param in self.model.bert.parameters():
|
283 |
+
# param.requires_grad = False
|
284 |
+
# BERTForClassificationWithFeats(self.bert, num_labels, 18).to(self.device)
|
285 |
+
|
286 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 1).to(self.device)
|
287 |
+
# Distributed GPU training if CUDA can detect more than 1 GPU
|
288 |
+
# if with_cuda and torch.cuda.device_count() > 1:
|
289 |
+
# print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
290 |
+
# self.model = nn.DataParallel(self.model, device_ids=available_gpus)
|
291 |
+
|
292 |
+
# Setting the train, validation and test data loader
|
293 |
+
# self.train_data = train_dataloader
|
294 |
+
# self.val_data = val_dataloader
|
295 |
+
self.test_data = test_dataloader
|
296 |
+
|
297 |
+
# self.optim = Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay) #, eps=1e-9
|
298 |
+
self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
|
299 |
+
# self.optim_schedule = ScheduledOptim(self.optim, self.model.bert.hidden, n_warmup_steps=warmup_steps)
|
300 |
+
# self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
|
301 |
+
self.criterion = nn.CrossEntropyLoss()
|
302 |
+
|
303 |
+
# if num_labels == 1:
|
304 |
+
# self.criterion = nn.MSELoss()
|
305 |
+
# elif num_labels == 2:
|
306 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
307 |
+
# # self.criterion = nn.CrossEntropyLoss()
|
308 |
+
# elif num_labels > 2:
|
309 |
+
# self.criterion = nn.CrossEntropyLoss()
|
310 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
311 |
+
|
312 |
+
|
313 |
+
self.log_freq = log_freq
|
314 |
+
self.log_folder_path = log_folder_path
|
315 |
+
# self.workspace_name = workspace_name
|
316 |
+
# self.finetune_task = finetune_task
|
317 |
+
# self.save_model = False
|
318 |
+
# self.avg_loss = 10000
|
319 |
+
self.start_time = time.time()
|
320 |
+
# self.probability_list = []
|
321 |
+
for fi in ['test']: #'val',
|
322 |
+
f = open(self.log_folder_path+f"/log_{fi}_finetuned.txt", 'w')
|
323 |
+
f.close()
|
324 |
+
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
325 |
+
|
326 |
+
# def train(self, epoch):
|
327 |
+
# self.iteration(epoch, self.train_data)
|
328 |
+
|
329 |
+
# def val(self, epoch):
|
330 |
+
# self.iteration(epoch, self.val_data, phase="val")
|
331 |
+
|
332 |
+
def test(self, epoch):
|
333 |
+
# if epoch == 0:
|
334 |
+
# self.avg_loss = 10000
|
335 |
+
self.iteration(epoch, self.test_data, phase="test")
|
336 |
+
|
337 |
+
def iteration(self, epoch, data_loader, phase="train"):
|
338 |
+
"""
|
339 |
+
loop over the data_loader for training or testing
|
340 |
+
if in the train phase, the backward pass and optimizer step are activated
|
341 |
+
and the model is also auto-saved every epoch
|
342 |
+
|
343 |
+
:param epoch: current epoch index
|
344 |
+
:param data_loader: torch.utils.data.DataLoader for iteration
|
345 |
+
:param phase: one of "train", "val" or "test"
|
346 |
+
:return: None
|
347 |
+
"""
|
348 |
+
|
349 |
+
# Setting the tqdm progress bar
|
350 |
+
data_iter = tqdm.tqdm(enumerate(data_loader),
|
351 |
+
desc="EP_%s:%d" % (phase, epoch),
|
352 |
+
total=len(data_loader),
|
353 |
+
bar_format="{l_bar}{r_bar}")
|
354 |
+
|
355 |
+
avg_loss = 0.0
|
356 |
+
total_correct = 0
|
357 |
+
total_element = 0
|
358 |
+
plabels = []
|
359 |
+
tlabels = []
|
360 |
+
probabs = []
|
361 |
+
|
362 |
+
if phase == "train":
|
363 |
+
self.model.train()
|
364 |
+
else:
|
365 |
+
self.model.eval()
|
366 |
+
# self.probability_list = []
|
367 |
+
|
368 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned.txt", 'a') as f:
|
369 |
+
sys.stdout = f
|
370 |
+
for i, data in data_iter:
|
371 |
+
# 0. batch_data will be sent into the device(GPU or cpu)
|
372 |
+
# print(data_pair[0])
|
373 |
+
data = {key: value.to(self.device) for key, value in data[0].items()}
|
374 |
+
# print(f"data : {data}")
|
375 |
+
# data = {key: value.to(self.device) for key, value in data.items()}
|
376 |
+
|
377 |
+
# if phase == "train":
|
378 |
+
# logits = self.model.forward(data["input"], data["segment_label"], data["feat"])
|
379 |
+
# else:
|
380 |
+
with torch.no_grad():
|
381 |
+
# logits = self.model.forward(data["input"], data["segment_label"], data["feat"])
|
382 |
+
logits = self.model.forward(data)
|
383 |
+
|
384 |
+
loss = self.criterion(logits, data["label"])
|
385 |
+
if torch.cuda.device_count() > 1:
|
386 |
+
loss = loss.mean()
|
387 |
+
|
388 |
+
# 3. backward and optimization only in train
|
389 |
+
# if phase == "train":
|
390 |
+
# self.optim_schedule.zero_grad()
|
391 |
+
# loss.backward()
|
392 |
+
# self.optim_schedule.step_and_update_lr()
|
393 |
+
|
394 |
+
# prediction accuracy
|
395 |
+
probs = nn.Softmax(dim=-1)(logits) # Probabilities
|
396 |
+
probabs.extend(probs.detach().cpu().numpy().tolist())
|
397 |
+
predicted_labels = torch.argmax(probs, dim=-1) #correct
|
398 |
+
# self.probability_list.append(probs)
|
399 |
+
# true_labels = torch.argmax(data["label"], dim=-1)
|
400 |
+
plabels.extend(predicted_labels.cpu().numpy())
|
401 |
+
tlabels.extend(data['label'].cpu().numpy())
|
402 |
+
positive_class_probs = [prob[1] for prob in probabs]
|
403 |
+
|
404 |
+
# Compare predicted labels to true labels and calculate accuracy
|
405 |
+
correct = (data['label'] == predicted_labels).sum().item()
|
406 |
+
|
407 |
+
avg_loss += loss.item()
|
408 |
+
total_correct += correct
|
409 |
+
# total_element += true_labels.nelement()
|
410 |
+
total_element += data["label"].nelement()
|
411 |
+
# print(">>>>>>>>>>>>>>", predicted_labels, true_labels, correct, total_correct, total_element)
|
412 |
+
|
413 |
+
post_fix = {
|
414 |
+
"epoch": epoch,
|
415 |
+
"iter": i,
|
416 |
+
"avg_loss": avg_loss / (i + 1),
|
417 |
+
"avg_acc": total_correct / total_element * 100 if total_element != 0 else 0,
|
418 |
+
"loss": loss.item()
|
419 |
+
}
|
420 |
+
if i % self.log_freq == 0:
|
421 |
+
data_iter.write(str(post_fix))
|
422 |
+
|
423 |
+
precisions = precision_score(tlabels, plabels, average="weighted", zero_division=0)
|
424 |
+
recalls = recall_score(tlabels, plabels, average="weighted")
|
425 |
+
f1_scores = f1_score(tlabels, plabels, average="weighted")
|
426 |
+
cmatrix = confusion_matrix(tlabels, plabels)
|
427 |
+
auc_score = roc_auc_score(tlabels, positive_class_probs)
|
428 |
+
end_time = time.time()
|
429 |
+
final_msg = {
|
430 |
+
"epoch": f"EP{epoch}_{phase}",
|
431 |
+
"avg_loss": avg_loss / len(data_iter),
|
432 |
+
"total_acc": total_correct * 100.0 / total_element,
|
433 |
+
"precisions": precisions,
|
434 |
+
"recalls": recalls,
|
435 |
+
"f1_scores": f1_scores,
|
436 |
+
"auc_score":auc_score,
|
437 |
+
# "confusion_matrix": f"{cmatrix}",
|
438 |
+
# "true_labels": f"{tlabels}",
|
439 |
+
# "predicted_labels": f"{plabels}",
|
440 |
+
"time_taken_from_start": end_time - self.start_time
|
441 |
+
}
|
442 |
+
with open("result.txt", 'w') as file:
|
443 |
+
for key, value in final_msg.items():
|
444 |
+
file.write(f"{key}: {value}\n")
|
445 |
+
|
446 |
+
print(final_msg)
|
447 |
+
fpr, tpr, thresholds = roc_curve(tlabels, positive_class_probs)
|
448 |
+
f.close()
|
449 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned_info.txt", 'a') as f1:
|
450 |
+
sys.stdout = f1
|
451 |
+
final_msg = {
|
452 |
+
"epoch": f"EP{epoch}_{phase}",
|
453 |
+
"confusion_matrix": f"{cmatrix}",
|
454 |
+
"true_labels": f"{tlabels if epoch == 0 else ''}",
|
455 |
+
"predicted_labels": f"{plabels}",
|
456 |
+
"probabilities": f"{probabs}",
|
457 |
+
"time_taken_from_start": end_time - self.start_time
|
458 |
+
}
|
459 |
+
print(final_msg)
|
460 |
+
f1.close()
|
461 |
+
sys.stdout = sys.__stdout__
|
462 |
+
sys.stdout = sys.__stdout__
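Both trainers pickle (fpr, tpr, thresholds) to roc_data.pkl; a short standalone sketch for inspecting that file after a run (matplotlib is an assumption here, it is not a dependency used elsewhere in this commit):

    # Sketch: load the ROC data dumped by the trainer and plot it.
    import pickle
    import matplotlib.pyplot as plt

    with open("roc_data.pkl", "rb") as fh:
        fpr, tpr, thresholds = pickle.load(fh)

    plt.plot(fpr, tpr, label="fine-tuned model")
    plt.plot([0, 1], [0, 1], linestyle="--", label="chance")
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.legend()
    plt.savefig("roc_curve.png")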
|
463 |
+
|
464 |
+
|
465 |
+
|
466 |
+
def train():
|
467 |
+
parser = argparse.ArgumentParser()
|
468 |
+
|
469 |
+
parser.add_argument('-workspace_name', type=str, default=None)
|
470 |
+
parser.add_argument('-code', type=str, default=None, help="folder for pretraining outputs and logs")
|
471 |
+
parser.add_argument('-finetune_task', type=str, default=None, help="folder inside finetuning")
|
472 |
+
parser.add_argument("-attention", type=bool, default=False, help="analyse attention scores")
|
473 |
+
parser.add_argument("-diff_test_folder", type=bool, default=False, help="use for different test folder")
|
474 |
+
parser.add_argument("-embeddings", type=bool, default=False, help="get and analyse embeddings")
|
475 |
+
parser.add_argument('-embeddings_file_name', type=str, default=None, help="file name of embeddings")
|
476 |
+
parser.add_argument("-pretrain", type=bool, default=False, help="pretraining: true, or false")
|
477 |
+
# parser.add_argument('-opts', nargs='+', type=str, default=None, help='List of optional steps')
|
478 |
+
parser.add_argument("-max_mask", type=int, default=0.15, help="% of input tokens selected for masking")
|
479 |
+
# parser.add_argument("-p", "--pretrain_dataset", type=str, default="pretraining/pretrain.txt", help="pretraining dataset for bert")
|
480 |
+
# parser.add_argument("-pv", "--pretrain_val_dataset", type=str, default="pretraining/test.txt", help="pretraining validation dataset for bert")
|
481 |
+
# default="finetuning/test.txt",
|
482 |
+
parser.add_argument("-vocab_path", type=str, default="pretraining/vocab.txt", help="built vocab model path with bert-vocab")
|
483 |
+
|
484 |
+
parser.add_argument("-train_dataset_path", type=str, default="train.txt", help="fine tune train dataset for progress classifier")
|
485 |
+
parser.add_argument("-val_dataset_path", type=str, default="val.txt", help="test set for evaluate fine tune train set")
|
486 |
+
parser.add_argument("-test_dataset_path", type=str, default="test.txt", help="test set for evaluate fine tune train set")
|
487 |
+
parser.add_argument("-num_labels", type=int, default=2, help="Number of labels")
|
488 |
+
parser.add_argument("-train_label_path", type=str, default="train_label.txt", help="fine tune train dataset for progress classifier")
|
489 |
+
parser.add_argument("-val_label_path", type=str, default="val_label.txt", help="test set for evaluate fine tune train set")
|
490 |
+
parser.add_argument("-test_label_path", type=str, default="test_label.txt", help="test set for evaluate fine tune train set")
|
491 |
+
##### change Checkpoint for finetuning
|
492 |
+
parser.add_argument("-pretrained_bert_checkpoint", type=str, default=None, help="checkpoint of saved pretrained bert model")
|
493 |
+
parser.add_argument("-finetuned_bert_classifier_checkpoint", type=str, default=None, help="checkpoint of saved finetuned bert model") #."output_feb09/bert_trained.model.ep40"
|
494 |
+
#."output_feb09/bert_trained.model.ep40"
|
495 |
+
parser.add_argument('-check_epoch', type=int, default=None)
|
496 |
+
|
497 |
+
parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
|
498 |
+
parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
|
499 |
+
parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
|
500 |
+
parser.add_argument("-s", "--seq_len", type=int, default=5, help="maximum sequence length")
|
501 |
+
|
502 |
+
parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
|
503 |
+
parser.add_argument("-e", "--epochs", type=int, default=1)#1501, help="number of epochs") #501
|
504 |
+
# Use 50 for pretrain, and 10 for fine tune
|
505 |
+
parser.add_argument("-w", "--num_workers", type=int, default=0, help="dataloader worker size")
|
506 |
+
|
507 |
+
# Later run with cuda
|
508 |
+
parser.add_argument("--with_cuda", type=bool, default=False, help="training with CUDA: true, or false")
|
509 |
+
parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
|
510 |
+
# parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
|
511 |
+
parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
|
512 |
+
# parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
|
513 |
+
|
514 |
+
parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
|
515 |
+
parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
|
516 |
+
parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
|
517 |
+
parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
|
518 |
+
parser.add_argument("--adam_beta2", type=float, default=0.98, help="adam first beta value") #0.999
|
519 |
+
|
520 |
+
parser.add_argument("-o", "--output_path", type=str, default="bert_trained.seq_encoder.model", help="ex)output/bert.model")
|
521 |
+
# parser.add_argument("-o", "--output_path", type=str, default="output/bert_fine_tuned.model", help="ex)output/bert.model")
|
522 |
+
|
523 |
+
args = parser.parse_args()
|
524 |
+
for k,v in vars(args).items():
|
525 |
+
if 'path' in k:
|
526 |
+
if v:
|
527 |
+
if k == "output_path":
|
528 |
+
if args.code:
|
529 |
+
setattr(args, f"{k}", args.workspace_name+f"/output/{args.code}/"+v)
|
530 |
+
elif args.finetune_task:
|
531 |
+
setattr(args, f"{k}", args.workspace_name+f"/output/{args.finetune_task}/"+v)
|
532 |
+
else:
|
533 |
+
setattr(args, f"{k}", args.workspace_name+"/output/"+v)
|
534 |
+
elif k != "vocab_path":
|
535 |
+
if args.pretrain:
|
536 |
+
setattr(args, f"{k}", args.workspace_name+"/pretraining/"+v)
|
537 |
+
else:
|
538 |
+
if args.code:
|
539 |
+
setattr(args, f"{k}", args.workspace_name+f"/{args.code}/"+v)
|
540 |
+
elif args.finetune_task:
|
541 |
+
if args.diff_test_folder and "test" in k:
|
542 |
+
setattr(args, f"{k}", args.workspace_name+f"/finetuning/"+v)
|
543 |
+
else:
|
544 |
+
setattr(args, f"{k}", args.workspace_name+f"/finetuning/{args.finetune_task}/"+v)
|
545 |
+
else:
|
546 |
+
setattr(args, f"{k}", args.workspace_name+"/finetuning/"+v)
|
547 |
+
else:
|
548 |
+
setattr(args, f"{k}", args.workspace_name+"/"+v)
|
549 |
+
|
550 |
+
print(f"args.{k} : {getattr(args, f'{k}')}")
|
551 |
+
|
552 |
+
print("Loading Vocab", args.vocab_path)
|
553 |
+
vocab_obj = Vocab(args.vocab_path)
|
554 |
+
vocab_obj.load_vocab()
|
555 |
+
print("Vocab Size: ", len(vocab_obj.vocab))
|
556 |
+
|
557 |
+
|
558 |
+
print("Testing using finetuned model......")
|
559 |
+
print("Loading Test Dataset", args.test_dataset_path)
|
560 |
+
test_dataset = TokenizerDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
|
561 |
+
# test_dataset = TokenizerDatasetForCalibration(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
|
562 |
+
|
563 |
+
print("Creating Dataloader...")
|
564 |
+
test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
|
565 |
+
|
566 |
+
print("Load fine-tuned BERT classifier model with feats")
|
567 |
+
# cuda_condition = torch.cuda.is_available() and args.with_cuda
|
568 |
+
device = torch.device("cpu") #torch.device("cuda:0" if cuda_condition else "cpu")
|
569 |
+
finetunedBERTclassifier = torch.load(args.finetuned_bert_classifier_checkpoint, map_location=device)
|
570 |
+
if isinstance(finetunedBERTclassifier, torch.nn.DataParallel):
|
571 |
+
finetunedBERTclassifier = finetunedBERTclassifier.module
|
572 |
+
|
573 |
+
new_log_folder = f"{args.workspace_name}/logs"
|
574 |
+
new_output_folder = f"{args.workspace_name}/output"
|
575 |
+
if args.finetune_task: # is sent almost all the time
|
576 |
+
new_log_folder = f"{args.workspace_name}/logs/{args.finetune_task}"
|
577 |
+
new_output_folder = f"{args.workspace_name}/output/{args.finetune_task}"
|
578 |
+
|
579 |
+
if not os.path.exists(new_log_folder):
|
580 |
+
os.makedirs(new_log_folder)
|
581 |
+
if not os.path.exists(new_output_folder):
|
582 |
+
os.makedirs(new_output_folder)
|
583 |
+
|
584 |
+
print("Creating BERT Fine Tuned Test Trainer")
|
585 |
+
trainer = BERTFineTuneTrainer(finetunedBERTclassifier,
|
586 |
+
len(vocab_obj.vocab), test_dataloader=test_data_loader,
|
587 |
+
lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
|
588 |
+
with_cuda=args.with_cuda, cuda_devices = args.cuda_devices, log_freq=args.log_freq,
|
589 |
+
workspace_name = args.workspace_name, num_labels=args.num_labels, log_folder_path=new_log_folder)
|
590 |
+
|
591 |
+
# trainer = BERTFineTuneCalibratedTrainer(finetunedBERTclassifier,
|
592 |
+
# len(vocab_obj.vocab), test_dataloader=test_data_loader,
|
593 |
+
# lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
|
594 |
+
# with_cuda=args.with_cuda, cuda_devices = args.cuda_devices, log_freq=args.log_freq,
|
595 |
+
# workspace_name = args.workspace_name, num_labels=args.num_labels, log_folder_path=new_log_folder)
|
596 |
+
print("Testing fine-tuned model Start....")
|
597 |
+
start_time = time.time()
|
598 |
+
repoch = range(args.check_epoch, args.epochs) if args.check_epoch else range(args.epochs)
|
599 |
+
counter = 0
|
600 |
+
# patience = 10
|
601 |
+
for epoch in repoch:
|
602 |
+
print(f'Test Epoch {epoch} Starts, Time: {time.strftime("%D %T", time.localtime(time.time()))}')
|
603 |
+
trainer.test(epoch)
|
604 |
+
# pickle.dump(trainer.probability_list, open(f"{args.workspace_name}/output/aaai/change4_mid_prob_{epoch}.pkl","wb"))
|
605 |
+
print(f'Test Epoch {epoch} Ends, Time: {time.strftime("%D %T", time.localtime(time.time()))} \n')
|
606 |
+
end_time = time.time()
|
607 |
+
print("Time Taken to fine-tune model = ", end_time - start_time)
|
608 |
+
print(f'Pretraining Ends, Time: {time.strftime("%D %T", time.localtime(end_time))}')
|
609 |
+
|
610 |
+
|
611 |
+
|
612 |
+
if __name__ == "__main__":
|
613 |
+
train()
|
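A condensed sketch of the same test flow driven programmatically rather than through argparse. All paths are placeholders, the log folder must already exist, and the keyword defaults are assumed to match the calibrated trainer's signature above:

    # Sketch: minimal programmatic equivalent of train(), with placeholder paths.
    vocab_obj = Vocab("pretraining/vocab.txt")                      # placeholder vocab path
    vocab_obj.load_vocab()
    test_dataset = TokenizerDataset("test.txt", "test_label.txt", vocab_obj, seq_len=100)
    test_loader = DataLoader(test_dataset, batch_size=32, num_workers=0)
    classifier = torch.load("bert_fine_tuned.model.epN", map_location="cpu")  # placeholder checkpoint
    trainer = BERTFineTuneTrainer(classifier, len(vocab_obj.vocab),
                                  test_dataloader=test_loader, lr=1e-5,
                                  num_labels=2, log_folder_path="logs")       # "logs" must already exist
    trainer.test(0)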
plot.png
CHANGED
prepare_pretraining_input_vocab_file.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ratio_proportion_change3_2223/sch_largest_100-coded/pretraining/vocab.txt
ADDED
@@ -0,0 +1,34 @@
1 |
+
[PAD]
|
2 |
+
[UNK]
|
3 |
+
[MASK]
|
4 |
+
[CLS]
|
5 |
+
[SEP]
|
6 |
+
DenominatorFactor
|
7 |
+
DenominatorQuantity1-0
|
8 |
+
DenominatorQuantity1-1
|
9 |
+
DenominatorQuantity1-2
|
10 |
+
EquationAnswer
|
11 |
+
FinalAnswer-0
|
12 |
+
FinalAnswer-1
|
13 |
+
FinalAnswer-2
|
14 |
+
FinalAnswerDirection-0
|
15 |
+
FinalAnswerDirection-1
|
16 |
+
FinalAnswerDirection-2
|
17 |
+
FirstRow1:1
|
18 |
+
FirstRow1:2
|
19 |
+
FirstRow2:1
|
20 |
+
FirstRow2:2
|
21 |
+
NumeratorFactor
|
22 |
+
NumeratorQuantity1-0
|
23 |
+
NumeratorQuantity1-1
|
24 |
+
NumeratorQuantity1-2
|
25 |
+
NumeratorQuantity2-0
|
26 |
+
NumeratorQuantity2-1
|
27 |
+
NumeratorQuantity2-2
|
28 |
+
OptionalTask_1
|
29 |
+
OptionalTask_2
|
30 |
+
PercentChange-0
|
31 |
+
PercentChange-1
|
32 |
+
PercentChange-2
|
33 |
+
SecondRow
|
34 |
+
ThirdRow
|
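This vocabulary is a plain one-token-per-line file: five special tokens followed by the tutor step labels. A small sketch of reading it into a token-to-index map (the repo's own Vocab class in src/vocab.py presumably does something equivalent; this is only an illustration):

    # Sketch: build a token -> id map from the vocab file above.
    vocab_path = "ratio_proportion_change3_2223/sch_largest_100-coded/pretraining/vocab.txt"
    with open(vocab_path) as fh:
        tokens = [line.strip() for line in fh if line.strip()]
    token_to_id = {tok: idx for idx, tok in enumerate(tokens)}   # [PAD] -> 0, [UNK] -> 1, ...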
recalibration.py
ADDED
@@ -0,0 +1,82 @@
1 |
+
import torch
|
2 |
+
from torch import nn, optim
|
3 |
+
from torch.nn import functional as F
|
4 |
+
|
5 |
+
import metrics
|
6 |
+
|
7 |
+
class ModelWithTemperature(nn.Module):
|
8 |
+
"""
|
9 |
+
A thin decorator, which wraps a model with temperature scaling
|
10 |
+
model (nn.Module):
|
11 |
+
A classification neural network
|
12 |
+
NB: Output of the neural network should be the classification logits,
|
13 |
+
NOT the softmax (or log softmax)!
|
14 |
+
"""
|
15 |
+
def __init__(self, model, device="cpu"):
|
16 |
+
super(ModelWithTemperature, self).__init__()
|
17 |
+
self.model = model
|
18 |
+
self.device = torch.device(device)
|
19 |
+
self.temperature = nn.Parameter(torch.ones(1) * 1.5)
|
20 |
+
|
21 |
+
def forward(self, input):
|
22 |
+
logits = self.model(input["input"], input["segment_label"], input["feat"])
|
23 |
+
return self.temperature_scale(logits)
|
24 |
+
|
25 |
+
def temperature_scale(self, logits):
|
26 |
+
"""
|
27 |
+
Perform temperature scaling on logits
|
28 |
+
"""
|
29 |
+
# Expand temperature to match the size of logits
|
30 |
+
temperature = self.temperature.unsqueeze(1).expand(logits.size(0), logits.size(1)).to(self.device)
|
31 |
+
return logits / temperature
|
32 |
+
|
33 |
+
# This function probably should live outside of this class, but whatever
|
34 |
+
def set_temperature(self, valid_loader):
|
35 |
+
"""
|
36 |
+
Tune the temperature of the model (using the validation set).
|
37 |
+
We're going to set it to optimize NLL.
|
38 |
+
valid_loader (DataLoader): validation set loader
|
39 |
+
"""
|
40 |
+
#self.cuda()
|
41 |
+
nll_criterion = nn.CrossEntropyLoss()
|
42 |
+
ece_criterion = metrics.ECELoss()
|
43 |
+
|
44 |
+
# First: collect all the logits and labels for the validation set
|
45 |
+
logits_list = []
|
46 |
+
labels_list = []
|
47 |
+
with torch.no_grad():
|
48 |
+
for input, label in valid_loader:
|
49 |
+
# print("Input = ", input["input"])
|
50 |
+
# print("Input = ", input["segment_label"])
|
51 |
+
# print("Input = ", input["feat"])
|
52 |
+
# input = input
|
53 |
+
logits = self.model(input["input"].to(self.device), input["segment_label"].to(self.device), input["feat"].to(self.device))
|
54 |
+
logits_list.append(logits)
|
55 |
+
labels_list.append(label)
|
56 |
+
logits = torch.cat(logits_list).to(self.device)
|
57 |
+
labels = torch.cat(labels_list).to(self.device)
|
58 |
+
|
59 |
+
# Calculate NLL and ECE before temperature scaling
|
60 |
+
before_temperature_nll = nll_criterion(logits, labels).item()
|
61 |
+
before_temperature_ece = ece_criterion.loss(logits.cpu().numpy(),labels.cpu().numpy(),15)
|
62 |
+
#before_temperature_ece = ece_criterion(logits, labels).item()
|
63 |
+
#ece_2 = ece_criterion_2.loss(logits,labels)
|
64 |
+
print('Before temperature - NLL: %.3f, ECE: %.3f' % (before_temperature_nll, before_temperature_ece))
|
65 |
+
#print(ece_2)
|
66 |
+
# Next: optimize the temperature w.r.t. NLL
|
67 |
+
optimizer = optim.LBFGS([self.temperature], lr=0.005, max_iter=1000)
|
68 |
+
|
69 |
+
def eval():
|
70 |
+
loss = nll_criterion(self.temperature_scale(logits.to(self.device)), labels.to(self.device))
|
71 |
+
loss.backward()
|
72 |
+
return loss
|
73 |
+
optimizer.step(eval)
|
74 |
+
|
75 |
+
# Calculate NLL and ECE after temperature scaling
|
76 |
+
after_temperature_nll = nll_criterion(self.temperature_scale(logits), labels).item()
|
77 |
+
after_temperature_ece = ece_criterion.loss(self.temperature_scale(logits).detach().cpu().numpy(),labels.cpu().numpy(),15)
|
78 |
+
#after_temperature_ece = ece_criterion(self.temperature_scale(logits), labels).item()
|
79 |
+
print('Optimal temperature: %.3f' % self.temperature.item())
|
80 |
+
print('After temperature - NLL: %.3f, ECE: %.3f' % (after_temperature_nll, after_temperature_ece))
|
81 |
+
|
82 |
+
return self
|
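A short usage sketch for ModelWithTemperature, assuming a fine-tuned classifier whose forward takes (input, segment_label, feat) and a validation DataLoader yielding (input_dict, label) pairs, as set_temperature expects; the variable names are placeholders:

    # Sketch: calibrate a trained classifier with temperature scaling.
    scaled_model = ModelWithTemperature(finetuned_classifier, device="cpu")
    scaled_model.set_temperature(val_loader)      # fits T by minimizing NLL on the validation set
    logits = scaled_model(batch)                  # batch: dict with "input", "segment_label", "feat"
    calibrated_probs = torch.softmax(logits, dim=-1)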
src/__pycache__/attention.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/attention.cpython-312.pyc and b/src/__pycache__/attention.cpython-312.pyc differ
|
|
src/__pycache__/bert.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/bert.cpython-312.pyc and b/src/__pycache__/bert.cpython-312.pyc differ
|
|
src/__pycache__/classifier_model.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/classifier_model.cpython-312.pyc and b/src/__pycache__/classifier_model.cpython-312.pyc differ
|
|
src/__pycache__/dataset.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/dataset.cpython-312.pyc and b/src/__pycache__/dataset.cpython-312.pyc differ
|
|
src/__pycache__/embedding.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/embedding.cpython-312.pyc and b/src/__pycache__/embedding.cpython-312.pyc differ
|
|
src/__pycache__/seq_model.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/seq_model.cpython-312.pyc and b/src/__pycache__/seq_model.cpython-312.pyc differ
|
|
src/__pycache__/transformer.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/transformer.cpython-312.pyc and b/src/__pycache__/transformer.cpython-312.pyc differ
|
|
src/__pycache__/transformer_component.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/transformer_component.cpython-312.pyc and b/src/__pycache__/transformer_component.cpython-312.pyc differ
|
|
src/__pycache__/vocab.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/vocab.cpython-312.pyc and b/src/__pycache__/vocab.cpython-312.pyc differ
|
|
src/attention.py
CHANGED
@@ -3,11 +3,19 @@ import torch.nn.functional as F
|
|
3 |
import torch
|
4 |
|
5 |
import math
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
class Attention(nn.Module):
|
9 |
"""
|
10 |
Compute 'Scaled Dot Product Attention
|
|
|
11 |
"""
|
12 |
|
13 |
def __init__(self):
|
@@ -45,7 +53,10 @@ class MultiHeadedAttention(nn.Module):
|
|
45 |
self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
|
46 |
self.output_linear = nn.Linear(d_model, d_model)
|
47 |
self.attention = Attention()
|
|
|
|
|
48 |
|
|
|
49 |
self.dropout = nn.Dropout(p=dropout)
|
50 |
|
51 |
def forward(self, query, key, value, mask=None):
|
@@ -59,6 +70,14 @@ class MultiHeadedAttention(nn.Module):
|
|
59 |
query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
|
60 |
for l, x in zip(self.linear_layers, (query, key, value))]
|
61 |
# 2) Apply attention on all the projected vectors in batch.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
|
63 |
# torch.Size([64, 8, 100, 100])
|
64 |
# print("Attention", attn.shape)
|
@@ -67,4 +86,5 @@ class MultiHeadedAttention(nn.Module):
|
|
67 |
x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
|
68 |
|
69 |
return self.output_linear(x)
|
70 |
-
|
|
|
|
3 |
import torch
|
4 |
|
5 |
import math
|
6 |
+
<<<<<<< HEAD
|
7 |
+
import pickle
|
8 |
+
|
9 |
+
class Attention(nn.Module):
|
10 |
+
"""
|
11 |
+
Compute Scaled Dot Product Attention
|
12 |
+
=======
|
13 |
|
14 |
|
15 |
class Attention(nn.Module):
|
16 |
"""
|
17 |
Compute 'Scaled Dot Product Attention
|
18 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
19 |
"""
|
20 |
|
21 |
def __init__(self):
|
|
|
53 |
self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
|
54 |
self.output_linear = nn.Linear(d_model, d_model)
|
55 |
self.attention = Attention()
|
56 |
+
<<<<<<< HEAD
|
57 |
+
=======
|
58 |
|
59 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
60 |
self.dropout = nn.Dropout(p=dropout)
|
61 |
|
62 |
def forward(self, query, key, value, mask=None):
|
|
|
70 |
query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
|
71 |
for l, x in zip(self.linear_layers, (query, key, value))]
|
72 |
# 2) Apply attention on all the projected vectors in batch.
|
73 |
+
<<<<<<< HEAD
|
74 |
+
x, p_attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
|
75 |
+
|
76 |
+
# 3) "Concat" using a view and apply a final linear.
|
77 |
+
x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
|
78 |
+
|
79 |
+
return self.output_linear(x), p_attn
|
80 |
+
=======
|
81 |
x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
|
82 |
# torch.Size([64, 8, 100, 100])
|
83 |
# print("Attention", attn.shape)
|
|
|
86 |
x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
|
87 |
|
88 |
return self.output_linear(x)
|
89 |
+
|
90 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
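For reference, a self-contained sketch of the scaled dot-product attention operation that the Attention module above wraps (standard formulation, independent of this repo's classes):

    # Sketch: scaled dot-product attention.
    import math
    import torch
    import torch.nn.functional as F

    def scaled_dot_product_attention(query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)   # block masked/padded positions
        p_attn = F.softmax(scores, dim=-1)
        if dropout is not None:
            p_attn = dropout(p_attn)
        return torch.matmul(p_attn, value), p_attn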
src/bert.py
CHANGED
@@ -1,7 +1,14 @@
|
|
1 |
import torch.nn as nn
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
from transformer import TransformerBlock
|
4 |
from embedding import BERTEmbedding
|
|
|
5 |
|
6 |
class BERT(nn.Module):
|
7 |
"""
|
@@ -31,10 +38,37 @@ class BERT(nn.Module):
|
|
31 |
# multi-layers transformer blocks, deep network
|
32 |
self.transformer_blocks = nn.ModuleList(
|
33 |
[TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])
|
|
|
|
|
|
|
|
|
34 |
|
35 |
def forward(self, x, segment_info):
|
36 |
# attention masking for padded token
|
37 |
# torch.ByteTensor([batch_size, 1, seq_len, seq_len)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
|
39 |
# print("bert mask: ", mask)
|
40 |
# embedding the indexed sequence to sequence of vectors
|
@@ -43,5 +77,6 @@ class BERT(nn.Module):
|
|
43 |
# running over multiple transformer blocks
|
44 |
for transformer in self.transformer_blocks:
|
45 |
x = transformer.forward(x, mask)
|
|
|
46 |
|
47 |
return x
|
|
|
1 |
import torch.nn as nn
|
2 |
+
<<<<<<< HEAD
|
3 |
+
import torch
|
4 |
+
|
5 |
+
from .transformer import TransformerBlock
|
6 |
+
from .embedding import BERTEmbedding
|
7 |
+
=======
|
8 |
|
9 |
from transformer import TransformerBlock
|
10 |
from embedding import BERTEmbedding
|
11 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
12 |
|
13 |
class BERT(nn.Module):
|
14 |
"""
|
|
|
38 |
# multi-layers transformer blocks, deep network
|
39 |
self.transformer_blocks = nn.ModuleList(
|
40 |
[TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])
|
41 |
+
<<<<<<< HEAD
|
42 |
+
# self.attention_values = []
|
43 |
+
=======
|
44 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
45 |
|
46 |
def forward(self, x, segment_info):
|
47 |
# attention masking for padded token
|
48 |
# torch.ByteTensor([batch_size, 1, seq_len, seq_len)
|
49 |
+
<<<<<<< HEAD
|
50 |
+
|
51 |
+
device = x.device
|
52 |
+
|
53 |
+
masked = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1)
|
54 |
+
r,e,c = masked.shape
|
55 |
+
mask = torch.zeros((r, e, c), dtype=torch.bool).to(device=device)
|
56 |
+
|
57 |
+
for i in range(r):
|
58 |
+
mask[i] = masked[i].T*masked[i]
|
59 |
+
mask = mask.unsqueeze(1)
|
60 |
+
# mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
|
61 |
+
|
62 |
+
# print("bert mask: ", mask)
|
63 |
+
# embedding the indexed sequence to sequence of vectors
|
64 |
+
x = self.embedding(x, segment_info)
|
65 |
+
|
66 |
+
# self.attention_values = []
|
67 |
+
# running over multiple transformer blocks
|
68 |
+
for transformer in self.transformer_blocks:
|
69 |
+
x = transformer.forward(x, mask)
|
70 |
+
# self.attention_values.append(transformer.p_attn)
|
71 |
+
=======
|
72 |
mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
|
73 |
# print("bert mask: ", mask)
|
74 |
# embedding the indexed sequence to sequence of vectors
|
|
|
77 |
# running over multiple transformer blocks
|
78 |
for transformer in self.transformer_blocks:
|
79 |
x = transformer.forward(x, mask)
|
80 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
81 |
|
82 |
return x
|
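The HEAD branch above builds a pairwise padding mask with a Python loop over the batch (mask[i] = masked[i].T * masked[i]). An equivalent vectorized sketch that produces the same (batch, 1, seq_len, seq_len) boolean mask without the loop, offered as a possible simplification rather than the code actually committed here:

    # Sketch: vectorized pairwise padding mask.
    pad = (x > 0)                                # (batch, seq_len), True for real tokens
    mask = pad.unsqueeze(1) & pad.unsqueeze(2)   # (batch, seq_len, seq_len): both positions unpadded
    mask = mask.unsqueeze(1)                     # (batch, 1, seq_len, seq_len), broadcasts over heads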
src/classifier_model.py
CHANGED
@@ -1,16 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch.nn as nn
|
2 |
|
3 |
from bert import BERT
|
|
|
4 |
|
5 |
|
6 |
class BERTForClassification(nn.Module):
|
7 |
"""
|
|
|
|
|
|
|
8 |
Progress Classifier Model
|
|
|
9 |
"""
|
10 |
|
11 |
def __init__(self, bert: BERT, vocab_size, n_labels):
|
12 |
"""
|
13 |
:param bert: BERT model which should be trained
|
14 |
:param vocab_size: total vocab size for masked_lm
|
15 |
"""
|
16 |
|
@@ -21,4 +71,5 @@ class BERTForClassification(nn.Module):
|
|
21 |
|
22 |
def forward(self, x, segment_label):
|
23 |
x = self.bert(x, segment_label)
|
24 |
-
return x, self.linear(x[:, 0])
|
|
|
|
1 |
+
<<<<<<< HEAD
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
|
5 |
+
from .bert import BERT
|
6 |
+
=======
|
7 |
import torch.nn as nn
|
8 |
|
9 |
from bert import BERT
|
10 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
11 |
|
12 |
|
13 |
class BERTForClassification(nn.Module):
|
14 |
"""
|
15 |
+
<<<<<<< HEAD
|
16 |
+
Fine-tune Task Classifier Model
|
17 |
+
=======
|
18 |
Progress Classifier Model
|
19 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
20 |
"""
|
21 |
|
22 |
def __init__(self, bert: BERT, vocab_size, n_labels):
|
23 |
"""
|
24 |
:param bert: BERT model which should be trained
|
25 |
+
<<<<<<< HEAD
|
26 |
+
:param vocab_size: total vocab size
|
27 |
+
:param n_labels: number of labels for the task
|
28 |
+
"""
|
29 |
+
super().__init__()
|
30 |
+
self.bert = bert
|
31 |
+
self.linear = nn.Linear(self.bert.hidden, n_labels)
|
32 |
+
|
33 |
+
def forward(self, x, segment_label):
|
34 |
+
x = self.bert(x, segment_label)
|
35 |
+
return self.linear(x[:, 0])
|
36 |
+
|
37 |
+
class BERTForClassificationWithFeats(nn.Module):
|
38 |
+
"""
|
39 |
+
Fine-tune Task Classifier Model
|
40 |
+
BERT embeddings concatenated with features
|
41 |
+
"""
|
42 |
+
|
43 |
+
def __init__(self, bert: BERT, n_labels, feat_size=9):
|
44 |
+
"""
|
45 |
+
:param bert: BERT model which should be trained
|
46 |
+
:param vocab_size: total vocab size
|
47 |
+
:param n_labels: number of labels for the task
|
48 |
+
"""
|
49 |
+
super().__init__()
|
50 |
+
self.bert = bert
|
51 |
+
# self.linear1 = nn.Linear(self.bert.hidden+feat_size, 128)
|
52 |
+
self.linear = nn.Linear(self.bert.hidden+feat_size, n_labels)
|
53 |
+
# self.RELU = nn.ReLU()
|
54 |
+
# self.linear2 = nn.Linear(128, n_labels)
|
55 |
+
|
56 |
+
def forward(self, x, segment_label, feat):
|
57 |
+
x = self.bert(x, segment_label)
|
58 |
+
x = torch.cat((x[:, 0], feat), dim=-1)
|
59 |
+
# x = self.linear1(x)
|
60 |
+
# x = self.RELU(x)
|
61 |
+
# return self.linear2(x)
|
62 |
+
return self.linear(x)
|
63 |
+
=======
|
64 |
:param vocab_size: total vocab size for masked_lm
|
65 |
"""
|
66 |
|
|
|
71 |
|
72 |
def forward(self, x, segment_label):
|
73 |
x = self.bert(x, segment_label)
|
74 |
+
return x, self.linear(x[:, 0])
|
75 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
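BERTForClassificationWithFeats concatenates the [CLS] embedding with a per-sequence feature vector before the linear head. A usage sketch with made-up shapes (bert_encoder, vocab_size and the batch tensors are placeholders):

    # Sketch: forward pass through the feature-augmented classifier.
    import torch
    model = BERTForClassificationWithFeats(bert_encoder, n_labels=2, feat_size=9)
    tokens = torch.randint(1, vocab_size, (4, 50))     # (batch, seq_len) token ids
    segments = torch.ones_like(tokens)                 # single-segment inputs
    feats = torch.rand(4, 9)                           # handcrafted per-sequence features
    logits = model(tokens, segments, feats)            # (4, 2) class logits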
src/dataset.py
CHANGED
@@ -4,17 +4,28 @@ import pandas as pd
|
|
4 |
import numpy as np
|
5 |
import tqdm
|
6 |
import random
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from vocab import Vocab
|
8 |
import pickle
|
9 |
import copy
|
10 |
from sklearn.preprocessing import OneHotEncoder
|
|
|
11 |
|
12 |
class PretrainerDataset(Dataset):
|
13 |
"""
|
14 |
Class name: PretrainDataset
|
15 |
|
16 |
"""
|
|
|
|
|
|
|
17 |
def __init__(self, dataset_path, vocab, seq_len=30, select_next_seq= False):
|
|
|
18 |
self.dataset_path = dataset_path
|
19 |
self.vocab = vocab # Vocab object
|
20 |
|
@@ -35,6 +46,22 @@ class PretrainerDataset(Dataset):
|
|
35 |
self.index_documents[i] = []
|
36 |
else:
|
37 |
self.index_documents[i].append(index)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
self.lines.append(line.split())
|
39 |
len_line = len(line.split())
|
40 |
seq_len_list.append(len_line)
|
@@ -49,6 +76,7 @@ class PretrainerDataset(Dataset):
|
|
49 |
print("Sequence length set at ", self.seq_len)
|
50 |
print("select_next_seq: ", self.select_next_seq)
|
51 |
print(len(self.index_documents))
|
|
|
52 |
|
53 |
|
54 |
def __len__(self):
|
@@ -56,6 +84,53 @@ class PretrainerDataset(Dataset):
|
|
56 |
|
57 |
def __getitem__(self, item):
|
58 |
token_a = self.lines[item]
|
59 |
token_b = None
|
60 |
is_same_student = None
|
61 |
sa_masked = None
|
@@ -92,6 +167,7 @@ class PretrainerDataset(Dataset):
|
|
92 |
if self.select_next_seq:
|
93 |
output['is_same_student'] = is_same_student
|
94 |
# print(item, len(s1), len(s1_label), len(segment_label))
|
|
|
95 |
return {key: torch.tensor(value) for key, value in output.items()}
|
96 |
|
97 |
def random_mask_seq(self, tokens):
|
@@ -100,6 +176,28 @@ class PretrainerDataset(Dataset):
|
|
100 |
Output: masked token seq, output label
|
101 |
"""
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
# masked_pos_label = {}
|
104 |
output_labels = []
|
105 |
output_tokens = copy.deepcopy(tokens)
|
@@ -108,17 +206,34 @@ class PretrainerDataset(Dataset):
|
|
108 |
for i, token in enumerate(tokens):
|
109 |
prob = random.random()
|
110 |
if prob < 0.15:
|
|
|
111 |
# chooses 15% of token positions at random
|
112 |
# prob /= 0.15
|
113 |
prob = random.random()
|
114 |
if prob < 0.8: #[MASK] token 80% of the time
|
115 |
output_tokens[i] = self.vocab.vocab['[MASK]']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
elif prob < 0.9: # a random token 10% of the time
|
117 |
# print(".......0.8-0.9......")
|
118 |
output_tokens[i] = random.randint(1, len(self.vocab.vocab)-1)
|
119 |
else: # the unchanged i-th token 10% of the time
|
120 |
# print(".......unchanged......")
|
121 |
output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
|
|
122 |
# True Label
|
123 |
output_labels.append(self.vocab.vocab.get(token, self.vocab.vocab['[UNK]']))
|
124 |
# masked_pos_label[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
@@ -127,11 +242,53 @@ class PretrainerDataset(Dataset):
|
|
127 |
output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
128 |
# Padded label
|
129 |
output_labels.append(self.vocab.vocab['[PAD]'])
|
|
|
|
|
|
|
|
|
130 |
# label_position = []
|
131 |
# label_tokens = []
|
132 |
# for k, v in masked_pos_label.items():
|
133 |
# label_position.append(k)
|
134 |
# label_tokens.append(v)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
return output_tokens, output_labels
|
136 |
|
137 |
def get_token_b(self, item):
|
@@ -167,6 +324,7 @@ class PretrainerDataset(Dataset):
|
|
167 |
else:
|
168 |
sb.pop()
|
169 |
return sa, sb
|
|
|
170 |
|
171 |
class TokenizerDataset(Dataset):
|
172 |
"""
|
@@ -174,15 +332,89 @@ class TokenizerDataset(Dataset):
|
|
174 |
Tokenize the data in the dataset
|
175 |
|
176 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
def __init__(self, dataset_path, label_path, vocab, seq_len=30, train=True):
|
178 |
self.dataset_path = dataset_path
|
179 |
self.label_path = label_path
|
180 |
self.vocab = vocab # Vocab object
|
181 |
self.encoder = OneHotEncoder(sparse_output=False)
|
|
|
182 |
|
183 |
# Related to input dataset file
|
184 |
self.lines = []
|
185 |
self.labels = []
|
186 |
self.labels = []
|
187 |
|
188 |
self.label_file = open(self.label_path, "r")
|
@@ -234,11 +466,14 @@ class TokenizerDataset(Dataset):
|
|
234 |
|
235 |
self.file = open(self.dataset_path, "r")
|
236 |
# index = 0
|
|
|
237 |
for line in self.file:
|
238 |
if line:
|
239 |
line = line.strip()
|
240 |
if line:
|
241 |
self.lines.append(line)
|
|
|
|
|
242 |
# if train:
|
243 |
# if index in indices_of_zeros:
|
244 |
# # if index in indices_of_prom:
|
@@ -253,17 +488,46 @@ class TokenizerDataset(Dataset):
|
|
253 |
# self.labels.append(labels[index])
|
254 |
# self.labels.append(progress[index])
|
255 |
# index += 1
|
|
|
256 |
self.file.close()
|
257 |
|
258 |
self.len = len(self.lines)
|
259 |
self.seq_len = seq_len
|
|
|
|
|
|
|
260 |
|
261 |
print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels))
|
|
|
262 |
|
263 |
def __len__(self):
|
264 |
return self.len
|
265 |
|
266 |
def __getitem__(self, item):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
|
268 |
s1 = self.vocab.to_seq(self.lines[item], self.seq_len) # This is like tokenizer and adds [CLS] and [SEP].
|
269 |
s1_label = self.labels[item]
|
@@ -274,11 +538,132 @@ class TokenizerDataset(Dataset):
|
|
274 |
|
275 |
output = {'bert_input': s1,
|
276 |
'progress_status': s1_label,
|
|
|
277 |
'segment_label': segment_label}
|
278 |
return {key: torch.tensor(value) for key, value in output.items()}
|
279 |
|
280 |
|
281 |
# if __name__ == "__main__":
|
|
|
282 |
# # import pickle
|
283 |
# # k = pickle.load(open("dataset/CL4999_1920/unique_steps_list.pkl","rb"))
|
284 |
# # print(k)
|
|
|
4 |
import numpy as np
|
5 |
import tqdm
|
6 |
import random
|
7 |
+
<<<<<<< HEAD
|
8 |
+
from .vocab import Vocab
|
9 |
+
import pickle
|
10 |
+
import copy
|
11 |
+
# from sklearn.preprocessing import OneHotEncoder
|
12 |
+
=======
|
13 |
from vocab import Vocab
|
14 |
import pickle
|
15 |
import copy
|
16 |
from sklearn.preprocessing import OneHotEncoder
|
17 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
18 |
|
19 |
class PretrainerDataset(Dataset):
|
20 |
"""
|
21 |
Class name: PretrainDataset
|
22 |
|
23 |
"""
|
24 |
+
<<<<<<< HEAD
|
25 |
+
def __init__(self, dataset_path, vocab, seq_len=30, max_mask=0.15):
|
26 |
+
=======
|
27 |
def __init__(self, dataset_path, vocab, seq_len=30, select_next_seq= False):
|
28 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
29 |
self.dataset_path = dataset_path
|
30 |
self.vocab = vocab # Vocab object
|
31 |
|
|
|
46 |
self.index_documents[i] = []
|
47 |
else:
|
48 |
self.index_documents[i].append(index)
|
49 |
+
<<<<<<< HEAD
|
50 |
+
self.lines.append(line.split("\t"))
|
51 |
+
len_line = len(line.split("\t"))
|
52 |
+
seq_len_list.append(len_line)
|
53 |
+
index+=1
|
54 |
+
reader.close()
|
55 |
+
print("Sequence Stats: len: %s, min: %s, max: %s, average: %s"% (len(seq_len_list),
|
56 |
+
min(seq_len_list), max(seq_len_list), sum(seq_len_list)/len(seq_len_list)))
|
57 |
+
print("Unique Sequences: ", len({tuple(ll) for ll in self.lines}))
|
58 |
+
self.index_documents = {k:v for k,v in self.index_documents.items() if v}
|
59 |
+
print(len(self.index_documents))
|
60 |
+
self.seq_len = seq_len
|
61 |
+
print("Sequence length set at: ", self.seq_len)
|
62 |
+
self.max_mask = max_mask
|
63 |
+
print("% of input tokens selected for masking : ",self.max_mask)
|
64 |
+
=======
|
65 |
self.lines.append(line.split())
|
66 |
len_line = len(line.split())
|
67 |
seq_len_list.append(len_line)
|
|
|
76 |
print("Sequence length set at ", self.seq_len)
|
77 |
print("select_next_seq: ", self.select_next_seq)
|
78 |
print(len(self.index_documents))
|
79 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
80 |
|
81 |
|
82 |
def __len__(self):
|
|
|
84 |
|
85 |
def __getitem__(self, item):
|
86 |
token_a = self.lines[item]
|
87 |
+
<<<<<<< HEAD
|
88 |
+
# sa_masked = None
|
89 |
+
# sa_masked_label = None
|
90 |
+
# token_b = None
|
91 |
+
# is_same_student = None
|
92 |
+
# sb_masked = None
|
93 |
+
# sb_masked_label = None
|
94 |
+
|
95 |
+
# if self.select_next_seq:
|
96 |
+
# is_same_student, token_b = self.get_token_b(item)
|
97 |
+
# is_same_student = 1 if is_same_student else 0
|
98 |
+
# token_a1, token_b1 = self.truncate_to_max_seq(token_a, token_b)
|
99 |
+
# sa_masked, sa_masked_label = self.random_mask_seq(token_a1)
|
100 |
+
# sb_masked, sb_masked_label = self.random_mask_seq(token_b1)
|
101 |
+
# else:
|
102 |
+
token_a = token_a[:self.seq_len-2]
|
103 |
+
sa_masked, sa_masked_label, sa_masked_pos = self.random_mask_seq(token_a)
|
104 |
+
|
105 |
+
s1 = ([self.vocab.vocab['[CLS]']] + sa_masked + [self.vocab.vocab['[SEP]']])
|
106 |
+
s1_label = ([self.vocab.vocab['[PAD]']] + sa_masked_label + [self.vocab.vocab['[PAD]']])
|
107 |
+
segment_label = [1 for _ in range(len(s1))]
|
108 |
+
masked_pos = ([0] + sa_masked_pos + [0])
|
109 |
+
|
110 |
+
# if self.select_next_seq:
|
111 |
+
# s1 = s1 + sb_masked + [self.vocab.vocab['[SEP]']]
|
112 |
+
# s1_label = s1_label + sb_masked_label + [self.vocab.vocab['[PAD]']]
|
113 |
+
# segment_label = segment_label + [2 for _ in range(len(sb_masked)+1)]
|
114 |
+
|
115 |
+
padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
|
116 |
+
s1.extend(padding)
|
117 |
+
s1_label.extend(padding)
|
118 |
+
segment_label.extend(padding)
|
119 |
+
masked_pos.extend(padding)
|
120 |
+
|
121 |
+
output = {'bert_input': s1,
|
122 |
+
'bert_label': s1_label,
|
123 |
+
'segment_label': segment_label,
|
124 |
+
'masked_pos': masked_pos}
|
125 |
+
# print(f"tokenA: {token_a}")
|
126 |
+
# print(f"output: {output}")
|
127 |
+
|
128 |
+
# if self.select_next_seq:
|
129 |
+
# output['is_same_student'] = is_same_student
|
130 |
+
|
131 |
+
# print(item, len(s1), len(s1_label), len(segment_label))
|
132 |
+
# print(f"{item}.")
|
133 |
+
=======
|
134 |
token_b = None
|
135 |
is_same_student = None
|
136 |
sa_masked = None
|
|
|
167 |
if self.select_next_seq:
|
168 |
output['is_same_student'] = is_same_student
|
169 |
# print(item, len(s1), len(s1_label), len(segment_label))
|
170 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
171 |
return {key: torch.tensor(value) for key, value in output.items()}
|
172 |
|
173 |
def random_mask_seq(self, tokens):
|
|
|
176 |
Output: masked token seq, output label
|
177 |
"""
|
178 |
|
179 |
+
<<<<<<< HEAD
|
180 |
+
masked_pos = []
|
181 |
+
output_labels = []
|
182 |
+
output_tokens = copy.deepcopy(tokens)
|
183 |
+
opt_step = False
|
184 |
+
for i, token in enumerate(tokens):
|
185 |
+
if token in ['OptionalTask_1', 'EquationAnswer', 'NumeratorFactor', 'DenominatorFactor', 'OptionalTask_2', 'FirstRow1:1', 'FirstRow1:2', 'FirstRow2:1', 'FirstRow2:2', 'SecondRow', 'ThirdRow']:
|
186 |
+
opt_step = True
|
187 |
+
# if opt_step:
|
188 |
+
# prob = random.random()
|
189 |
+
# if prob < self.max_mask:
|
190 |
+
# output_tokens[i] = random.choice([3,7,8,9,11,12,13,14,15,16,22,23,24,25,26,27,30,31,32])
|
191 |
+
# masked_pos.append(1)
|
192 |
+
# else:
|
193 |
+
# output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
194 |
+
# masked_pos.append(0)
|
195 |
+
# output_labels.append(self.vocab.vocab.get(token, self.vocab.vocab['[UNK]']))
|
196 |
+
# opt_step = False
|
197 |
+
# else:
|
198 |
+
prob = random.random()
|
199 |
+
if prob < self.max_mask:
|
200 |
+
=======
|
201 |
# masked_pos_label = {}
|
202 |
output_labels = []
|
203 |
output_tokens = copy.deepcopy(tokens)
|
|
|
206 |
for i, token in enumerate(tokens):
|
207 |
prob = random.random()
|
208 |
if prob < 0.15:
|
209 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
210 |
# chooses 15% of token positions at random
|
211 |
# prob /= 0.15
|
212 |
prob = random.random()
|
213 |
if prob < 0.8: #[MASK] token 80% of the time
|
214 |
output_tokens[i] = self.vocab.vocab['[MASK]']
|
215 |
+
<<<<<<< HEAD
|
216 |
+
masked_pos.append(1)
|
217 |
+
elif prob < 0.9: # a random token 10% of the time
|
218 |
+
# print(".......0.8-0.9......")
|
219 |
+
if opt_step:
|
220 |
+
output_tokens[i] = random.choice([7,8,9,11,12,13,14,15,16,22,23,24,25,26,27,30,31,32])
|
221 |
+
opt_step = False
|
222 |
+
else:
|
223 |
+
output_tokens[i] = random.randint(1, len(self.vocab.vocab)-1)
|
224 |
+
masked_pos.append(1)
|
225 |
+
else: # the unchanged i-th token 10% of the time
|
226 |
+
# print(".......unchanged......")
|
227 |
+
output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
228 |
+
masked_pos.append(0)
|
229 |
+
=======
|
230 |
elif prob < 0.9: # a random token 10% of the time
|
231 |
# print(".......0.8-0.9......")
|
232 |
output_tokens[i] = random.randint(1, len(self.vocab.vocab)-1)
|
233 |
else: # the unchanged i-th token 10% of the time
|
234 |
# print(".......unchanged......")
|
235 |
output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
236 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
237 |
# True Label
|
238 |
output_labels.append(self.vocab.vocab.get(token, self.vocab.vocab['[UNK]']))
|
239 |
# masked_pos_label[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
|
|
242 |
output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
|
243 |
# Padded label
|
244 |
output_labels.append(self.vocab.vocab['[PAD]'])
|
245 |
+
<<<<<<< HEAD
|
246 |
+
masked_pos.append(0)
|
247 |
+
=======
|
248 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
249 |
# label_position = []
|
250 |
# label_tokens = []
|
251 |
# for k, v in masked_pos_label.items():
|
252 |
# label_position.append(k)
|
253 |
# label_tokens.append(v)
|
254 |
+
<<<<<<< HEAD
|
255 |
+
return output_tokens, output_labels, masked_pos
|
256 |
+
|
257 |
+
# def get_token_b(self, item):
|
258 |
+
# document_id = [k for k,v in self.index_documents.items() if item in v][0]
|
259 |
+
# random_document_id = document_id
|
260 |
+
|
261 |
+
# if random.random() < 0.5:
|
262 |
+
# document_ids = [k for k in self.index_documents.keys() if k != document_id]
|
263 |
+
# random_document_id = random.choice(document_ids)
|
264 |
+
|
265 |
+
# same_student = (random_document_id == document_id)
|
266 |
+
|
267 |
+
# nex_seq_list = self.index_documents.get(random_document_id)
|
268 |
+
|
269 |
+
# if same_student:
|
270 |
+
# if len(nex_seq_list) != 1:
|
271 |
+
# nex_seq_list = [v for v in nex_seq_list if v !=item]
|
272 |
+
|
273 |
+
# next_seq = random.choice(nex_seq_list)
|
274 |
+
# tokens = self.lines[next_seq]
|
275 |
+
# # print(f"item = {item}, tokens: {tokens}")
|
276 |
+
# # print(f"item={item}, next={next_seq}, same_student = {same_student}, {document_id} == {random_document_id}, b. {tokens}")
|
277 |
+
# return same_student, tokens
|
278 |
+
|
279 |
+
# def truncate_to_max_seq(self, s1, s2):
|
280 |
+
# sa = copy.deepcopy(s1)
|
281 |
+
# sb = copy.deepcopy(s1)
|
282 |
+
# total_allowed_seq = self.seq_len - 3
|
283 |
+
|
284 |
+
# while((len(sa)+len(sb)) > total_allowed_seq):
|
285 |
+
# if random.random() < 0.5:
|
286 |
+
# sa.pop()
|
287 |
+
# else:
|
288 |
+
# sb.pop()
|
289 |
+
# return sa, sb
|
290 |
+
|
291 |
+
=======
|
292 |
return output_tokens, output_labels
|
293 |
|
294 |
def get_token_b(self, item):
|
|
|
324 |
else:
|
325 |
sb.pop()
|
326 |
return sa, sb
|
327 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
328 |
|
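The masking logic above follows the usual BERT corruption recipe: roughly 15% of positions are selected, and a selected token becomes `[MASK]` 80% of the time, a random vocabulary id 10% of the time, and is left unchanged 10% of the time, while the label list keeps the true id for selected positions and `[PAD]` elsewhere. Below is a minimal standalone sketch of that recipe; a plain dict stands in for the project's Vocab object, so the lookups are an assumption, not the actual API.

import random

def mask_tokens(tokens, vocab, mask_rate=0.15):
    # Sketch of BERT-style masking: returns corrupted ids, label ids, and mask positions.
    output_tokens, output_labels, masked_pos = [], [], []
    for token in tokens:
        token_id = vocab.get(token, vocab['[UNK]'])
        if random.random() < mask_rate:            # select ~15% of positions
            roll = random.random()
            if roll < 0.8:                          # 80%: replace with [MASK]
                output_tokens.append(vocab['[MASK]'])
            elif roll < 0.9:                        # 10%: replace with a random id
                output_tokens.append(random.randint(1, len(vocab) - 1))
            else:                                   # 10%: keep the original id
                output_tokens.append(token_id)
            output_labels.append(token_id)          # true id is the prediction target
            masked_pos.append(1)
        else:
            output_tokens.append(token_id)
            output_labels.append(vocab['[PAD]'])    # padding label, skipped by the loss (ignore_index=0 in the trainer suggests [PAD] maps to 0)
            masked_pos.append(0)
    return output_tokens, output_labels, masked_pos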
329 |
class TokenizerDataset(Dataset):
|
330 |
"""
|
|
|
332 |
Tokenize the data in the dataset
|
333 |
|
334 |
"""
|
335 |
+
<<<<<<< HEAD
|
336 |
+
def __init__(self, dataset_path, label_path, vocab, seq_len=30):
|
337 |
+
self.dataset_path = dataset_path
|
338 |
+
self.label_path = label_path
|
339 |
+
self.vocab = vocab # Vocab object
|
340 |
+
# self.encoder = OneHotEncoder(sparse=False)
|
341 |
+
=======
|
342 |
def __init__(self, dataset_path, label_path, vocab, seq_len=30, train=True):
|
343 |
self.dataset_path = dataset_path
|
344 |
self.label_path = label_path
|
345 |
self.vocab = vocab # Vocab object
|
346 |
self.encoder = OneHotEncoder(sparse_output=False)
|
347 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
348 |
|
349 |
# Related to input dataset file
|
350 |
self.lines = []
|
351 |
self.labels = []
|
352 |
+
<<<<<<< HEAD
|
353 |
+
self.feats = []
|
354 |
+
if self.label_path:
|
355 |
+
self.label_file = open(self.label_path, "r")
|
356 |
+
for line in self.label_file:
|
357 |
+
if line:
|
358 |
+
line = line.strip()
|
359 |
+
if not line:
|
360 |
+
continue
|
361 |
+
self.labels.append(int(line))
|
362 |
+
self.label_file.close()
|
363 |
+
|
364 |
+
# Comment this section if you are not using feat attribute
|
365 |
+
try:
|
366 |
+
j = 0
|
367 |
+
dataset_info_file = open(self.label_path.replace("label", "info"), "r")
|
368 |
+
for line in dataset_info_file:
|
369 |
+
if line:
|
370 |
+
line = line.strip()
|
371 |
+
if not line:
|
372 |
+
continue
|
373 |
+
|
374 |
+
# # highGRschool_w_prior
|
375 |
+
# feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
|
376 |
+
|
377 |
+
# highGRschool_w_prior_w_diffskill_wo_fa
|
378 |
+
feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
|
379 |
+
feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
|
380 |
+
feat_vec.extend(feat2[1:])
|
381 |
+
|
382 |
+
# # highGRschool_w_prior_w_p_diffskill_wo_fa
|
383 |
+
# feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
|
384 |
+
# feat2 = [-float(i) for i in line.split(",")[-2].split("\t")]
|
385 |
+
# feat_vec.extend(feat2[1:])
|
386 |
+
|
387 |
+
# # highGRschool_w_prior_w_diffskill_0fa_skill
|
388 |
+
# feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
|
389 |
+
# feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
|
390 |
+
# fa_feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
|
391 |
+
|
392 |
+
# diff_skill = [f2 if f1==0 else 0 for f2, f1 in zip(feat2, fa_feat_vec)]
|
393 |
+
# feat_vec.extend(diff_skill)
|
394 |
+
|
395 |
+
if j == 0:
|
396 |
+
print(len(feat_vec))
|
397 |
+
j+=1
|
398 |
+
|
399 |
+
# feat_vec.extend(feat2[1:])
|
400 |
+
# feat_vec.extend(feat2)
|
401 |
+
# feat_vec = [float(i) for i in line.split(",")[-2].split("\t")]
|
402 |
+
# feat_vec = feat_vec[1:]
|
403 |
+
# feat_vec = [float(line.split(",")[-1])]
|
404 |
+
# feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
|
405 |
+
# feat_vec = [ft-f1 for ft, f1 in zip(feat_vec, fa_feat_vec)]
|
406 |
+
|
407 |
+
self.feats.append(feat_vec)
|
408 |
+
dataset_info_file.close()
|
409 |
+
except Exception as e:
|
410 |
+
print(e)
|
411 |
+
# labeler = np.array([0, 1]) #np.unique(self.labels)
|
412 |
+
# print(f"Labeler {labeler}")
|
413 |
+
# self.encoder.fit(labeler.reshape(-1,1))
|
414 |
+
# self.labels = self.encoder.transform(np.array(self.labels).reshape(-1,1))
|
415 |
+
|
416 |
+
self.file = open(self.dataset_path, "r")
|
417 |
+
=======
|
418 |
self.labels = []
|
419 |
|
420 |
self.label_file = open(self.label_path, "r")
|
|
|
466 |
|
467 |
self.file = open(self.dataset_path, "r")
|
468 |
# index = 0
|
469 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
470 |
for line in self.file:
|
471 |
if line:
|
472 |
line = line.strip()
|
473 |
if line:
|
474 |
self.lines.append(line)
|
475 |
+
<<<<<<< HEAD
|
476 |
+
=======
|
477 |
# if train:
|
478 |
# if index in indices_of_zeros:
|
479 |
# # if index in indices_of_prom:
|
|
|
488 |
# self.labels.append(labels[index])
|
489 |
# self.labels.append(progress[index])
|
490 |
# index += 1
|
491 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
492 |
self.file.close()
|
493 |
|
494 |
self.len = len(self.lines)
|
495 |
self.seq_len = seq_len
|
496 |
+
<<<<<<< HEAD
|
497 |
+
print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)
|
498 |
+
=======
|
499 |
|
500 |
print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels))
|
501 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
502 |
|
503 |
def __len__(self):
|
504 |
return self.len
|
505 |
|
506 |
def __getitem__(self, item):
|
507 |
+
<<<<<<< HEAD
|
508 |
+
org_line = self.lines[item].split("\t")
|
509 |
+
dup_line = []
|
510 |
+
opt = False
|
511 |
+
for l in org_line:
|
512 |
+
if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow"]:
|
513 |
+
opt = True
|
514 |
+
if opt and 'FinalAnswer-' in l:
|
515 |
+
dup_line.append('[UNK]')
|
516 |
+
else:
|
517 |
+
dup_line.append(l)
|
518 |
+
dup_line = "\t".join(dup_line)
|
519 |
+
# print(dup_line)
|
520 |
+
s1 = self.vocab.to_seq(dup_line, self.seq_len) # This is like tokenizer and adds [CLS] and [SEP].
|
521 |
+
s1_label = self.labels[item] if self.label_path else 0
|
522 |
+
segment_label = [1 for _ in range(len(s1))]
|
523 |
+
s1_feat = self.feats[item] if len(self.feats)>0 else 0
|
524 |
+
padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
|
525 |
+
s1.extend(padding), segment_label.extend(padding)
|
526 |
+
|
527 |
+
output = {'input': s1,
|
528 |
+
'label': s1_label,
|
529 |
+
'feat': s1_feat,
|
530 |
+
=======
|
531 |
|
532 |
s1 = self.vocab.to_seq(self.lines[item], self.seq_len) # This is like tokenizer and adds [CLS] and [SEP].
|
533 |
s1_label = self.labels[item]
|
|
|
538 |
|
539 |
output = {'bert_input': s1,
|
540 |
'progress_status': s1_label,
|
541 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
542 |
'segment_label': segment_label}
|
543 |
return {key: torch.tensor(value) for key, value in output.items()}
|
544 |
|
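For reference, a typical way to consume the dataset class above with a DataLoader is sketched below. The file paths are placeholders, and the Vocab construction and module paths are assumptions about src/vocab.py and src/dataset.py, which are not shown here; each batch is a dict of tensors keyed as in the HEAD branch of __getitem__ ('input', 'label', 'feat', 'segment_label').

from torch.utils.data import DataLoader
from src.dataset import TokenizerDataset   # assumed module path for the class above
from src.vocab import Vocab                # assumed: vocabulary loader from src/vocab.py

vocab = Vocab("pretraining/vocab.txt")                                  # hypothetical construction
dataset = TokenizerDataset("train.txt", "train_label.txt", vocab, seq_len=30)  # placeholder paths
loader = DataLoader(dataset, batch_size=32, shuffle=True)

batch = next(iter(loader))
# 'feat' carries the hand-crafted features read from the companion "info" file
print(batch["input"].shape, batch["segment_label"].shape)   # both (batch_size, seq_len)
print(batch["label"].shape, batch["feat"].shape)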
545 |
|
546 |
+
<<<<<<< HEAD
|
547 |
+
class TokenizerDatasetForCalibration(Dataset):
|
548 |
+
"""
|
549 |
+
Class name: TokenizerDataset
|
550 |
+
Tokenize the data in the dataset
|
551 |
+
|
552 |
+
"""
|
553 |
+
def __init__(self, dataset_path, label_path, vocab, seq_len=30):
|
554 |
+
self.dataset_path = dataset_path
|
555 |
+
self.label_path = label_path
|
556 |
+
self.vocab = vocab # Vocab object
|
557 |
+
# self.encoder = OneHotEncoder(sparse=False)
|
558 |
+
|
559 |
+
# Related to input dataset file
|
560 |
+
self.lines = []
|
561 |
+
self.labels = []
|
562 |
+
self.feats = []
|
563 |
+
if self.label_path:
|
564 |
+
self.label_file = open(self.label_path, "r")
|
565 |
+
for line in self.label_file:
|
566 |
+
if line:
|
567 |
+
line = line.strip()
|
568 |
+
if not line:
|
569 |
+
continue
|
570 |
+
self.labels.append(int(line))
|
571 |
+
self.label_file.close()
|
572 |
+
|
573 |
+
# Comment this section if you are not using feat attribute
|
574 |
+
try:
|
575 |
+
j = 0
|
576 |
+
dataset_info_file = open(self.label_path.replace("label", "info"), "r")
|
577 |
+
for line in dataset_info_file:
|
578 |
+
if line:
|
579 |
+
line = line.strip()
|
580 |
+
if not line:
|
581 |
+
continue
|
582 |
+
|
583 |
+
# # highGRschool_w_prior
|
584 |
+
# feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
|
585 |
+
|
586 |
+
# highGRschool_w_prior_w_diffskill_wo_fa
|
587 |
+
feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
|
588 |
+
feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
|
589 |
+
feat_vec.extend(feat2[1:])
|
590 |
+
|
591 |
+
# # highGRschool_w_prior_w_diffskill_0fa_skill
|
592 |
+
# feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
|
593 |
+
# feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
|
594 |
+
# fa_feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
|
595 |
+
|
596 |
+
# diff_skill = [f2 if f1==0 else 0 for f2, f1 in zip(feat2, fa_feat_vec)]
|
597 |
+
# feat_vec.extend(diff_skill)
|
598 |
+
|
599 |
+
if j == 0:
|
600 |
+
print(len(feat_vec))
|
601 |
+
j+=1
|
602 |
+
|
603 |
+
# feat_vec.extend(feat2[1:])
|
604 |
+
# feat_vec.extend(feat2)
|
605 |
+
# feat_vec = [float(i) for i in line.split(",")[-2].split("\t")]
|
606 |
+
# feat_vec = feat_vec[1:]
|
607 |
+
# feat_vec = [float(line.split(",")[-1])]
|
608 |
+
# feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
|
609 |
+
# feat_vec = [ft-f1 for ft, f1 in zip(feat_vec, fa_feat_vec)]
|
610 |
+
|
611 |
+
self.feats.append(feat_vec)
|
612 |
+
dataset_info_file.close()
|
613 |
+
except Exception as e:
|
614 |
+
print(e)
|
615 |
+
# labeler = np.array([0, 1]) #np.unique(self.labels)
|
616 |
+
# print(f"Labeler {labeler}")
|
617 |
+
# self.encoder.fit(labeler.reshape(-1,1))
|
618 |
+
# self.labels = self.encoder.transform(np.array(self.labels).reshape(-1,1))
|
619 |
+
|
620 |
+
self.file = open(self.dataset_path, "r")
|
621 |
+
for line in self.file:
|
622 |
+
if line:
|
623 |
+
line = line.strip()
|
624 |
+
if line:
|
625 |
+
self.lines.append(line)
|
626 |
+
self.file.close()
|
627 |
+
|
628 |
+
self.len = len(self.lines)
|
629 |
+
self.seq_len = seq_len
|
630 |
+
print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)
|
631 |
+
|
632 |
+
def __len__(self):
|
633 |
+
return self.len
|
634 |
+
|
635 |
+
def __getitem__(self, item):
|
636 |
+
org_line = self.lines[item].split("\t")
|
637 |
+
dup_line = []
|
638 |
+
opt = False
|
639 |
+
for l in org_line:
|
640 |
+
if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow"]:
|
641 |
+
opt = True
|
642 |
+
if opt and 'FinalAnswer-' in l:
|
643 |
+
dup_line.append('[UNK]')
|
644 |
+
else:
|
645 |
+
dup_line.append(l)
|
646 |
+
dup_line = "\t".join(dup_line)
|
647 |
+
# print(dup_line)
|
648 |
+
s1 = self.vocab.to_seq(dup_line, self.seq_len) # This is like tokenizer and adds [CLS] and [SEP].
|
649 |
+
s1_label = self.labels[item] if self.label_path else 0
|
650 |
+
segment_label = [1 for _ in range(len(s1))]
|
651 |
+
s1_feat = self.feats[item] if len(self.feats)>0 else 0
|
652 |
+
padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
|
653 |
+
s1.extend(padding), segment_label.extend(padding)
|
654 |
+
|
655 |
+
output = {'input': s1,
|
656 |
+
'label': s1_label,
|
657 |
+
'feat': s1_feat,
|
658 |
+
'segment_label': segment_label}
|
659 |
+
return ({key: torch.tensor(value) for key, value in output.items()}, s1_label)
|
660 |
+
|
661 |
+
|
662 |
+
|
663 |
+
# if __name__ == "__main__":
|
664 |
+
=======
|
665 |
# if __name__ == "__main__":
|
666 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
667 |
# # import pickle
|
668 |
# # k = pickle.load(open("dataset/CL4999_1920/unique_steps_list.pkl","rb"))
|
669 |
# # print(k)
|
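Both __getitem__ implementations above hide the student's final answer from the model once an optional-task step has been observed, by swapping any 'FinalAnswer-' step for '[UNK]' before tokenizing. A condensed sketch of just that preprocessing step:

OPTIONAL_STEPS = {
    "OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor",
    "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2",
    "SecondRow", "ThirdRow",
}

def hide_final_answer(line: str) -> str:
    # Replace FinalAnswer-* steps with [UNK] once an optional-task step has appeared.
    seen_optional = False
    out = []
    for step in line.split("\t"):
        if step in OPTIONAL_STEPS:
            seen_optional = True
        out.append("[UNK]" if seen_optional and "FinalAnswer-" in step else step)
    return "\t".join(out)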
src/pretrainer.py
CHANGED
@@ -1,5 +1,42 @@
@@ -67,6 +104,7 @@ class BERTTrainer:
@@ -79,6 +117,17 @@ class BERTTrainer:
@@ -87,15 +136,24 @@ class BERTTrainer:
@@ -106,19 +164,44 @@ class BERTTrainer:
@@ -129,6 +212,30 @@ class BERTTrainer:
@@ -155,10 +262,25 @@ class BERTTrainer:
@@ -184,10 +306,49 @@ class BERTTrainer:
@@ -249,6 +410,7 @@ class BERTTrainer:
@@ -270,7 +432,12 @@ class BERTFineTuneTrainer:
@@ -286,6 +453,302 @@ class BERTFineTuneTrainer:
@@ -320,15 +783,28 @@ class BERTFineTuneTrainer:
@@ -339,6 +815,12 @@ class BERTFineTuneTrainer:
@@ -352,6 +834,7 @@ class BERTFineTuneTrainer:
@@ -360,6 +843,28 @@ class BERTFineTuneTrainer:
@@ -390,10 +895,81 @@ class BERTFineTuneTrainer:
@@ -489,13 +1065,40 @@ class BERTFineTuneTrainer:
@@ -510,3 +1113,113 @@ class BERTFineTuneTrainer:
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
+
<<<<<<< HEAD
|
4 |
+
# from torch.nn import functional as F
|
5 |
+
from torch.optim import Adam
|
6 |
+
from torch.utils.data import DataLoader
|
7 |
+
# import pickle
|
8 |
+
|
9 |
+
from .bert import BERT
|
10 |
+
from .seq_model import BERTSM
|
11 |
+
from .classifier_model import BERTForClassification, BERTForClassificationWithFeats
|
12 |
+
from .optim_schedule import ScheduledOptim
|
13 |
+
|
14 |
+
import tqdm
|
15 |
+
import sys
|
16 |
+
import time
|
17 |
+
|
18 |
+
import numpy as np
|
19 |
+
|
20 |
+
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
|
21 |
+
|
22 |
+
import matplotlib.pyplot as plt
|
23 |
+
import seaborn as sns
|
24 |
+
import pandas as pd
|
25 |
+
from collections import defaultdict
|
26 |
+
import os
|
27 |
+
|
28 |
+
class BERTTrainer:
|
29 |
+
"""
|
30 |
+
BERTTrainer pretrains BERT model on input sequence of strategies.
|
31 |
+
BERTTrainer make the pretrained BERT model with one training method objective.
|
32 |
+
1. Masked Strategy Modeling :Masked SM
|
33 |
+
"""
|
34 |
+
|
35 |
+
def __init__(self, bert: BERT, vocab_size: int,
|
36 |
+
train_dataloader: DataLoader, val_dataloader: DataLoader = None, test_dataloader: DataLoader = None,
|
37 |
+
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=5000,
|
38 |
+
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, log_folder_path: str = None):
|
39 |
+
=======
|
40 |
from torch.nn import functional as F
|
41 |
from torch.optim import Adam, SGD
|
42 |
from torch.utils.data import DataLoader
|
|
|
104 |
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
105 |
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, same_student_prediction = False,
|
106 |
workspace_name=None):
|
107 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
108 |
"""
|
109 |
:param bert: BERT model which you want to train
|
110 |
:param vocab_size: total word vocab size
|
|
|
117 |
:param log_freq: logging frequency of the batch iteration
|
118 |
"""
|
119 |
|
120 |
+
<<<<<<< HEAD
|
121 |
+
cuda_condition = torch.cuda.is_available() and with_cuda
|
122 |
+
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
123 |
+
print(cuda_condition, " Device used = ", self.device)
|
124 |
+
|
125 |
+
available_gpus = list(range(torch.cuda.device_count()))
|
126 |
+
|
127 |
+
# This BERT model will be saved
|
128 |
+
self.bert = bert.to(self.device)
|
129 |
+
# Initialize the BERT Sequence Model, with BERT model
|
130 |
+
=======
|
131 |
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
132 |
cuda_condition = torch.cuda.is_available() and with_cuda
|
133 |
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
|
|
136 |
# This BERT model will be saved every epoch
|
137 |
self.bert = bert
|
138 |
# Initialize the BERT Language Model, with BERT model
|
139 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
140 |
self.model = BERTSM(bert, vocab_size).to(self.device)
|
141 |
|
142 |
# Distributed GPU training if CUDA can detect more than 1 GPU
|
143 |
if with_cuda and torch.cuda.device_count() > 1:
|
144 |
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
145 |
+
<<<<<<< HEAD
|
146 |
+
self.model = nn.DataParallel(self.model, device_ids=available_gpus)
|
147 |
+
|
148 |
+
# Setting the train, validation and test data loader
|
149 |
+
self.train_data = train_dataloader
|
150 |
+
self.val_data = val_dataloader
|
151 |
+
=======
|
152 |
self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
|
153 |
|
154 |
# Setting the train and test data loader
|
155 |
self.train_data = train_dataloader
|
156 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
157 |
self.test_data = test_dataloader
|
158 |
|
159 |
# Setting the Adam optimizer with hyper-param
|
|
|
164 |
self.criterion = nn.NLLLoss(ignore_index=0)
|
165 |
|
166 |
self.log_freq = log_freq
|
167 |
+
<<<<<<< HEAD
|
168 |
+
self.log_folder_path = log_folder_path
|
169 |
+
# self.workspace_name = workspace_name
|
170 |
+
self.save_model = False
|
171 |
+
# self.code = code
|
172 |
+
self.avg_loss = 10000
|
173 |
+
for fi in ['train', 'val', 'test']:
|
174 |
+
f = open(self.log_folder_path+f"/log_{fi}_pretrained.txt", 'w')
|
175 |
+
f.close()
|
176 |
+
self.start_time = time.time()
|
177 |
+
|
178 |
+
=======
|
179 |
self.same_student_prediction = same_student_prediction
|
180 |
self.workspace_name = workspace_name
|
181 |
self.save_model = False
|
182 |
self.avg_loss = 10000
|
183 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
184 |
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
185 |
|
186 |
def train(self, epoch):
|
187 |
self.iteration(epoch, self.train_data)
|
188 |
|
189 |
+
<<<<<<< HEAD
|
190 |
+
def val(self, epoch):
|
191 |
+
if epoch == 0:
|
192 |
+
self.avg_loss = 10000
|
193 |
+
self.iteration(epoch, self.val_data, phase="val")
|
194 |
+
|
195 |
+
def test(self, epoch):
|
196 |
+
self.iteration(epoch, self.test_data, phase="test")
|
197 |
+
|
198 |
+
def iteration(self, epoch, data_loader, phase="train"):
|
199 |
+
=======
|
200 |
def test(self, epoch):
|
201 |
self.iteration(epoch, self.test_data, train=False)
|
202 |
|
203 |
def iteration(self, epoch, data_loader, train=True):
|
204 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
205 |
"""
|
206 |
loop over the data_loader for training or testing
|
207 |
if on train status, backward operation is activated
|
|
|
212 |
:param train: boolean value of is train or test
|
213 |
:return: None
|
214 |
"""
|
215 |
+
<<<<<<< HEAD
|
216 |
+
|
217 |
+
# self.log_file = f"{self.workspace_name}/logs/{self.code}/log_{phase}_pretrained.txt"
|
218 |
+
# bert_hidden_representations = [] can be used
|
219 |
+
# if epoch == 0:
|
220 |
+
# f = open(self.log_file, 'w')
|
221 |
+
# f.close()
|
222 |
+
|
223 |
+
# Progress bar
|
224 |
+
data_iter = tqdm.tqdm(enumerate(data_loader),
|
225 |
+
desc="EP_%s:%d" % (phase, epoch),
|
226 |
+
total=len(data_loader),
|
227 |
+
bar_format="{l_bar}{r_bar}")
|
228 |
+
|
229 |
+
total_correct = 0
|
230 |
+
total_element = 0
|
231 |
+
avg_loss = 0.0
|
232 |
+
|
233 |
+
if phase == "train":
|
234 |
+
self.model.train()
|
235 |
+
else:
|
236 |
+
self.model.eval()
|
237 |
+
with open(self.log_folder_path+f"/log_{phase}_pretrained.txt", 'a') as f:
|
238 |
+
=======
|
239 |
str_code = "train" if train else "test"
|
240 |
code = "masked_prediction" if self.same_student_prediction else "masked"
|
241 |
|
|
|
262 |
|
263 |
avg_loss = 0.0
|
264 |
with open(self.log_file, 'a') as f:
|
265 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
266 |
sys.stdout = f
|
267 |
for i, data in data_iter:
|
268 |
# 0. batch_data will be sent into the device(GPU or cpu)
|
269 |
data = {key: value.to(self.device) for key, value in data.items()}
|
270 |
+
<<<<<<< HEAD
|
271 |
+
|
272 |
+
# 1. forward masked_sm model
|
273 |
+
# mask_sm_output is log-probabilities output
|
274 |
+
mask_sm_output, bert_hidden_rep = self.model.forward(data["bert_input"], data["segment_label"])
|
275 |
+
|
276 |
+
# 2. NLLLoss of predicting masked token word
|
277 |
+
loss = self.criterion(mask_sm_output.transpose(1, 2), data["bert_label"])
|
278 |
+
if torch.cuda.device_count() > 1:
|
279 |
+
loss = loss.mean()
|
280 |
+
|
281 |
+
# 3. backward and optimization only in train
|
282 |
+
if phase == "train":
|
283 |
+
=======
|
284 |
|
285 |
# 1. forward the next_sentence_prediction and masked_lm model
|
286 |
# next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])
|
|
|
306 |
|
307 |
# 3. backward and optimization only in train
|
308 |
if train:
|
309 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
310 |
self.optim_schedule.zero_grad()
|
311 |
loss.backward()
|
312 |
self.optim_schedule.step_and_update_lr()
|
313 |
|
314 |
+
<<<<<<< HEAD
|
315 |
+
# tokens with highest log-probabilities creates a predicted sequence
|
316 |
+
pred_tokens = torch.argmax(mask_sm_output, dim=-1)
|
317 |
+
mask_correct = (data["bert_label"] == pred_tokens) & data["masked_pos"]
|
318 |
+
|
319 |
+
total_correct += mask_correct.sum().item()
|
320 |
+
total_element += data["masked_pos"].sum().item()
|
321 |
+
avg_loss +=loss.item()
|
322 |
+
|
323 |
+
torch.cuda.empty_cache()
|
324 |
+
|
325 |
+
post_fix = {
|
326 |
+
"epoch": epoch,
|
327 |
+
"iter": i,
|
328 |
+
"avg_loss": avg_loss / (i + 1),
|
329 |
+
"avg_acc_mask": (total_correct / total_element * 100) if total_element != 0 else 0,
|
330 |
+
"loss": loss.item()
|
331 |
+
}
|
332 |
+
if i % self.log_freq == 0:
|
333 |
+
data_iter.write(str(post_fix))
|
334 |
+
|
335 |
+
end_time = time.time()
|
336 |
+
final_msg = {
|
337 |
+
"epoch": f"EP{epoch}_{phase}",
|
338 |
+
"avg_loss": avg_loss / len(data_iter),
|
339 |
+
"total_masked_acc": (total_correct / total_element * 100) if total_element != 0 else 0,
|
340 |
+
"time_taken_from_start": end_time - self.start_time
|
341 |
+
}
|
342 |
+
print(final_msg)
|
343 |
+
f.close()
|
344 |
+
sys.stdout = sys.__stdout__
|
345 |
+
|
346 |
+
if phase == "val":
|
347 |
+
self.save_model = False
|
348 |
+
if self.avg_loss > (avg_loss / len(data_iter)):
|
349 |
+
self.save_model = True
|
350 |
+
self.avg_loss = (avg_loss / len(data_iter))
|
351 |
+
=======
|
352 |
|
353 |
non_zero_mask = (data["bert_label"] != 0).float()
|
354 |
predictions = torch.argmax(mask_lm_output, dim=-1)
|
|
|
410 |
# pickle.dump(bert_hidden_representations, open(f"embeddings/{code}/{str_code}_embeddings_{epoch}.pkl","wb"))
|
411 |
|
412 |
|
413 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
414 |
|
415 |
def save(self, epoch, file_path="output/bert_trained.model"):
|
416 |
"""
|
|
|
432 |
def __init__(self, bert: BERT, vocab_size: int,
|
433 |
train_dataloader: DataLoader, test_dataloader: DataLoader = None,
|
434 |
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
435 |
+
<<<<<<< HEAD
|
436 |
+
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None,
|
437 |
+
num_labels=2, log_folder_path: str = None):
|
438 |
+
=======
|
439 |
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None, num_labels=2):
|
440 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
441 |
"""
|
442 |
:param bert: BERT model which you want to train
|
443 |
:param vocab_size: total word vocab size
|
|
|
453 |
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
454 |
cuda_condition = torch.cuda.is_available() and with_cuda
|
455 |
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
456 |
+
<<<<<<< HEAD
|
457 |
+
print(cuda_condition, " Device used = ", self.device)
|
458 |
+
|
459 |
+
available_gpus = list(range(torch.cuda.device_count()))
|
460 |
+
|
461 |
+
# This BERT model will be saved every epoch
|
462 |
+
self.bert = bert
|
463 |
+
for param in self.bert.parameters():
|
464 |
+
param.requires_grad = False
|
465 |
+
# Initialize the BERT Language Model, with BERT model
|
466 |
+
# self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
|
467 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 8).to(self.device)
|
468 |
+
self.model = BERTForClassificationWithFeats(self.bert, num_labels, 17).to(self.device)
|
469 |
+
|
470 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 1).to(self.device)
|
471 |
+
# Distributed GPU training if CUDA can detect more than 1 GPU
|
472 |
+
if with_cuda and torch.cuda.device_count() > 1:
|
473 |
+
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
474 |
+
self.model = nn.DataParallel(self.model, device_ids=available_gpus)
|
475 |
+
|
476 |
+
# Setting the train, validation and test data loader
|
477 |
+
self.train_data = train_dataloader
|
478 |
+
# self.val_data = val_dataloader
|
479 |
+
self.test_data = test_dataloader
|
480 |
+
|
481 |
+
# self.optim = Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay) #, eps=1e-9
|
482 |
+
self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
|
483 |
+
self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
|
484 |
+
# self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
|
485 |
+
self.criterion = nn.CrossEntropyLoss()
|
486 |
+
|
487 |
+
# if num_labels == 1:
|
488 |
+
# self.criterion = nn.MSELoss()
|
489 |
+
# elif num_labels == 2:
|
490 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
491 |
+
# # self.criterion = nn.CrossEntropyLoss()
|
492 |
+
# elif num_labels > 2:
|
493 |
+
# self.criterion = nn.CrossEntropyLoss()
|
494 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
495 |
+
|
496 |
+
|
497 |
+
self.log_freq = log_freq
|
498 |
+
self.log_folder_path = log_folder_path
|
499 |
+
# self.workspace_name = workspace_name
|
500 |
+
# self.finetune_task = finetune_task
|
501 |
+
self.save_model = False
|
502 |
+
self.avg_loss = 10000
|
503 |
+
self.start_time = time.time()
|
504 |
+
# self.probability_list = []
|
505 |
+
for fi in ['train', 'test']: #'val',
|
506 |
+
f = open(self.log_folder_path+f"/log_{fi}_finetuned.txt", 'w')
|
507 |
+
f.close()
|
508 |
+
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
509 |
+
|
510 |
+
def train(self, epoch):
|
511 |
+
self.iteration(epoch, self.train_data)
|
512 |
+
|
513 |
+
# def val(self, epoch):
|
514 |
+
# self.iteration(epoch, self.val_data, phase="val")
|
515 |
+
|
516 |
+
def test(self, epoch):
|
517 |
+
if epoch == 0:
|
518 |
+
self.avg_loss = 10000
|
519 |
+
self.iteration(epoch, self.test_data, phase="test")
|
520 |
+
|
521 |
+
def iteration(self, epoch, data_loader, phase="train"):
|
522 |
+
"""
|
523 |
+
loop over the data_loader for training or testing
|
524 |
+
if on train status, backward operation is activated
|
525 |
+
and also auto save the model every peoch
|
526 |
+
|
527 |
+
:param epoch: current epoch index
|
528 |
+
:param data_loader: torch.utils.data.DataLoader for iteration
|
529 |
+
:param train: boolean value of is train or test
|
530 |
+
:return: None
|
531 |
+
"""
|
532 |
+
|
533 |
+
# Setting the tqdm progress bar
|
534 |
+
data_iter = tqdm.tqdm(enumerate(data_loader),
|
535 |
+
desc="EP_%s:%d" % (phase, epoch),
|
536 |
+
total=len(data_loader),
|
537 |
+
bar_format="{l_bar}{r_bar}")
|
538 |
+
|
539 |
+
avg_loss = 0.0
|
540 |
+
total_correct = 0
|
541 |
+
total_element = 0
|
542 |
+
plabels = []
|
543 |
+
tlabels = []
|
544 |
+
probabs = []
|
545 |
+
|
546 |
+
if phase == "train":
|
547 |
+
self.model.train()
|
548 |
+
else:
|
549 |
+
self.model.eval()
|
550 |
+
# self.probability_list = []
|
551 |
+
|
552 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned.txt", 'a') as f:
|
553 |
+
sys.stdout = f
|
554 |
+
for i, data in data_iter:
|
555 |
+
# 0. batch_data will be sent into the device(GPU or cpu)
|
556 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
557 |
+
if phase == "train":
|
558 |
+
logits = self.model.forward(data["input"], data["segment_label"], data["feat"])
|
559 |
+
else:
|
560 |
+
with torch.no_grad():
|
561 |
+
logits = self.model.forward(data["input"], data["segment_label"], data["feat"])
|
562 |
+
|
563 |
+
loss = self.criterion(logits, data["label"])
|
564 |
+
if torch.cuda.device_count() > 1:
|
565 |
+
loss = loss.mean()
|
566 |
+
|
567 |
+
# 3. backward and optimization only in train
|
568 |
+
if phase == "train":
|
569 |
+
self.optim_schedule.zero_grad()
|
570 |
+
loss.backward()
|
571 |
+
self.optim_schedule.step_and_update_lr()
|
572 |
+
|
573 |
+
# prediction accuracy
|
574 |
+
probs = nn.Softmax(dim=-1)(logits) # Probabilities
|
575 |
+
probabs.extend(probs.detach().cpu().numpy().tolist())
|
576 |
+
predicted_labels = torch.argmax(probs, dim=-1) #correct
|
577 |
+
# self.probability_list.append(probs)
|
578 |
+
# true_labels = torch.argmax(data["label"], dim=-1)
|
579 |
+
plabels.extend(predicted_labels.cpu().numpy())
|
580 |
+
tlabels.extend(data['label'].cpu().numpy())
|
581 |
+
|
582 |
+
# Compare predicted labels to true labels and calculate accuracy
|
583 |
+
correct = (data['label'] == predicted_labels).sum().item()
|
584 |
+
|
585 |
+
avg_loss += loss.item()
|
586 |
+
total_correct += correct
|
587 |
+
# total_element += true_labels.nelement()
|
588 |
+
total_element += data["label"].nelement()
|
589 |
+
# print(">>>>>>>>>>>>>>", predicted_labels, true_labels, correct, total_correct, total_element)
|
590 |
+
|
591 |
+
post_fix = {
|
592 |
+
"epoch": epoch,
|
593 |
+
"iter": i,
|
594 |
+
"avg_loss": avg_loss / (i + 1),
|
595 |
+
"avg_acc": total_correct / total_element * 100 if total_element != 0 else 0,
|
596 |
+
"loss": loss.item()
|
597 |
+
}
|
598 |
+
if i % self.log_freq == 0:
|
599 |
+
data_iter.write(str(post_fix))
|
600 |
+
|
601 |
+
precisions = precision_score(tlabels, plabels, average="weighted", zero_division=0)
|
602 |
+
recalls = recall_score(tlabels, plabels, average="weighted")
|
603 |
+
f1_scores = f1_score(tlabels, plabels, average="weighted")
|
604 |
+
cmatrix = confusion_matrix(tlabels, plabels)
|
605 |
+
end_time = time.time()
|
606 |
+
final_msg = {
|
607 |
+
"epoch": f"EP{epoch}_{phase}",
|
608 |
+
"avg_loss": avg_loss / len(data_iter),
|
609 |
+
"total_acc": total_correct * 100.0 / total_element,
|
610 |
+
"precisions": precisions,
|
611 |
+
"recalls": recalls,
|
612 |
+
"f1_scores": f1_scores,
|
613 |
+
# "confusion_matrix": f"{cmatrix}",
|
614 |
+
# "true_labels": f"{tlabels}",
|
615 |
+
# "predicted_labels": f"{plabels}",
|
616 |
+
"time_taken_from_start": end_time - self.start_time
|
617 |
+
}
|
618 |
+
print(final_msg)
|
619 |
+
f.close()
|
620 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned_info.txt", 'a') as f1:
|
621 |
+
sys.stdout = f1
|
622 |
+
final_msg = {
|
623 |
+
"epoch": f"EP{epoch}_{phase}",
|
624 |
+
"confusion_matrix": f"{cmatrix}",
|
625 |
+
"true_labels": f"{tlabels if epoch == 0 else ''}",
|
626 |
+
"predicted_labels": f"{plabels}",
|
627 |
+
"probabilities": f"{probabs}",
|
628 |
+
"time_taken_from_start": end_time - self.start_time
|
629 |
+
}
|
630 |
+
print(final_msg)
|
631 |
+
f1.close()
|
632 |
+
sys.stdout = sys.__stdout__
|
633 |
+
sys.stdout = sys.__stdout__
|
634 |
+
|
635 |
+
if phase == "test":
|
636 |
+
self.save_model = False
|
637 |
+
if self.avg_loss > (avg_loss / len(data_iter)):
|
638 |
+
self.save_model = True
|
639 |
+
self.avg_loss = (avg_loss / len(data_iter))
|
640 |
+
|
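At the end of each fine-tuning pass the code above aggregates the collected true and predicted labels into weighted precision, recall, F1 and a confusion matrix. The same scikit-learn calls in isolation, on a toy pair of label lists:

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

tlabels = [0, 1, 1, 0, 1, 0]   # true labels collected over the epoch
plabels = [0, 1, 0, 0, 1, 1]   # argmax of the softmax probabilities

precisions = precision_score(tlabels, plabels, average="weighted", zero_division=0)
recalls = recall_score(tlabels, plabels, average="weighted")
f1_scores = f1_score(tlabels, plabels, average="weighted")
cmatrix = confusion_matrix(tlabels, plabels)

print(precisions, recalls, f1_scores)
print(cmatrix)   # rows = true class, columns = predicted class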
641 |
+
def iteration_1(self, epoch_idx, data):
|
642 |
+
try:
|
643 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
644 |
+
logits = self.model(data['input_ids'], data['segment_label'])
|
645 |
+
# Ensure logits is a tensor, not a tuple
|
646 |
+
loss_fct = nn.CrossEntropyLoss()
|
647 |
+
loss = loss_fct(logits, data['labels'])
|
648 |
+
|
649 |
+
# Backpropagation and optimization
|
650 |
+
self.optim.zero_grad()
|
651 |
+
loss.backward()
|
652 |
+
self.optim.step()
|
653 |
+
|
654 |
+
if self.log_freq > 0 and epoch_idx % self.log_freq == 0:
|
655 |
+
print(f"Epoch {epoch_idx}: Loss = {loss.item()}")
|
656 |
+
|
657 |
+
return loss
|
658 |
+
|
659 |
+
except Exception as e:
|
660 |
+
print(f"Error during iteration: {e}")
|
661 |
+
raise
|
662 |
+
|
663 |
+
|
664 |
+
def save(self, epoch, file_path="output/bert_fine_tuned_trained.model"):
|
665 |
+
"""
|
666 |
+
Saving the current BERT model on file_path
|
667 |
+
|
668 |
+
:param epoch: current epoch number
|
669 |
+
:param file_path: model output path which gonna be file_path+"ep%d" % epoch
|
670 |
+
:return: final_output_path
|
671 |
+
"""
|
672 |
+
output_path = file_path + ".ep%d" % epoch
|
673 |
+
torch.save(self.model.cpu(), output_path)
|
674 |
+
self.model.to(self.device)
|
675 |
+
print("EP:%d Model Saved on:" % epoch, output_path)
|
676 |
+
return output_path
|
677 |
+
|
678 |
+
class BERTFineTuneTrainer1:
|
679 |
+
|
680 |
+
def __init__(self, bert: BERT, vocab_size: int,
|
681 |
+
train_dataloader: DataLoader, test_dataloader: DataLoader = None,
|
682 |
+
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
683 |
+
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None,
|
684 |
+
num_labels=2, log_folder_path: str = None):
|
685 |
+
"""
|
686 |
+
:param bert: BERT model which you want to train
|
687 |
+
:param vocab_size: total word vocab size
|
688 |
+
:param train_dataloader: train dataset data loader
|
689 |
+
:param test_dataloader: test dataset data loader [can be None]
|
690 |
+
:param lr: learning rate of optimizer
|
691 |
+
:param betas: Adam optimizer betas
|
692 |
+
:param weight_decay: Adam optimizer weight decay param
|
693 |
+
:param with_cuda: traning with cuda
|
694 |
+
:param log_freq: logging frequency of the batch iteration
|
695 |
+
"""
|
696 |
+
|
697 |
+
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
698 |
+
cuda_condition = torch.cuda.is_available() and with_cuda
|
699 |
+
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
700 |
+
print(cuda_condition, " Device used = ", self.device)
|
701 |
+
|
702 |
+
available_gpus = list(range(torch.cuda.device_count()))
|
703 |
+
|
704 |
+
# This BERT model will be saved every epoch
|
705 |
+
self.bert = bert
|
706 |
+
for param in self.bert.parameters():
|
707 |
+
param.requires_grad = False
|
708 |
+
# Initialize the BERT Language Model, with BERT model
|
709 |
+
self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
|
710 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 8).to(self.device)
|
711 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 8*2).to(self.device)
|
712 |
+
|
713 |
+
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 1).to(self.device)
|
714 |
+
# Distributed GPU training if CUDA can detect more than 1 GPU
|
715 |
+
if with_cuda and torch.cuda.device_count() > 1:
|
716 |
+
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
717 |
+
self.model = nn.DataParallel(self.model, device_ids=available_gpus)
|
718 |
+
|
719 |
+
# Setting the train, validation and test data loader
|
720 |
+
self.train_data = train_dataloader
|
721 |
+
# self.val_data = val_dataloader
|
722 |
+
self.test_data = test_dataloader
|
723 |
+
|
724 |
+
# self.optim = Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay) #, eps=1e-9
|
725 |
+
self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
|
726 |
+
self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
|
727 |
+
# self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
|
728 |
+
self.criterion = nn.CrossEntropyLoss()
|
729 |
+
|
730 |
+
# if num_labels == 1:
|
731 |
+
# self.criterion = nn.MSELoss()
|
732 |
+
# elif num_labels == 2:
|
733 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
734 |
+
# # self.criterion = nn.CrossEntropyLoss()
|
735 |
+
# elif num_labels > 2:
|
736 |
+
# self.criterion = nn.CrossEntropyLoss()
|
737 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
738 |
+
|
739 |
+
|
740 |
+
self.log_freq = log_freq
|
741 |
+
self.log_folder_path = log_folder_path
|
742 |
+
# self.workspace_name = workspace_name
|
743 |
+
# self.finetune_task = finetune_task
|
744 |
+
self.save_model = False
|
745 |
+
self.avg_loss = 10000
|
746 |
+
self.start_time = time.time()
|
747 |
+
# self.probability_list = []
|
748 |
+
for fi in ['train', 'test']: #'val',
|
749 |
+
f = open(self.log_folder_path+f"/log_{fi}_finetuned.txt", 'w')
|
750 |
+
f.close()
|
751 |
+
=======
|
752 |
print("Device used = ", self.device)
|
753 |
|
754 |
# This BERT model will be saved every epoch
|
|
|
783 |
self.workspace_name = workspace_name
|
784 |
self.save_model = False
|
785 |
self.avg_loss = 10000
|
786 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
787 |
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
788 |
|
789 |
def train(self, epoch):
|
790 |
self.iteration(epoch, self.train_data)
|
791 |
|
792 |
+
<<<<<<< HEAD
|
793 |
+
# def val(self, epoch):
|
794 |
+
# self.iteration(epoch, self.val_data, phase="val")
|
795 |
+
|
796 |
+
def test(self, epoch):
|
797 |
+
if epoch == 0:
|
798 |
+
self.avg_loss = 10000
|
799 |
+
self.iteration(epoch, self.test_data, phase="test")
|
800 |
+
|
801 |
+
def iteration(self, epoch, data_loader, phase="train"):
|
802 |
+
=======
|
803 |
def test(self, epoch):
|
804 |
self.iteration(epoch, self.test_data, train=False)
|
805 |
|
806 |
def iteration(self, epoch, data_loader, train=True):
|
807 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
808 |
"""
|
809 |
loop over the data_loader for training or testing
|
810 |
if on train status, backward operation is activated
|
|
|
815 |
:param train: boolean value of is train or test
|
816 |
:return: None
|
817 |
"""
|
818 |
+
<<<<<<< HEAD
|
819 |
+
|
820 |
+
# Setting the tqdm progress bar
|
821 |
+
data_iter = tqdm.tqdm(enumerate(data_loader),
|
822 |
+
desc="EP_%s:%d" % (phase, epoch),
|
823 |
+
=======
|
824 |
str_code = "train" if train else "test"
|
825 |
|
826 |
self.log_file = f"{self.workspace_name}/logs/masked/log_{str_code}_FS_finetuned.txt"
|
|
|
834 |
# Setting the tqdm progress bar
|
835 |
data_iter = tqdm.tqdm(enumerate(data_loader),
|
836 |
desc="EP_%s:%d" % (str_code, epoch),
|
837 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
838 |
total=len(data_loader),
|
839 |
bar_format="{l_bar}{r_bar}")
|
840 |
|
|
|
843 |
total_element = 0
|
844 |
plabels = []
|
845 |
tlabels = []
|
846 |
+
<<<<<<< HEAD
|
847 |
+
probabs = []
|
848 |
+
|
849 |
+
if phase == "train":
|
850 |
+
self.model.train()
|
851 |
+
else:
|
852 |
+
self.model.eval()
|
853 |
+
# self.probability_list = []
|
854 |
+
|
855 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned.txt", 'a') as f:
|
856 |
+
sys.stdout = f
|
857 |
+
for i, data in data_iter:
|
858 |
+
# 0. batch_data will be sent into the device(GPU or cpu)
|
859 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
860 |
+
if phase == "train":
|
861 |
+
logits = self.model.forward(data["input"], data["segment_label"])#, data["feat"])
|
862 |
+
else:
|
863 |
+
with torch.no_grad():
|
864 |
+
logits = self.model.forward(data["input"], data["segment_label"])#, data["feat"])
|
865 |
+
|
866 |
+
loss = self.criterion(logits, data["label"])
|
867 |
+
=======
|
868 |
eval_accurate_nb = 0
|
869 |
nb_eval_examples = 0
|
870 |
logits_list = []
|
|
|
895 |
progress_loss = self.criterion(logits, data["progress_status"])
|
896 |
loss = progress_loss
|
897 |
|
898 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
899 |
if torch.cuda.device_count() > 1:
|
900 |
loss = loss.mean()
|
901 |
|
902 |
# 3. backward and optimization only in train
|
903 |
+
<<<<<<< HEAD
|
904 |
+
if phase == "train":
|
905 |
+
self.optim_schedule.zero_grad()
|
906 |
+
loss.backward()
|
907 |
+
self.optim_schedule.step_and_update_lr()
|
908 |
+
|
909 |
+
# prediction accuracy
|
910 |
+
probs = nn.Softmax(dim=-1)(logits) # Probabilities
|
911 |
+
probabs.extend(probs.detach().cpu().numpy().tolist())
|
912 |
+
predicted_labels = torch.argmax(probs, dim=-1) #correct
|
913 |
+
# self.probability_list.append(probs)
|
914 |
+
# true_labels = torch.argmax(data["label"], dim=-1)
|
915 |
+
plabels.extend(predicted_labels.cpu().numpy())
|
916 |
+
tlabels.extend(data['label'].cpu().numpy())
|
917 |
+
|
918 |
+
# Compare predicted labels to true labels and calculate accuracy
|
919 |
+
correct = (data['label'] == predicted_labels).sum().item()
|
920 |
+
|
921 |
+
avg_loss += loss.item()
|
922 |
+
total_correct += correct
|
923 |
+
# total_element += true_labels.nelement()
|
924 |
+
total_element += data["label"].nelement()
|
925 |
+
# print(">>>>>>>>>>>>>>", predicted_labels, true_labels, correct, total_correct, total_element)
|
926 |
+
|
927 |
+
post_fix = {
|
928 |
+
"epoch": epoch,
|
929 |
+
"iter": i,
|
930 |
+
"avg_loss": avg_loss / (i + 1),
|
931 |
+
"avg_acc": total_correct / total_element * 100 if total_element != 0 else 0,
|
932 |
+
"loss": loss.item()
|
933 |
+
}
|
934 |
+
if i % self.log_freq == 0:
|
935 |
+
data_iter.write(str(post_fix))
|
936 |
+
|
937 |
+
precisions = precision_score(tlabels, plabels, average="weighted", zero_division=0)
|
938 |
+
recalls = recall_score(tlabels, plabels, average="weighted")
|
939 |
+
f1_scores = f1_score(tlabels, plabels, average="weighted")
|
940 |
+
cmatrix = confusion_matrix(tlabels, plabels)
|
941 |
+
end_time = time.time()
|
942 |
+
final_msg = {
|
943 |
+
"epoch": f"EP{epoch}_{phase}",
|
944 |
+
"avg_loss": avg_loss / len(data_iter),
|
945 |
+
"total_acc": total_correct * 100.0 / total_element,
|
946 |
+
"precisions": precisions,
|
947 |
+
"recalls": recalls,
|
948 |
+
"f1_scores": f1_scores,
|
949 |
+
# "confusion_matrix": f"{cmatrix}",
|
950 |
+
# "true_labels": f"{tlabels}",
|
951 |
+
# "predicted_labels": f"{plabels}",
|
952 |
+
"time_taken_from_start": end_time - self.start_time
|
953 |
+
}
|
954 |
+
print(final_msg)
|
955 |
+
f.close()
|
956 |
+
with open(self.log_folder_path+f"/log_{phase}_finetuned_info.txt", 'a') as f1:
|
957 |
+
sys.stdout = f1
|
958 |
+
final_msg = {
|
959 |
+
"epoch": f"EP{epoch}_{phase}",
|
960 |
+
"confusion_matrix": f"{cmatrix}",
|
961 |
+
"true_labels": f"{tlabels if epoch == 0 else ''}",
|
962 |
+
"predicted_labels": f"{plabels}",
|
963 |
+
"probabilities": f"{probabs}",
|
964 |
+
"time_taken_from_start": end_time - self.start_time
|
965 |
+
}
|
966 |
+
print(final_msg)
|
967 |
+
f1.close()
|
968 |
+
sys.stdout = sys.__stdout__
|
969 |
+
sys.stdout = sys.__stdout__
|
970 |
+
|
971 |
+
if phase == "test":
|
972 |
+
=======
|
973 |
if train:
|
974 |
self.optim.zero_grad()
|
975 |
loss.backward()
|
|
|
1065 |
f.close()
|
1066 |
sys.stdout = sys.__stdout__
|
1067 |
if train:
|
1068 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
1069 |
self.save_model = False
|
1070 |
if self.avg_loss > (avg_loss / len(data_iter)):
|
1071 |
self.save_model = True
|
1072 |
self.avg_loss = (avg_loss / len(data_iter))
|
1073 |
+
<<<<<<< HEAD
|
1074 |
+
|
1075 |
+
def iteration_1(self, epoch_idx, data):
|
1076 |
+
try:
|
1077 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
1078 |
+
logits = self.model(data['input_ids'], data['segment_label'])
|
1079 |
+
# Ensure logits is a tensor, not a tuple
|
1080 |
+
loss_fct = nn.CrossEntropyLoss()
|
1081 |
+
loss = loss_fct(logits, data['labels'])
|
1082 |
+
|
1083 |
+
# Backpropagation and optimization
|
1084 |
+
self.optim.zero_grad()
|
1085 |
+
loss.backward()
|
1086 |
+
self.optim.step()
|
1087 |
+
|
1088 |
+
if self.log_freq > 0 and epoch_idx % self.log_freq == 0:
|
1089 |
+
print(f"Epoch {epoch_idx}: Loss = {loss.item()}")
|
1090 |
+
|
1091 |
+
return loss
|
1092 |
+
|
1093 |
+
except Exception as e:
|
1094 |
+
print(f"Error during iteration: {e}")
|
1095 |
+
raise
|
1096 |
+
|
1097 |
+
=======
|
1098 |
|
1099 |
# plt_test.show()
|
1100 |
# print("EP%d_%s, " % (epoch, str_code))
|
1101 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
1102 |
|
1103 |
def save(self, epoch, file_path="output/bert_fine_tuned_trained.model"):
|
1104 |
"""
|
|
|
1113 |
self.model.to(self.device)
|
1114 |
print("EP:%d Model Saved on:" % epoch, output_path)
|
1115 |
return output_path
|
1116 |
+
<<<<<<< HEAD
|
1117 |
+
|
1118 |
+
|
1119 |
+
class BERTAttention:
|
1120 |
+
def __init__(self, bert: BERT, vocab_obj, train_dataloader: DataLoader, workspace_name=None, code=None, finetune_task=None, with_cuda=True):
|
1121 |
+
|
1122 |
+
# available_gpus = list(range(torch.cuda.device_count()))
|
1123 |
+
|
1124 |
+
cuda_condition = torch.cuda.is_available() and with_cuda
|
1125 |
+
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
1126 |
+
print(with_cuda, cuda_condition, " Device used = ", self.device)
|
1127 |
+
self.bert = bert.to(self.device)
|
1128 |
+
|
1129 |
+
# if with_cuda and torch.cuda.device_count() > 1:
|
1130 |
+
# print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
1131 |
+
# self.bert = nn.DataParallel(self.bert, device_ids=available_gpus)
|
1132 |
+
|
1133 |
+
self.train_dataloader = train_dataloader
|
1134 |
+
self.workspace_name = workspace_name
|
1135 |
+
self.code = code
|
1136 |
+
self.finetune_task = finetune_task
|
1137 |
+
self.vocab_obj = vocab_obj
|
1138 |
+
|
1139 |
+
def getAttention(self):
|
1140 |
+
# self.log_file = f"{self.workspace_name}/logs/{self.code}/log_attention.txt"
|
1141 |
+
|
1142 |
+
|
1143 |
+
labels = ['PercentChange', 'NumeratorQuantity2', 'NumeratorQuantity1', 'DenominatorQuantity1',
|
1144 |
+
'OptionalTask_1', 'EquationAnswer', 'NumeratorFactor', 'DenominatorFactor',
|
1145 |
+
'OptionalTask_2', 'FirstRow1:1', 'FirstRow1:2', 'FirstRow2:1', 'FirstRow2:2', 'SecondRow',
|
1146 |
+
'ThirdRow', 'FinalAnswer','FinalAnswerDirection']
|
1147 |
+
df_all = pd.DataFrame(0.0, index=labels, columns=labels)
|
1148 |
+
# Setting the tqdm progress bar
|
1149 |
+
data_iter = tqdm.tqdm(enumerate(self.train_dataloader),
|
1150 |
+
desc="attention",
|
1151 |
+
total=len(self.train_dataloader),
|
1152 |
+
bar_format="{l_bar}{r_bar}")
|
1153 |
+
count = 0
|
1154 |
+
for i, data in data_iter:
|
1155 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
1156 |
+
a = self.bert.forward(data["bert_input"], data["segment_label"])
|
1157 |
+
non_zero = np.sum(data["segment_label"].cpu().detach().numpy())
|
1158 |
+
|
1159 |
+
# Last Transformer Layer
|
1160 |
+
last_layer = self.bert.attention_values[-1].transpose(1,0,2,3)
|
1161 |
+
# print(last_layer.shape)
|
1162 |
+
head, d_model, s, s = last_layer.shape
|
1163 |
+
|
1164 |
+
for d in range(d_model):
|
1165 |
+
seq_labels = self.vocab_obj.to_sentence(data["bert_input"].cpu().detach().numpy().tolist()[d])[1:non_zero-1]
|
1166 |
+
# df_all = pd.DataFrame(0.0, index=seq_labels, columns=seq_labels)
|
1167 |
+
indices_to_choose = defaultdict(int)
|
1168 |
+
|
1169 |
+
for k,s in enumerate(seq_labels):
|
1170 |
+
if s in labels:
|
1171 |
+
indices_to_choose[s] = k
|
1172 |
+
indices_chosen = list(indices_to_choose.values())
|
1173 |
+
selected_seq_labels = [s for l,s in enumerate(seq_labels) if l in indices_chosen]
|
1174 |
+
# print(len(seq_labels), len(selected_seq_labels))
|
1175 |
+
for h in range(head):
|
1176 |
+
# fig, ax = plt.subplots(figsize=(12, 12))
|
1177 |
+
# seq_labels = self.vocab_obj.to_sentence(data["bert_input"].cpu().detach().numpy().tolist()[d])#[1:non_zero-1]
|
1178 |
+
# seq_labels = self.vocab_obj.to_sentence(data["bert_input"].cpu().detach().numpy().tolist()[d])[1:non_zero-1]
|
1179 |
+
# indices_to_choose = defaultdict(int)
|
1180 |
+
|
1181 |
+
# for k,s in enumerate(seq_labels):
|
1182 |
+
# if s in labels:
|
1183 |
+
# indices_to_choose[s] = k
|
1184 |
+
# indices_chosen = list(indices_to_choose.values())
|
1185 |
+
# selected_seq_labels = [s for l,s in enumerate(seq_labels) if l in indices_chosen]
|
1186 |
+
# print(f"Chosen index: {seq_labels, indices_to_choose, indices_chosen, selected_seq_labels}")
|
1187 |
+
|
1188 |
+
df_cm = pd.DataFrame(last_layer[h][d][indices_chosen,:][:,indices_chosen], index = selected_seq_labels, columns = selected_seq_labels)
|
1189 |
+
df_all = df_all.add(df_cm, fill_value=0)
|
1190 |
+
count += 1
|
1191 |
+
|
1192 |
+
# df_cm = pd.DataFrame(last_layer[h][d][1:non_zero-1,:][:,1:non_zero-1], index=seq_labels, columns=seq_labels)
|
1193 |
+
# df_all = df_all.add(df_cm, fill_value=0)
|
1194 |
+
|
1195 |
+
# df_all = df_all.reindex(index=seq_labels, columns=seq_labels)
|
1196 |
+
# sns.heatmap(df_all, annot=False)
|
1197 |
+
# plt.title("Attentions") #Probabilities
|
1198 |
+
# plt.xlabel("Steps")
|
1199 |
+
# plt.ylabel("Steps")
|
1200 |
+
# plt.grid(True)
|
1201 |
+
# plt.tick_params(axis='x', bottom=False, top=True, labelbottom=False, labeltop=True, labelrotation=90)
|
1202 |
+
# plt.savefig(f"{self.workspace_name}/plots/{self.code}/{self.finetune_task}_attention_scores_over_[{h}]_head_n_data[{d}].png", bbox_inches='tight')
|
1203 |
+
# plt.show()
|
1204 |
+
# plt.close()
|
1205 |
+
|
1206 |
+
|
1207 |
+
|
1208 |
+
print(f"Count of total : {count, head * self.train_dataloader.dataset.len}")
|
1209 |
+
df_all = df_all.div(count) # head * self.train_dataloader.dataset.len
|
1210 |
+
df_all = df_all.reindex(index=labels, columns=labels)
|
1211 |
+
sns.heatmap(df_all, annot=False)
|
1212 |
+
plt.title("Attentions") #Probabilities
|
1213 |
+
plt.xlabel("Steps")
|
1214 |
+
plt.ylabel("Steps")
|
1215 |
+
plt.grid(True)
|
1216 |
+
plt.tick_params(axis='x', bottom=False, top=True, labelbottom=False, labeltop=True, labelrotation=90)
|
1217 |
+
plt.savefig(f"{self.workspace_name}/plots/{self.code}/{self.finetune_task}_attention_scores.png", bbox_inches='tight')
|
1218 |
+
plt.show()
|
1219 |
+
plt.close()
|
1220 |
+
|
1221 |
+
|
1222 |
+
|
1223 |
+
|
1224 |
+
=======
|
1225 |
+
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
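For orientation, here is a minimal driver sketch for the BERTAttention class above. Only BERTAttention itself, its constructor signature, and the BERT/DataLoader types come from this commit; the vocab wrapper, dataset class, file paths, and checkpoint name are assumptions made purely for illustration.

# Hypothetical usage sketch; names marked "assumed" are not part of this commit.
import torch
from torch.utils.data import DataLoader

from src.bert import BERT
from src.pretrainer import BERTAttention          # the class defined in this file
from src.vocab import Vocab                       # assumed vocab wrapper
from src.dataset import TokenizerDataset          # assumed dataset class

vocab = Vocab("vocab.txt")                        # illustrative path
dataset = TokenizerDataset("train.txt", vocab, seq_len=128)   # illustrative arguments
loader = DataLoader(dataset, batch_size=32)

bert = BERT(vocab_size=len(vocab.vocab), hidden=768, n_layers=4, attn_heads=8)   # attribute access assumed
bert.load_state_dict(torch.load("output/bert_trained.model", map_location="cpu"))  # assumed checkpoint path

attention = BERTAttention(bert, vocab, loader,
                          workspace_name="workspace", code="run1",
                          finetune_task="hint", with_cuda=torch.cuda.is_available())
attention.getAttention()   # aggregates last-layer attention and writes the heatmap PNG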
src/reference_code/bert_reference_code.py
ADDED
@@ -0,0 +1,1622 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
3 |
+
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
"""PyTorch BERT model. """
|
17 |
+
|
18 |
+
|
19 |
+
import logging
|
20 |
+
import math
|
21 |
+
import os
|
22 |
+
import warnings
|
23 |
+
|
24 |
+
import torch
|
25 |
+
import torch.utils.checkpoint
|
26 |
+
from torch import nn
|
27 |
+
from torch.nn import CrossEntropyLoss, MSELoss
|
28 |
+
|
29 |
+
from .activations import gelu, gelu_new, swish
|
30 |
+
from .configuration_bert import BertConfig
|
31 |
+
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
|
32 |
+
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
|
33 |
+
|
34 |
+
|
35 |
+
logger = logging.getLogger(__name__)
|
36 |
+
|
37 |
+
_TOKENIZER_FOR_DOC = "BertTokenizer"
|
38 |
+
|
39 |
+
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
40 |
+
"bert-base-uncased",
|
41 |
+
"bert-large-uncased",
|
42 |
+
"bert-base-cased",
|
43 |
+
"bert-large-cased",
|
44 |
+
"bert-base-multilingual-uncased",
|
45 |
+
"bert-base-multilingual-cased",
|
46 |
+
"bert-base-chinese",
|
47 |
+
"bert-base-german-cased",
|
48 |
+
"bert-large-uncased-whole-word-masking",
|
49 |
+
"bert-large-cased-whole-word-masking",
|
50 |
+
"bert-large-uncased-whole-word-masking-finetuned-squad",
|
51 |
+
"bert-large-cased-whole-word-masking-finetuned-squad",
|
52 |
+
"bert-base-cased-finetuned-mrpc",
|
53 |
+
"bert-base-german-dbmdz-cased",
|
54 |
+
"bert-base-german-dbmdz-uncased",
|
55 |
+
"cl-tohoku/bert-base-japanese",
|
56 |
+
"cl-tohoku/bert-base-japanese-whole-word-masking",
|
57 |
+
"cl-tohoku/bert-base-japanese-char",
|
58 |
+
"cl-tohoku/bert-base-japanese-char-whole-word-masking",
|
59 |
+
"TurkuNLP/bert-base-finnish-cased-v1",
|
60 |
+
"TurkuNLP/bert-base-finnish-uncased-v1",
|
61 |
+
"wietsedv/bert-base-dutch-cased",
|
62 |
+
# See all BERT models at https://huggingface.co/models?filter=bert
|
63 |
+
]
|
64 |
+
|
65 |
+
|
66 |
+
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
67 |
+
""" Load tf checkpoints in a pytorch model.
|
68 |
+
"""
|
69 |
+
try:
|
70 |
+
import re
|
71 |
+
import numpy as np
|
72 |
+
import tensorflow as tf
|
73 |
+
except ImportError:
|
74 |
+
logger.error(
|
75 |
+
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
|
76 |
+
"https://www.tensorflow.org/install/ for installation instructions."
|
77 |
+
)
|
78 |
+
raise
|
79 |
+
tf_path = os.path.abspath(tf_checkpoint_path)
|
80 |
+
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
|
81 |
+
# Load weights from TF model
|
82 |
+
init_vars = tf.train.list_variables(tf_path)
|
83 |
+
names = []
|
84 |
+
arrays = []
|
85 |
+
for name, shape in init_vars:
|
86 |
+
logger.info("Loading TF weight {} with shape {}".format(name, shape))
|
87 |
+
array = tf.train.load_variable(tf_path, name)
|
88 |
+
names.append(name)
|
89 |
+
arrays.append(array)
|
90 |
+
|
91 |
+
for name, array in zip(names, arrays):
|
92 |
+
name = name.split("/")
|
93 |
+
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
|
94 |
+
# which are not required for using pretrained model
|
95 |
+
if any(
|
96 |
+
n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
|
97 |
+
for n in name
|
98 |
+
):
|
99 |
+
logger.info("Skipping {}".format("/".join(name)))
|
100 |
+
continue
|
101 |
+
pointer = model
|
102 |
+
for m_name in name:
|
103 |
+
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
|
104 |
+
scope_names = re.split(r"_(\d+)", m_name)
|
105 |
+
else:
|
106 |
+
scope_names = [m_name]
|
107 |
+
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
|
108 |
+
pointer = getattr(pointer, "weight")
|
109 |
+
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
|
110 |
+
pointer = getattr(pointer, "bias")
|
111 |
+
elif scope_names[0] == "output_weights":
|
112 |
+
pointer = getattr(pointer, "weight")
|
113 |
+
elif scope_names[0] == "squad":
|
114 |
+
pointer = getattr(pointer, "classifier")
|
115 |
+
else:
|
116 |
+
try:
|
117 |
+
pointer = getattr(pointer, scope_names[0])
|
118 |
+
except AttributeError:
|
119 |
+
logger.info("Skipping {}".format("/".join(name)))
|
120 |
+
continue
|
121 |
+
if len(scope_names) >= 2:
|
122 |
+
num = int(scope_names[1])
|
123 |
+
pointer = pointer[num]
|
124 |
+
if m_name[-11:] == "_embeddings":
|
125 |
+
pointer = getattr(pointer, "weight")
|
126 |
+
elif m_name == "kernel":
|
127 |
+
array = np.transpose(array)
|
128 |
+
try:
|
129 |
+
assert pointer.shape == array.shape
|
130 |
+
except AssertionError as e:
|
131 |
+
e.args += (pointer.shape, array.shape)
|
132 |
+
raise
|
133 |
+
logger.info("Initialize PyTorch weight {}".format(name))
|
134 |
+
pointer.data = torch.from_numpy(array)
|
135 |
+
return model
|
136 |
+
|
137 |
+
|
138 |
+
def mish(x):
|
139 |
+
return x * torch.tanh(nn.functional.softplus(x))
|
140 |
+
|
141 |
+
|
142 |
+
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish}
|
143 |
+
|
144 |
+
|
145 |
+
BertLayerNorm = torch.nn.LayerNorm
|
146 |
+
|
147 |
+
|
148 |
+
class BertEmbeddings(nn.Module):
|
149 |
+
"""Construct the embeddings from word, position and token_type embeddings.
|
150 |
+
"""
|
151 |
+
|
152 |
+
def __init__(self, config):
|
153 |
+
super().__init__()
|
154 |
+
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
|
155 |
+
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
156 |
+
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
157 |
+
|
158 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
159 |
+
# any TensorFlow checkpoint file
|
160 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
161 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
162 |
+
|
163 |
+
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
|
164 |
+
if input_ids is not None:
|
165 |
+
input_shape = input_ids.size()
|
166 |
+
else:
|
167 |
+
input_shape = inputs_embeds.size()[:-1]
|
168 |
+
|
169 |
+
seq_length = input_shape[1]
|
170 |
+
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
171 |
+
if position_ids is None:
|
172 |
+
position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
|
173 |
+
position_ids = position_ids.unsqueeze(0).expand(input_shape)
|
174 |
+
if token_type_ids is None:
|
175 |
+
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
176 |
+
|
177 |
+
if inputs_embeds is None:
|
178 |
+
inputs_embeds = self.word_embeddings(input_ids)
|
179 |
+
position_embeddings = self.position_embeddings(position_ids)
|
180 |
+
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
181 |
+
|
182 |
+
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
|
183 |
+
embeddings = self.LayerNorm(embeddings)
|
184 |
+
embeddings = self.dropout(embeddings)
|
185 |
+
return embeddings
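To make the embedding arithmetic above concrete, here is a small standalone sketch (dimensions chosen to match the usual bert-base defaults; the real module additionally applies LayerNorm and dropout). This is an illustration, not part of the reference file.

# Standalone sketch: word + position + token-type embeddings are summed element-wise.
import torch
import torch.nn as nn

vocab_size, max_pos, type_vocab, hidden = 30522, 512, 2, 768
word = nn.Embedding(vocab_size, hidden, padding_idx=0)
pos = nn.Embedding(max_pos, hidden)
tok_type = nn.Embedding(type_vocab, hidden)

input_ids = torch.tensor([[101, 7592, 2088, 102]])            # (batch, seq_len)
position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)   # 0..seq_len-1
token_type_ids = torch.zeros_like(input_ids)                  # all "sentence A"

emb = word(input_ids) + pos(position_ids) + tok_type(token_type_ids)
print(emb.shape)   # torch.Size([1, 4, 768])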
|
186 |
+
|
187 |
+
|
188 |
+
class BertSelfAttention(nn.Module):
|
189 |
+
def __init__(self, config):
|
190 |
+
super().__init__()
|
191 |
+
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
|
192 |
+
raise ValueError(
|
193 |
+
"The hidden size (%d) is not a multiple of the number of attention "
|
194 |
+
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
|
195 |
+
)
|
196 |
+
|
197 |
+
self.num_attention_heads = config.num_attention_heads
|
198 |
+
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
199 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
200 |
+
|
201 |
+
self.query = nn.Linear(config.hidden_size, self.all_head_size)
|
202 |
+
self.key = nn.Linear(config.hidden_size, self.all_head_size)
|
203 |
+
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
204 |
+
|
205 |
+
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
206 |
+
|
207 |
+
def transpose_for_scores(self, x):
|
208 |
+
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
209 |
+
x = x.view(*new_x_shape)
|
210 |
+
return x.permute(0, 2, 1, 3)
|
211 |
+
|
212 |
+
def forward(
|
213 |
+
self,
|
214 |
+
hidden_states,
|
215 |
+
attention_mask=None,
|
216 |
+
head_mask=None,
|
217 |
+
encoder_hidden_states=None,
|
218 |
+
encoder_attention_mask=None,
|
219 |
+
output_attentions=False,
|
220 |
+
):
|
221 |
+
mixed_query_layer = self.query(hidden_states)
|
222 |
+
|
223 |
+
# If this is instantiated as a cross-attention module, the keys
|
224 |
+
# and values come from an encoder; the attention mask needs to be
|
225 |
+
# such that the encoder's padding tokens are not attended to.
|
226 |
+
if encoder_hidden_states is not None:
|
227 |
+
mixed_key_layer = self.key(encoder_hidden_states)
|
228 |
+
mixed_value_layer = self.value(encoder_hidden_states)
|
229 |
+
attention_mask = encoder_attention_mask
|
230 |
+
else:
|
231 |
+
mixed_key_layer = self.key(hidden_states)
|
232 |
+
mixed_value_layer = self.value(hidden_states)
|
233 |
+
|
234 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
235 |
+
key_layer = self.transpose_for_scores(mixed_key_layer)
|
236 |
+
value_layer = self.transpose_for_scores(mixed_value_layer)
|
237 |
+
|
238 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
239 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
240 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
241 |
+
if attention_mask is not None:
|
242 |
+
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
243 |
+
attention_scores = attention_scores + attention_mask
|
244 |
+
|
245 |
+
# Normalize the attention scores to probabilities.
|
246 |
+
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
247 |
+
|
248 |
+
# This is actually dropping out entire tokens to attend to, which might
|
249 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
250 |
+
attention_probs = self.dropout(attention_probs)
|
251 |
+
|
252 |
+
# Mask heads if we want to
|
253 |
+
if head_mask is not None:
|
254 |
+
attention_probs = attention_probs * head_mask
|
255 |
+
|
256 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
257 |
+
|
258 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
259 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
260 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
261 |
+
|
262 |
+
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
|
263 |
+
return outputs
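As a quick illustration of the shape bookkeeping in BertSelfAttention above (the head split in transpose_for_scores and the scaled dot-product scores), here is a standalone sketch; it is not part of the reference file.

# Standalone sketch of the head split + scaled dot-product used above.
import math
import torch

batch, seq_len, hidden, heads = 2, 8, 768, 12
head_size = hidden // heads                        # 64

x = torch.randn(batch, seq_len, hidden)
# transpose_for_scores: (batch, seq, hidden) -> (batch, heads, seq, head_size)
q = x.view(batch, seq_len, heads, head_size).permute(0, 2, 1, 3)
k = x.view(batch, seq_len, heads, head_size).permute(0, 2, 1, 3)

scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_size)
probs = scores.softmax(dim=-1)                     # (batch, heads, seq, seq)
print(probs.shape)                                 # torch.Size([2, 12, 8, 8])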
|
264 |
+
|
265 |
+
|
266 |
+
class BertSelfOutput(nn.Module):
|
267 |
+
def __init__(self, config):
|
268 |
+
super().__init__()
|
269 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
270 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
271 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
272 |
+
|
273 |
+
def forward(self, hidden_states, input_tensor):
|
274 |
+
hidden_states = self.dense(hidden_states)
|
275 |
+
hidden_states = self.dropout(hidden_states)
|
276 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
277 |
+
return hidden_states
|
278 |
+
|
279 |
+
|
280 |
+
class BertAttention(nn.Module):
|
281 |
+
def __init__(self, config):
|
282 |
+
super().__init__()
|
283 |
+
self.self = BertSelfAttention(config)
|
284 |
+
self.output = BertSelfOutput(config)
|
285 |
+
self.pruned_heads = set()
|
286 |
+
|
287 |
+
def prune_heads(self, heads):
|
288 |
+
if len(heads) == 0:
|
289 |
+
return
|
290 |
+
heads, index = find_pruneable_heads_and_indices(
|
291 |
+
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
|
292 |
+
)
|
293 |
+
|
294 |
+
# Prune linear layers
|
295 |
+
self.self.query = prune_linear_layer(self.self.query, index)
|
296 |
+
self.self.key = prune_linear_layer(self.self.key, index)
|
297 |
+
self.self.value = prune_linear_layer(self.self.value, index)
|
298 |
+
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
299 |
+
|
300 |
+
# Update hyper params and store pruned heads
|
301 |
+
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
302 |
+
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
303 |
+
self.pruned_heads = self.pruned_heads.union(heads)
|
304 |
+
|
305 |
+
def forward(
|
306 |
+
self,
|
307 |
+
hidden_states,
|
308 |
+
attention_mask=None,
|
309 |
+
head_mask=None,
|
310 |
+
encoder_hidden_states=None,
|
311 |
+
encoder_attention_mask=None,
|
312 |
+
output_attentions=False,
|
313 |
+
):
|
314 |
+
self_outputs = self.self(
|
315 |
+
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
|
316 |
+
)
|
317 |
+
|
318 |
+
attention_output = self.output(self_outputs[0], hidden_states)
|
319 |
+
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
320 |
+
return outputs
|
321 |
+
|
322 |
+
|
323 |
+
class BertIntermediate(nn.Module):
|
324 |
+
def __init__(self, config):
|
325 |
+
super().__init__()
|
326 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
327 |
+
if isinstance(config.hidden_act, str):
|
328 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
329 |
+
else:
|
330 |
+
self.intermediate_act_fn = config.hidden_act
|
331 |
+
|
332 |
+
def forward(self, hidden_states):
|
333 |
+
hidden_states = self.dense(hidden_states)
|
334 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
335 |
+
return hidden_states
|
336 |
+
|
337 |
+
|
338 |
+
class BertOutput(nn.Module):
|
339 |
+
def __init__(self, config):
|
340 |
+
super().__init__()
|
341 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
342 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
343 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
344 |
+
|
345 |
+
def forward(self, hidden_states, input_tensor):
|
346 |
+
hidden_states = self.dense(hidden_states)
|
347 |
+
hidden_states = self.dropout(hidden_states)
|
348 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
349 |
+
return hidden_states
|
350 |
+
|
351 |
+
|
352 |
+
class BertLayer(nn.Module):
|
353 |
+
def __init__(self, config):
|
354 |
+
super().__init__()
|
355 |
+
self.attention = BertAttention(config)
|
356 |
+
self.is_decoder = config.is_decoder
|
357 |
+
if self.is_decoder:
|
358 |
+
self.crossattention = BertAttention(config)
|
359 |
+
self.intermediate = BertIntermediate(config)
|
360 |
+
self.output = BertOutput(config)
|
361 |
+
|
362 |
+
def forward(
|
363 |
+
self,
|
364 |
+
hidden_states,
|
365 |
+
attention_mask=None,
|
366 |
+
head_mask=None,
|
367 |
+
encoder_hidden_states=None,
|
368 |
+
encoder_attention_mask=None,
|
369 |
+
output_attentions=False,
|
370 |
+
):
|
371 |
+
self_attention_outputs = self.attention(
|
372 |
+
hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
|
373 |
+
)
|
374 |
+
attention_output = self_attention_outputs[0]
|
375 |
+
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
|
376 |
+
|
377 |
+
if self.is_decoder and encoder_hidden_states is not None:
|
378 |
+
cross_attention_outputs = self.crossattention(
|
379 |
+
attention_output,
|
380 |
+
attention_mask,
|
381 |
+
head_mask,
|
382 |
+
encoder_hidden_states,
|
383 |
+
encoder_attention_mask,
|
384 |
+
output_attentions,
|
385 |
+
)
|
386 |
+
attention_output = cross_attention_outputs[0]
|
387 |
+
outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights
|
388 |
+
|
389 |
+
intermediate_output = self.intermediate(attention_output)
|
390 |
+
layer_output = self.output(intermediate_output, attention_output)
|
391 |
+
outputs = (layer_output,) + outputs
|
392 |
+
return outputs
|
393 |
+
|
394 |
+
|
395 |
+
class BertEncoder(nn.Module):
|
396 |
+
def __init__(self, config):
|
397 |
+
super().__init__()
|
398 |
+
self.config = config
|
399 |
+
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
400 |
+
|
401 |
+
def forward(
|
402 |
+
self,
|
403 |
+
hidden_states,
|
404 |
+
attention_mask=None,
|
405 |
+
head_mask=None,
|
406 |
+
encoder_hidden_states=None,
|
407 |
+
encoder_attention_mask=None,
|
408 |
+
output_attentions=False,
|
409 |
+
output_hidden_states=False,
|
410 |
+
):
|
411 |
+
all_hidden_states = ()
|
412 |
+
all_attentions = ()
|
413 |
+
for i, layer_module in enumerate(self.layer):
|
414 |
+
if output_hidden_states:
|
415 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
416 |
+
|
417 |
+
if getattr(self.config, "gradient_checkpointing", False):
|
418 |
+
|
419 |
+
def create_custom_forward(module):
|
420 |
+
def custom_forward(*inputs):
|
421 |
+
return module(*inputs, output_attentions)
|
422 |
+
|
423 |
+
return custom_forward
|
424 |
+
|
425 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
426 |
+
create_custom_forward(layer_module),
|
427 |
+
hidden_states,
|
428 |
+
attention_mask,
|
429 |
+
head_mask[i],
|
430 |
+
encoder_hidden_states,
|
431 |
+
encoder_attention_mask,
|
432 |
+
)
|
433 |
+
else:
|
434 |
+
layer_outputs = layer_module(
|
435 |
+
hidden_states,
|
436 |
+
attention_mask,
|
437 |
+
head_mask[i],
|
438 |
+
encoder_hidden_states,
|
439 |
+
encoder_attention_mask,
|
440 |
+
output_attentions,
|
441 |
+
)
|
442 |
+
hidden_states = layer_outputs[0]
|
443 |
+
|
444 |
+
if output_attentions:
|
445 |
+
all_attentions = all_attentions + (layer_outputs[1],)
|
446 |
+
|
447 |
+
# Add last layer
|
448 |
+
if output_hidden_states:
|
449 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
450 |
+
|
451 |
+
outputs = (hidden_states,)
|
452 |
+
if output_hidden_states:
|
453 |
+
outputs = outputs + (all_hidden_states,)
|
454 |
+
if output_attentions:
|
455 |
+
outputs = outputs + (all_attentions,)
|
456 |
+
return outputs # last-layer hidden state, (all hidden states), (all attentions)
|
457 |
+
|
458 |
+
|
459 |
+
class BertPooler(nn.Module):
|
460 |
+
def __init__(self, config):
|
461 |
+
super().__init__()
|
462 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
463 |
+
self.activation = nn.Tanh()
|
464 |
+
|
465 |
+
def forward(self, hidden_states):
|
466 |
+
# We "pool" the model by simply taking the hidden state corresponding
|
467 |
+
# to the first token.
|
468 |
+
first_token_tensor = hidden_states[:, 0]
|
469 |
+
pooled_output = self.dense(first_token_tensor)
|
470 |
+
pooled_output = self.activation(pooled_output)
|
471 |
+
return pooled_output
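The pooling step above is simply "take the hidden state of the first ([CLS]) token and pass it through a dense layer plus tanh". A standalone shape sketch, for illustration only:

# Standalone sketch of BertPooler's first-token pooling.
import torch
import torch.nn as nn

hidden_states = torch.randn(2, 8, 768)             # (batch, seq_len, hidden)
dense = nn.Linear(768, 768)
pooled = torch.tanh(dense(hidden_states[:, 0]))    # first-token state -> (batch, hidden)
print(pooled.shape)                                # torch.Size([2, 768])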
|
472 |
+
|
473 |
+
|
474 |
+
class BertPredictionHeadTransform(nn.Module):
|
475 |
+
def __init__(self, config):
|
476 |
+
super().__init__()
|
477 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
478 |
+
if isinstance(config.hidden_act, str):
|
479 |
+
self.transform_act_fn = ACT2FN[config.hidden_act]
|
480 |
+
else:
|
481 |
+
self.transform_act_fn = config.hidden_act
|
482 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
483 |
+
|
484 |
+
def forward(self, hidden_states):
|
485 |
+
hidden_states = self.dense(hidden_states)
|
486 |
+
hidden_states = self.transform_act_fn(hidden_states)
|
487 |
+
hidden_states = self.LayerNorm(hidden_states)
|
488 |
+
return hidden_states
|
489 |
+
|
490 |
+
|
491 |
+
class BertLMPredictionHead(nn.Module):
|
492 |
+
def __init__(self, config):
|
493 |
+
super().__init__()
|
494 |
+
self.transform = BertPredictionHeadTransform(config)
|
495 |
+
|
496 |
+
# The output weights are the same as the input embeddings, but there is
|
497 |
+
# an output-only bias for each token.
|
498 |
+
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
499 |
+
|
500 |
+
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
501 |
+
|
502 |
+
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
503 |
+
self.decoder.bias = self.bias
|
504 |
+
|
505 |
+
def forward(self, hidden_states):
|
506 |
+
hidden_states = self.transform(hidden_states)
|
507 |
+
hidden_states = self.decoder(hidden_states)
|
508 |
+
return hidden_states
|
509 |
+
|
510 |
+
|
511 |
+
class BertOnlyMLMHead(nn.Module):
|
512 |
+
def __init__(self, config):
|
513 |
+
super().__init__()
|
514 |
+
self.predictions = BertLMPredictionHead(config)
|
515 |
+
|
516 |
+
def forward(self, sequence_output):
|
517 |
+
prediction_scores = self.predictions(sequence_output)
|
518 |
+
return prediction_scores
|
519 |
+
|
520 |
+
|
521 |
+
class BertOnlyNSPHead(nn.Module):
|
522 |
+
def __init__(self, config):
|
523 |
+
super().__init__()
|
524 |
+
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
525 |
+
|
526 |
+
def forward(self, pooled_output):
|
527 |
+
seq_relationship_score = self.seq_relationship(pooled_output)
|
528 |
+
return seq_relationship_score
|
529 |
+
|
530 |
+
|
531 |
+
class BertPreTrainingHeads(nn.Module):
|
532 |
+
def __init__(self, config):
|
533 |
+
super().__init__()
|
534 |
+
self.predictions = BertLMPredictionHead(config)
|
535 |
+
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
536 |
+
|
537 |
+
def forward(self, sequence_output, pooled_output):
|
538 |
+
prediction_scores = self.predictions(sequence_output)
|
539 |
+
seq_relationship_score = self.seq_relationship(pooled_output)
|
540 |
+
return prediction_scores, seq_relationship_score
|
541 |
+
|
542 |
+
|
543 |
+
class BertPreTrainedModel(PreTrainedModel):
|
544 |
+
""" An abstract class to handle weights initialization and
|
545 |
+
a simple interface for downloading and loading pretrained models.
|
546 |
+
"""
|
547 |
+
|
548 |
+
config_class = BertConfig
|
549 |
+
load_tf_weights = load_tf_weights_in_bert
|
550 |
+
base_model_prefix = "bert"
|
551 |
+
|
552 |
+
def _init_weights(self, module):
|
553 |
+
""" Initialize the weights """
|
554 |
+
if isinstance(module, (nn.Linear, nn.Embedding)):
|
555 |
+
# Slightly different from the TF version which uses truncated_normal for initialization
|
556 |
+
# cf https://github.com/pytorch/pytorch/pull/5617
|
557 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
558 |
+
elif isinstance(module, BertLayerNorm):
|
559 |
+
module.bias.data.zero_()
|
560 |
+
module.weight.data.fill_(1.0)
|
561 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
562 |
+
module.bias.data.zero_()
|
563 |
+
|
564 |
+
|
565 |
+
BERT_START_DOCSTRING = r"""
|
566 |
+
This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
|
567 |
+
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
|
568 |
+
usage and behavior.
|
569 |
+
|
570 |
+
Parameters:
|
571 |
+
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
572 |
+
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
573 |
+
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
574 |
+
"""
|
575 |
+
|
576 |
+
BERT_INPUTS_DOCSTRING = r"""
|
577 |
+
Args:
|
578 |
+
input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
|
579 |
+
Indices of input sequence tokens in the vocabulary.
|
580 |
+
|
581 |
+
Indices can be obtained using :class:`transformers.BertTokenizer`.
|
582 |
+
See :func:`transformers.PreTrainedTokenizer.encode` and
|
583 |
+
:func:`transformers.PreTrainedTokenizer.__call__` for details.
|
584 |
+
|
585 |
+
`What are input IDs? <../glossary.html#input-ids>`__
|
586 |
+
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
|
587 |
+
Mask to avoid performing attention on padding token indices.
|
588 |
+
Mask values selected in ``[0, 1]``:
|
589 |
+
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
590 |
+
|
591 |
+
`What are attention masks? <../glossary.html#attention-mask>`__
|
592 |
+
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
|
593 |
+
Segment token indices to indicate first and second portions of the inputs.
|
594 |
+
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
595 |
+
corresponds to a `sentence B` token
|
596 |
+
|
597 |
+
`What are token type IDs? <../glossary.html#token-type-ids>`_
|
598 |
+
position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
|
599 |
+
Indices of positions of each input sequence tokens in the position embeddings.
|
600 |
+
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
601 |
+
|
602 |
+
`What are position IDs? <../glossary.html#position-ids>`_
|
603 |
+
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
|
604 |
+
Mask to nullify selected heads of the self-attention modules.
|
605 |
+
Mask values selected in ``[0, 1]``:
|
606 |
+
:obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
|
607 |
+
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
|
608 |
+
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
609 |
+
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
610 |
+
than the model's internal embedding lookup matrix.
|
611 |
+
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
|
612 |
+
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
|
613 |
+
if the model is configured as a decoder.
|
614 |
+
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
615 |
+
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
|
616 |
+
is used in the cross-attention if the model is configured as a decoder.
|
617 |
+
Mask values selected in ``[0, 1]``:
|
618 |
+
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
619 |
+
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
620 |
+
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
621 |
+
"""
|
622 |
+
|
623 |
+
|
624 |
+
|
625 |
+
[DOCS]
|
626 |
+
@add_start_docstrings(
|
627 |
+
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
|
628 |
+
BERT_START_DOCSTRING,
|
629 |
+
)
|
630 |
+
class BertModel(BertPreTrainedModel):
|
631 |
+
"""
|
632 |
+
|
633 |
+
The model can behave as an encoder (with only self-attention) as well
|
634 |
+
as a decoder, in which case a layer of cross-attention is added between
|
635 |
+
the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
|
636 |
+
Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
|
637 |
+
|
638 |
+
To behave as an decoder the model needs to be initialized with the
|
639 |
+
:obj:`is_decoder` argument of the configuration set to :obj:`True`; an
|
640 |
+
:obj:`encoder_hidden_states` is expected as an input to the forward pass.
|
641 |
+
|
642 |
+
.. _`Attention is all you need`:
|
643 |
+
https://arxiv.org/abs/1706.03762
|
644 |
+
|
645 |
+
"""
|
646 |
+
|
647 |
+
def __init__(self, config):
|
648 |
+
super().__init__(config)
|
649 |
+
self.config = config
|
650 |
+
|
651 |
+
self.embeddings = BertEmbeddings(config)
|
652 |
+
self.encoder = BertEncoder(config)
|
653 |
+
self.pooler = BertPooler(config)
|
654 |
+
|
655 |
+
self.init_weights()
|
656 |
+
|
657 |
+
|
658 |
+
[DOCS]
|
659 |
+
def get_input_embeddings(self):
|
660 |
+
return self.embeddings.word_embeddings
|
661 |
+
|
662 |
+
|
663 |
+
|
664 |
+
[DOCS]
|
665 |
+
def set_input_embeddings(self, value):
|
666 |
+
self.embeddings.word_embeddings = value
|
667 |
+
|
668 |
+
|
669 |
+
def _prune_heads(self, heads_to_prune):
|
670 |
+
""" Prunes heads of the model.
|
671 |
+
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
672 |
+
See base class PreTrainedModel
|
673 |
+
"""
|
674 |
+
for layer, heads in heads_to_prune.items():
|
675 |
+
self.encoder.layer[layer].attention.prune_heads(heads)
|
676 |
+
|
677 |
+
|
678 |
+
[DOCS]
|
679 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
680 |
+
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
|
681 |
+
def forward(
|
682 |
+
self,
|
683 |
+
input_ids=None,
|
684 |
+
attention_mask=None,
|
685 |
+
token_type_ids=None,
|
686 |
+
position_ids=None,
|
687 |
+
head_mask=None,
|
688 |
+
inputs_embeds=None,
|
689 |
+
encoder_hidden_states=None,
|
690 |
+
encoder_attention_mask=None,
|
691 |
+
output_attentions=None,
|
692 |
+
output_hidden_states=None,
|
693 |
+
):
|
694 |
+
r"""
|
695 |
+
Return:
|
696 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
697 |
+
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
698 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
699 |
+
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
|
700 |
+
Last layer hidden-state of the first token of the sequence (classification token)
|
701 |
+
further processed by a Linear layer and a Tanh activation function. The Linear
|
702 |
+
layer weights are trained from the next sentence prediction (classification)
|
703 |
+
objective during pre-training.
|
704 |
+
|
705 |
+
This output is usually *not* a good summary
|
706 |
+
of the semantic content of the input, you're often better with averaging or pooling
|
707 |
+
the sequence of hidden-states for the whole input sequence.
|
708 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
709 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
710 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
711 |
+
|
712 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
713 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
714 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
715 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
716 |
+
|
717 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
718 |
+
heads.
|
719 |
+
"""
|
720 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
721 |
+
output_hidden_states = (
|
722 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
723 |
+
)
|
724 |
+
|
725 |
+
if input_ids is not None and inputs_embeds is not None:
|
726 |
+
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
727 |
+
elif input_ids is not None:
|
728 |
+
input_shape = input_ids.size()
|
729 |
+
elif inputs_embeds is not None:
|
730 |
+
input_shape = inputs_embeds.size()[:-1]
|
731 |
+
else:
|
732 |
+
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
733 |
+
|
734 |
+
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
735 |
+
|
736 |
+
if attention_mask is None:
|
737 |
+
attention_mask = torch.ones(input_shape, device=device)
|
738 |
+
if token_type_ids is None:
|
739 |
+
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
740 |
+
|
741 |
+
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
742 |
+
# ourselves in which case we just need to make it broadcastable to all heads.
|
743 |
+
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
|
744 |
+
|
745 |
+
# If a 2D ou 3D attention mask is provided for the cross-attention
|
746 |
+
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
747 |
+
if self.config.is_decoder and encoder_hidden_states is not None:
|
748 |
+
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
749 |
+
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
750 |
+
if encoder_attention_mask is None:
|
751 |
+
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
|
752 |
+
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
|
753 |
+
else:
|
754 |
+
encoder_extended_attention_mask = None
|
755 |
+
|
756 |
+
# Prepare head mask if needed
|
757 |
+
# 1.0 in head_mask indicate we keep the head
|
758 |
+
# attention_probs has shape bsz x n_heads x N x N
|
759 |
+
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
760 |
+
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
761 |
+
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
|
762 |
+
|
763 |
+
embedding_output = self.embeddings(
|
764 |
+
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
|
765 |
+
)
|
766 |
+
encoder_outputs = self.encoder(
|
767 |
+
embedding_output,
|
768 |
+
attention_mask=extended_attention_mask,
|
769 |
+
head_mask=head_mask,
|
770 |
+
encoder_hidden_states=encoder_hidden_states,
|
771 |
+
encoder_attention_mask=encoder_extended_attention_mask,
|
772 |
+
output_attentions=output_attentions,
|
773 |
+
output_hidden_states=output_hidden_states,
|
774 |
+
)
|
775 |
+
sequence_output = encoder_outputs[0]
|
776 |
+
pooled_output = self.pooler(sequence_output)
|
777 |
+
|
778 |
+
outputs = (sequence_output, pooled_output,) + encoder_outputs[
|
779 |
+
1:
|
780 |
+
] # add hidden_states and attentions if they are here
|
781 |
+
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
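The forward pass above relies on get_extended_attention_mask (defined in modeling_utils, not in this file) to turn the 2D padding mask into an additive bias. The sketch below shows roughly what that conversion produces, under the assumption that this vintage of the library uses the usual (1 - mask) * -10000 convention.

# Sketch: padding positions become large negative biases added to raw attention scores.
import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])       # (batch, seq): 1 = real token, 0 = pad
extended = attention_mask[:, None, None, :].float()    # (batch, 1, 1, seq), broadcast over heads/queries
extended = (1.0 - extended) * -10000.0
print(extended)   # zeros for real tokens, -10000 for padding positions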
|
782 |
+
|
783 |
+
|
784 |
+
|
785 |
+
|
786 |
+
[DOCS]
|
787 |
+
@add_start_docstrings(
|
788 |
+
"""Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
|
789 |
+
a `next sentence prediction (classification)` head. """,
|
790 |
+
BERT_START_DOCSTRING,
|
791 |
+
)
|
792 |
+
class BertForPreTraining(BertPreTrainedModel):
|
793 |
+
def __init__(self, config):
|
794 |
+
super().__init__(config)
|
795 |
+
|
796 |
+
self.bert = BertModel(config)
|
797 |
+
self.cls = BertPreTrainingHeads(config)
|
798 |
+
|
799 |
+
self.init_weights()
|
800 |
+
|
801 |
+
|
802 |
+
[DOCS]
|
803 |
+
def get_output_embeddings(self):
|
804 |
+
return self.cls.predictions.decoder
|
805 |
+
|
806 |
+
|
807 |
+
|
808 |
+
[DOCS]
|
809 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
810 |
+
def forward(
|
811 |
+
self,
|
812 |
+
input_ids=None,
|
813 |
+
attention_mask=None,
|
814 |
+
token_type_ids=None,
|
815 |
+
position_ids=None,
|
816 |
+
head_mask=None,
|
817 |
+
inputs_embeds=None,
|
818 |
+
labels=None,
|
819 |
+
next_sentence_label=None,
|
820 |
+
output_attentions=None,
|
821 |
+
output_hidden_states=None,
|
822 |
+
**kwargs
|
823 |
+
):
|
824 |
+
r"""
|
825 |
+
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
826 |
+
Labels for computing the masked language modeling loss.
|
827 |
+
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
828 |
+
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
829 |
+
in ``[0, ..., config.vocab_size]``
|
830 |
+
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
|
831 |
+
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
|
832 |
+
Indices should be in ``[0, 1]``.
|
833 |
+
``0`` indicates sequence B is a continuation of sequence A,
|
834 |
+
``1`` indicates sequence B is a random sequence.
|
835 |
+
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
836 |
+
Used to hide legacy arguments that have been deprecated.
|
837 |
+
|
838 |
+
Returns:
|
839 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
840 |
+
loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
841 |
+
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
|
842 |
+
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
843 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
844 |
+
seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
|
845 |
+
Prediction scores of the next sequence prediction (classification) head (scores of True/False
|
846 |
+
continuation before SoftMax).
|
847 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
848 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
849 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
850 |
+
|
851 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
852 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
853 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
854 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
855 |
+
|
856 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
857 |
+
heads.
|
858 |
+
|
859 |
+
|
860 |
+
Examples::
|
861 |
+
|
862 |
+
>>> from transformers import BertTokenizer, BertForPreTraining
|
863 |
+
>>> import torch
|
864 |
+
|
865 |
+
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
866 |
+
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
867 |
+
|
868 |
+
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
869 |
+
>>> outputs = model(**inputs)
|
870 |
+
|
871 |
+
>>> prediction_scores, seq_relationship_scores = outputs[:2]
|
872 |
+
|
873 |
+
"""
|
874 |
+
if "masked_lm_labels" in kwargs:
|
875 |
+
warnings.warn(
|
876 |
+
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
877 |
+
DeprecationWarning,
|
878 |
+
)
|
879 |
+
labels = kwargs.pop("masked_lm_labels")
|
880 |
+
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
881 |
+
|
882 |
+
outputs = self.bert(
|
883 |
+
input_ids,
|
884 |
+
attention_mask=attention_mask,
|
885 |
+
token_type_ids=token_type_ids,
|
886 |
+
position_ids=position_ids,
|
887 |
+
head_mask=head_mask,
|
888 |
+
inputs_embeds=inputs_embeds,
|
889 |
+
output_attentions=output_attentions,
|
890 |
+
output_hidden_states=output_hidden_states,
|
891 |
+
)
|
892 |
+
|
893 |
+
sequence_output, pooled_output = outputs[:2]
|
894 |
+
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
895 |
+
|
896 |
+
outputs = (prediction_scores, seq_relationship_score,) + outputs[
|
897 |
+
2:
|
898 |
+
] # add hidden states and attention if they are here
|
899 |
+
|
900 |
+
if labels is not None and next_sentence_label is not None:
|
901 |
+
loss_fct = CrossEntropyLoss()
|
902 |
+
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
903 |
+
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
904 |
+
total_loss = masked_lm_loss + next_sentence_loss
|
905 |
+
outputs = (total_loss,) + outputs
|
906 |
+
|
907 |
+
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
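A tiny worked example of the label convention described in the docstring above: positions set to -100 are excluded from the masked-LM loss, because CrossEntropyLoss ignores index -100 by default. This is an illustration, not part of the reference file.

# Standalone sketch of the -100 "ignore" convention for masked-LM labels.
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
prediction_scores = torch.randn(1, 4, vocab_size)      # (batch, seq, vocab)
labels = torch.tensor([[-100, 3, -100, 7]])            # only positions 1 and 3 contribute

loss = CrossEntropyLoss()(prediction_scores.view(-1, vocab_size), labels.view(-1))
print(loss)   # averaged over the two non-ignored positions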
|
908 |
+
|
909 |
+
|
910 |
+
|
911 |
+
@add_start_docstrings(
|
912 |
+
"""Bert Model with a `language modeling` head on top for CLM fine-tuning. """, BERT_START_DOCSTRING
|
913 |
+
)
|
914 |
+
class BertLMHeadModel(BertPreTrainedModel):
|
915 |
+
def __init__(self, config):
|
916 |
+
super().__init__(config)
|
917 |
+
assert config.is_decoder, "If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True`."
|
918 |
+
|
919 |
+
self.bert = BertModel(config)
|
920 |
+
self.cls = BertOnlyMLMHead(config)
|
921 |
+
|
922 |
+
self.init_weights()
|
923 |
+
|
924 |
+
def get_output_embeddings(self):
|
925 |
+
return self.cls.predictions.decoder
|
926 |
+
|
927 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
928 |
+
def forward(
|
929 |
+
self,
|
930 |
+
input_ids=None,
|
931 |
+
attention_mask=None,
|
932 |
+
token_type_ids=None,
|
933 |
+
position_ids=None,
|
934 |
+
head_mask=None,
|
935 |
+
inputs_embeds=None,
|
936 |
+
labels=None,
|
937 |
+
encoder_hidden_states=None,
|
938 |
+
encoder_attention_mask=None,
|
939 |
+
output_attentions=None,
|
940 |
+
output_hidden_states=None,
|
941 |
+
**kwargs
|
942 |
+
):
|
943 |
+
r"""
|
944 |
+
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
945 |
+
Labels for computing the left-to-right language modeling loss (next word prediction).
|
946 |
+
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
947 |
+
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
948 |
+
in ``[0, ..., config.vocab_size]``
|
949 |
+
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
950 |
+
Used to hide legacy arguments that have been deprecated.
|
951 |
+
|
952 |
+
Returns:
|
953 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
954 |
+
ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
|
955 |
+
Next token prediction loss.
|
956 |
+
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
957 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
958 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
959 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
960 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
961 |
+
|
962 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
963 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
964 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
965 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
966 |
+
|
967 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
968 |
+
heads.
|
969 |
+
|
970 |
+
Example::
|
971 |
+
|
972 |
+
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
|
973 |
+
>>> import torch
|
974 |
+
|
975 |
+
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
976 |
+
>>> config = BertConfig.from_pretrained("bert-base-cased")
|
977 |
+
>>> config.is_decoder = True
|
978 |
+
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
|
979 |
+
|
980 |
+
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
981 |
+
>>> outputs = model(**inputs)
|
982 |
+
|
983 |
+
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
984 |
+
"""
|
985 |
+
|
986 |
+
outputs = self.bert(
|
987 |
+
input_ids,
|
988 |
+
attention_mask=attention_mask,
|
989 |
+
token_type_ids=token_type_ids,
|
990 |
+
position_ids=position_ids,
|
991 |
+
head_mask=head_mask,
|
992 |
+
inputs_embeds=inputs_embeds,
|
993 |
+
encoder_hidden_states=encoder_hidden_states,
|
994 |
+
encoder_attention_mask=encoder_attention_mask,
|
995 |
+
output_attentions=output_attentions,
|
996 |
+
output_hidden_states=output_hidden_states,
|
997 |
+
)
|
998 |
+
|
999 |
+
sequence_output = outputs[0]
|
1000 |
+
prediction_scores = self.cls(sequence_output)
|
1001 |
+
|
1002 |
+
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
1003 |
+
|
1004 |
+
if labels is not None:
|
1005 |
+
# we are doing next-token prediction; shift prediction scores and input ids by one
|
1006 |
+
prediction_scores = prediction_scores[:, :-1, :].contiguous()
|
1007 |
+
labels = labels[:, 1:].contiguous()
|
1008 |
+
loss_fct = CrossEntropyLoss()
|
1009 |
+
ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
1010 |
+
outputs = (ltr_lm_loss,) + outputs
|
1011 |
+
|
1012 |
+
return outputs # (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
|
1013 |
+
|
1014 |
+
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
|
1015 |
+
input_shape = input_ids.shape
|
1016 |
+
|
1017 |
+
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
|
1018 |
+
if attention_mask is None:
|
1019 |
+
attention_mask = input_ids.new_ones(input_shape)
|
1020 |
+
|
1021 |
+
return {"input_ids": input_ids, "attention_mask": attention_mask}
|
1022 |
+
|
1023 |
+
|
1024 |
+
|
1025 |
+
[DOCS]
|
1026 |
+
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
|
1027 |
+
class BertForMaskedLM(BertPreTrainedModel):
|
1028 |
+
def __init__(self, config):
|
1029 |
+
super().__init__(config)
|
1030 |
+
assert (
|
1031 |
+
not config.is_decoder
|
1032 |
+
), "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention."
|
1033 |
+
|
1034 |
+
self.bert = BertModel(config)
|
1035 |
+
self.cls = BertOnlyMLMHead(config)
|
1036 |
+
|
1037 |
+
self.init_weights()
|
1038 |
+
|
1039 |
+
|
1040 |
+
[DOCS]
|
1041 |
+
def get_output_embeddings(self):
|
1042 |
+
return self.cls.predictions.decoder
|
1043 |
+
|
1044 |
+
|
1045 |
+
|
1046 |
+
[DOCS]
|
1047 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
1048 |
+
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
|
1049 |
+
def forward(
|
1050 |
+
self,
|
1051 |
+
input_ids=None,
|
1052 |
+
attention_mask=None,
|
1053 |
+
token_type_ids=None,
|
1054 |
+
position_ids=None,
|
1055 |
+
head_mask=None,
|
1056 |
+
inputs_embeds=None,
|
1057 |
+
labels=None,
|
1058 |
+
encoder_hidden_states=None,
|
1059 |
+
encoder_attention_mask=None,
|
1060 |
+
output_attentions=None,
|
1061 |
+
output_hidden_states=None,
|
1062 |
+
**kwargs
|
1063 |
+
):
|
1064 |
+
r"""
|
1065 |
+
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
1066 |
+
Labels for computing the masked language modeling loss.
|
1067 |
+
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
1068 |
+
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
1069 |
+
in ``[0, ..., config.vocab_size]``
|
1070 |
+
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
1071 |
+
Used to hide legacy arguments that have been deprecated.
|
1072 |
+
|
1073 |
+
Returns:
|
1074 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
1075 |
+
masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
1076 |
+
Masked language modeling loss.
|
1077 |
+
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
1078 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
1079 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
1080 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
1081 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
1082 |
+
|
1083 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
1084 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
1085 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
1086 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
1087 |
+
|
1088 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
1089 |
+
heads.
|
1090 |
+
"""
|
1091 |
+
if "masked_lm_labels" in kwargs:
|
1092 |
+
warnings.warn(
|
1093 |
+
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
1094 |
+
DeprecationWarning,
|
1095 |
+
)
|
1096 |
+
labels = kwargs.pop("masked_lm_labels")
|
1097 |
+
assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task."
|
1098 |
+
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
1099 |
+
|
1100 |
+
outputs = self.bert(
|
1101 |
+
input_ids,
|
1102 |
+
attention_mask=attention_mask,
|
1103 |
+
token_type_ids=token_type_ids,
|
1104 |
+
position_ids=position_ids,
|
1105 |
+
head_mask=head_mask,
|
1106 |
+
inputs_embeds=inputs_embeds,
|
1107 |
+
encoder_hidden_states=encoder_hidden_states,
|
1108 |
+
encoder_attention_mask=encoder_attention_mask,
|
1109 |
+
output_attentions=output_attentions,
|
1110 |
+
output_hidden_states=output_hidden_states,
|
1111 |
+
)
|
1112 |
+
|
1113 |
+
sequence_output = outputs[0]
|
1114 |
+
prediction_scores = self.cls(sequence_output)
|
1115 |
+
|
1116 |
+
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
1117 |
+
|
1118 |
+
if labels is not None:
|
1119 |
+
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
1120 |
+
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
1121 |
+
outputs = (masked_lm_loss,) + outputs
|
1122 |
+
|
1123 |
+
return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
|
1124 |
+
|
1125 |
+
|
1126 |
+
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
|
1127 |
+
input_shape = input_ids.shape
|
1128 |
+
effective_batch_size = input_shape[0]
|
1129 |
+
|
1130 |
+
# add a dummy token
|
1131 |
+
assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
|
1132 |
+
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
|
1133 |
+
dummy_token = torch.full(
|
1134 |
+
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
|
1135 |
+
)
|
1136 |
+
input_ids = torch.cat([input_ids, dummy_token], dim=1)
|
1137 |
+
|
1138 |
+
return {"input_ids": input_ids, "attention_mask": attention_mask}
|
1139 |
+
|
1140 |
+
|
1141 |
+
|
1142 |
+
|
1143 |
+
[DOCS]
|
1144 |
+
@add_start_docstrings(
|
1145 |
+
"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
|
1146 |
+
)
|
1147 |
+
class BertForNextSentencePrediction(BertPreTrainedModel):
|
1148 |
+
def __init__(self, config):
|
1149 |
+
super().__init__(config)
|
1150 |
+
|
1151 |
+
self.bert = BertModel(config)
|
1152 |
+
self.cls = BertOnlyNSPHead(config)
|
1153 |
+
|
1154 |
+
self.init_weights()
|
1155 |
+
|
1156 |
+
|
1157 |
+
[DOCS]
|
1158 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
1159 |
+
def forward(
|
1160 |
+
self,
|
1161 |
+
input_ids=None,
|
1162 |
+
attention_mask=None,
|
1163 |
+
token_type_ids=None,
|
1164 |
+
position_ids=None,
|
1165 |
+
head_mask=None,
|
1166 |
+
inputs_embeds=None,
|
1167 |
+
next_sentence_label=None,
|
1168 |
+
output_attentions=None,
|
1169 |
+
output_hidden_states=None,
|
1170 |
+
):
|
1171 |
+
r"""
|
1172 |
+
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
1173 |
+
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
|
1174 |
+
Indices should be in ``[0, 1]``.
|
1175 |
+
``0`` indicates sequence B is a continuation of sequence A,
|
1176 |
+
``1`` indicates sequence B is a random sequence.
|
1177 |
+
|
1178 |
+
Returns:
|
1179 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
1180 |
+
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided):
|
1181 |
+
Next sequence prediction (classification) loss.
|
1182 |
+
seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
|
1183 |
+
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
|
1184 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
1185 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
1186 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
1187 |
+
|
1188 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
1189 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
1190 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
1191 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
1192 |
+
|
1193 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
1194 |
+
heads.
|
1195 |
+
|
1196 |
+
Examples::
|
1197 |
+
|
1198 |
+
>>> from transformers import BertTokenizer, BertForNextSentencePrediction
|
1199 |
+
>>> import torch
|
1200 |
+
|
1201 |
+
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
1202 |
+
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
1203 |
+
|
1204 |
+
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
1205 |
+
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
|
1206 |
+
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
|
1207 |
+
|
1208 |
+
>>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
|
1209 |
+
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
|
1210 |
+
"""
|
1211 |
+
|
1212 |
+
outputs = self.bert(
|
1213 |
+
input_ids,
|
1214 |
+
attention_mask=attention_mask,
|
1215 |
+
token_type_ids=token_type_ids,
|
1216 |
+
position_ids=position_ids,
|
1217 |
+
head_mask=head_mask,
|
1218 |
+
inputs_embeds=inputs_embeds,
|
1219 |
+
output_attentions=output_attentions,
|
1220 |
+
output_hidden_states=output_hidden_states,
|
1221 |
+
)
|
1222 |
+
|
1223 |
+
pooled_output = outputs[1]
|
1224 |
+
|
1225 |
+
seq_relationship_score = self.cls(pooled_output)
|
1226 |
+
|
1227 |
+
outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
|
1228 |
+
if next_sentence_label is not None:
|
1229 |
+
loss_fct = CrossEntropyLoss()
|
1230 |
+
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
1231 |
+
outputs = (next_sentence_loss,) + outputs
|
1232 |
+
|
1233 |
+
return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
|
1234 |
+
|
1235 |
+
|
1236 |
+
|
1237 |
+
|
1238 |
+
[DOCS]
|
1239 |
+
@add_start_docstrings(
|
1240 |
+
"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
|
1241 |
+
the pooled output) e.g. for GLUE tasks. """,
|
1242 |
+
BERT_START_DOCSTRING,
|
1243 |
+
)
|
1244 |
+
class BertForSequenceClassification(BertPreTrainedModel):
|
1245 |
+
def __init__(self, config):
|
1246 |
+
super().__init__(config)
|
1247 |
+
self.num_labels = config.num_labels
|
1248 |
+
|
1249 |
+
self.bert = BertModel(config)
|
1250 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
1251 |
+
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
1252 |
+
|
1253 |
+
self.init_weights()
|
1254 |
+
|
1255 |
+
|
1256 |
+
[DOCS]
|
1257 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
1258 |
+
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
|
1259 |
+
def forward(
|
1260 |
+
self,
|
1261 |
+
input_ids=None,
|
1262 |
+
attention_mask=None,
|
1263 |
+
token_type_ids=None,
|
1264 |
+
position_ids=None,
|
1265 |
+
head_mask=None,
|
1266 |
+
inputs_embeds=None,
|
1267 |
+
labels=None,
|
1268 |
+
output_attentions=None,
|
1269 |
+
output_hidden_states=None,
|
1270 |
+
):
|
1271 |
+
r"""
|
1272 |
+
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
1273 |
+
Labels for computing the sequence classification/regression loss.
|
1274 |
+
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
1275 |
+
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
1276 |
+
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
1277 |
+
|
1278 |
+
Returns:
|
1279 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
1280 |
+
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
1281 |
+
Classification (or regression if config.num_labels==1) loss.
|
1282 |
+
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
1283 |
+
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
1284 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
1285 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
1286 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
1287 |
+
|
1288 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
1289 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
1290 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
1291 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
1292 |
+
|
1293 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
1294 |
+
heads.
|
1295 |
+
"""
|
1296 |
+
|
1297 |
+
outputs = self.bert(
|
1298 |
+
input_ids,
|
1299 |
+
attention_mask=attention_mask,
|
1300 |
+
token_type_ids=token_type_ids,
|
1301 |
+
position_ids=position_ids,
|
1302 |
+
head_mask=head_mask,
|
1303 |
+
inputs_embeds=inputs_embeds,
|
1304 |
+
output_attentions=output_attentions,
|
1305 |
+
output_hidden_states=output_hidden_states,
|
1306 |
+
)
|
1307 |
+
|
1308 |
+
pooled_output = outputs[1]
|
1309 |
+
|
1310 |
+
pooled_output = self.dropout(pooled_output)
|
1311 |
+
logits = self.classifier(pooled_output)
|
1312 |
+
|
1313 |
+
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
1314 |
+
|
1315 |
+
if labels is not None:
|
1316 |
+
if self.num_labels == 1:
|
1317 |
+
# We are doing regression
|
1318 |
+
loss_fct = MSELoss()
|
1319 |
+
loss = loss_fct(logits.view(-1), labels.view(-1))
|
1320 |
+
else:
|
1321 |
+
loss_fct = CrossEntropyLoss()
|
1322 |
+
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
1323 |
+
outputs = (loss,) + outputs
|
1324 |
+
|
1325 |
+
return outputs # (loss), logits, (hidden_states), (attentions)
|
1326 |
+
|
1327 |
+
|
1328 |
+
|
1329 |
+
|
1330 |
+
[DOCS]
|
1331 |
+
@add_start_docstrings(
|
1332 |
+
"""Bert Model with a multiple choice classification head on top (a linear layer on top of
|
1333 |
+
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
|
1334 |
+
BERT_START_DOCSTRING,
|
1335 |
+
)
|
1336 |
+
class BertForMultipleChoice(BertPreTrainedModel):
|
1337 |
+
def __init__(self, config):
|
1338 |
+
super().__init__(config)
|
1339 |
+
|
1340 |
+
self.bert = BertModel(config)
|
1341 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
1342 |
+
self.classifier = nn.Linear(config.hidden_size, 1)
|
1343 |
+
|
1344 |
+
self.init_weights()
|
1345 |
+
|
1346 |
+
|
1347 |
+
[DOCS]
|
1348 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
|
1349 |
+
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
|
1350 |
+
def forward(
|
1351 |
+
self,
|
1352 |
+
input_ids=None,
|
1353 |
+
attention_mask=None,
|
1354 |
+
token_type_ids=None,
|
1355 |
+
position_ids=None,
|
1356 |
+
head_mask=None,
|
1357 |
+
inputs_embeds=None,
|
1358 |
+
labels=None,
|
1359 |
+
output_attentions=None,
|
1360 |
+
output_hidden_states=None,
|
1361 |
+
):
|
1362 |
+
r"""
|
1363 |
+
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
1364 |
+
Labels for computing the multiple choice classification loss.
|
1365 |
+
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
1366 |
+
of the input tensors. (see `input_ids` above)
|
1367 |
+
|
1368 |
+
Returns:
|
1369 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
1370 |
+
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
|
1371 |
+
Classification loss.
|
1372 |
+
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
|
1373 |
+
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
|
1374 |
+
|
1375 |
+
Classification scores (before SoftMax).
|
1376 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
1377 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
1378 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
1379 |
+
|
1380 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
1381 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
1382 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
1383 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
1384 |
+
|
1385 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
1386 |
+
heads.
|
1387 |
+
"""
|
1388 |
+
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
1389 |
+
|
1390 |
+
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
1391 |
+
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
|
1392 |
+
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
|
1393 |
+
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
|
1394 |
+
inputs_embeds = (
|
1395 |
+
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
|
1396 |
+
if inputs_embeds is not None
|
1397 |
+
else None
|
1398 |
+
)
|
1399 |
+
|
1400 |
+
outputs = self.bert(
|
1401 |
+
input_ids,
|
1402 |
+
attention_mask=attention_mask,
|
1403 |
+
token_type_ids=token_type_ids,
|
1404 |
+
position_ids=position_ids,
|
1405 |
+
head_mask=head_mask,
|
1406 |
+
inputs_embeds=inputs_embeds,
|
1407 |
+
output_attentions=output_attentions,
|
1408 |
+
output_hidden_states=output_hidden_states,
|
1409 |
+
)
|
1410 |
+
|
1411 |
+
pooled_output = outputs[1]
|
1412 |
+
|
1413 |
+
pooled_output = self.dropout(pooled_output)
|
1414 |
+
logits = self.classifier(pooled_output)
|
1415 |
+
reshaped_logits = logits.view(-1, num_choices)
|
1416 |
+
|
1417 |
+
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
|
1418 |
+
|
1419 |
+
if labels is not None:
|
1420 |
+
loss_fct = CrossEntropyLoss()
|
1421 |
+
loss = loss_fct(reshaped_logits, labels)
|
1422 |
+
outputs = (loss,) + outputs
|
1423 |
+
|
1424 |
+
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
|
1425 |
+
|
1426 |
+
|
1427 |
+
|
1428 |
+
|
1429 |
+
[DOCS]
|
1430 |
+
@add_start_docstrings(
|
1431 |
+
"""Bert Model with a token classification head on top (a linear layer on top of
|
1432 |
+
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
1433 |
+
BERT_START_DOCSTRING,
|
1434 |
+
)
|
1435 |
+
class BertForTokenClassification(BertPreTrainedModel):
|
1436 |
+
def __init__(self, config):
|
1437 |
+
super().__init__(config)
|
1438 |
+
self.num_labels = config.num_labels
|
1439 |
+
|
1440 |
+
self.bert = BertModel(config)
|
1441 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
1442 |
+
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
1443 |
+
|
1444 |
+
self.init_weights()
|
1445 |
+
|
1446 |
+
|
1447 |
+
[DOCS]
|
1448 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
1449 |
+
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
|
1450 |
+
def forward(
|
1451 |
+
self,
|
1452 |
+
input_ids=None,
|
1453 |
+
attention_mask=None,
|
1454 |
+
token_type_ids=None,
|
1455 |
+
position_ids=None,
|
1456 |
+
head_mask=None,
|
1457 |
+
inputs_embeds=None,
|
1458 |
+
labels=None,
|
1459 |
+
output_attentions=None,
|
1460 |
+
output_hidden_states=None,
|
1461 |
+
):
|
1462 |
+
r"""
|
1463 |
+
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
1464 |
+
Labels for computing the token classification loss.
|
1465 |
+
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
1466 |
+
|
1467 |
+
Returns:
|
1468 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
1469 |
+
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
|
1470 |
+
Classification loss.
|
1471 |
+
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
|
1472 |
+
Classification scores (before SoftMax).
|
1473 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
1474 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
1475 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
1476 |
+
|
1477 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
1478 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
1479 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
1480 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
1481 |
+
|
1482 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
1483 |
+
heads.
|
1484 |
+
"""
|
1485 |
+
|
1486 |
+
outputs = self.bert(
|
1487 |
+
input_ids,
|
1488 |
+
attention_mask=attention_mask,
|
1489 |
+
token_type_ids=token_type_ids,
|
1490 |
+
position_ids=position_ids,
|
1491 |
+
head_mask=head_mask,
|
1492 |
+
inputs_embeds=inputs_embeds,
|
1493 |
+
output_attentions=output_attentions,
|
1494 |
+
output_hidden_states=output_hidden_states,
|
1495 |
+
)
|
1496 |
+
|
1497 |
+
sequence_output = outputs[0]
|
1498 |
+
|
1499 |
+
sequence_output = self.dropout(sequence_output)
|
1500 |
+
logits = self.classifier(sequence_output)
|
1501 |
+
|
1502 |
+
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
1503 |
+
if labels is not None:
|
1504 |
+
loss_fct = CrossEntropyLoss()
|
1505 |
+
# Only keep active parts of the loss
|
1506 |
+
if attention_mask is not None:
|
1507 |
+
active_loss = attention_mask.view(-1) == 1
|
1508 |
+
active_logits = logits.view(-1, self.num_labels)
|
1509 |
+
active_labels = torch.where(
|
1510 |
+
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
|
1511 |
+
)
|
1512 |
+
loss = loss_fct(active_logits, active_labels)
|
1513 |
+
else:
|
1514 |
+
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
1515 |
+
outputs = (loss,) + outputs
|
1516 |
+
|
1517 |
+
return outputs # (loss), scores, (hidden_states), (attentions)
|
1518 |
+
|
1519 |
+
|
1520 |
+
|
1521 |
+
|
1522 |
+
[DOCS]
|
1523 |
+
@add_start_docstrings(
|
1524 |
+
"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
|
1525 |
+
layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
|
1526 |
+
BERT_START_DOCSTRING,
|
1527 |
+
)
|
1528 |
+
class BertForQuestionAnswering(BertPreTrainedModel):
|
1529 |
+
def __init__(self, config):
|
1530 |
+
super().__init__(config)
|
1531 |
+
self.num_labels = config.num_labels
|
1532 |
+
|
1533 |
+
self.bert = BertModel(config)
|
1534 |
+
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
1535 |
+
|
1536 |
+
self.init_weights()
|
1537 |
+
|
1538 |
+
|
1539 |
+
[DOCS]
|
1540 |
+
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
|
1541 |
+
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
|
1542 |
+
def forward(
|
1543 |
+
self,
|
1544 |
+
input_ids=None,
|
1545 |
+
attention_mask=None,
|
1546 |
+
token_type_ids=None,
|
1547 |
+
position_ids=None,
|
1548 |
+
head_mask=None,
|
1549 |
+
inputs_embeds=None,
|
1550 |
+
start_positions=None,
|
1551 |
+
end_positions=None,
|
1552 |
+
output_attentions=None,
|
1553 |
+
output_hidden_states=None,
|
1554 |
+
):
|
1555 |
+
r"""
|
1556 |
+
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
1557 |
+
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
1558 |
+
Positions are clamped to the length of the sequence (`sequence_length`).
|
1559 |
+
Position outside of the sequence are not taken into account for computing the loss.
|
1560 |
+
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
1561 |
+
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
1562 |
+
Positions are clamped to the length of the sequence (`sequence_length`).
|
1563 |
+
Position outside of the sequence are not taken into account for computing the loss.
|
1564 |
+
|
1565 |
+
Returns:
|
1566 |
+
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
1567 |
+
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
|
1568 |
+
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
1569 |
+
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
|
1570 |
+
Span-start scores (before SoftMax).
|
1571 |
+
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
|
1572 |
+
Span-end scores (before SoftMax).
|
1573 |
+
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
1574 |
+
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
1575 |
+
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
1576 |
+
|
1577 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
1578 |
+
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
1579 |
+
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
1580 |
+
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
1581 |
+
|
1582 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
1583 |
+
heads.
|
1584 |
+
"""
|
1585 |
+
|
1586 |
+
outputs = self.bert(
|
1587 |
+
input_ids,
|
1588 |
+
attention_mask=attention_mask,
|
1589 |
+
token_type_ids=token_type_ids,
|
1590 |
+
position_ids=position_ids,
|
1591 |
+
head_mask=head_mask,
|
1592 |
+
inputs_embeds=inputs_embeds,
|
1593 |
+
output_attentions=output_attentions,
|
1594 |
+
output_hidden_states=output_hidden_states,
|
1595 |
+
)
|
1596 |
+
|
1597 |
+
sequence_output = outputs[0]
|
1598 |
+
|
1599 |
+
logits = self.qa_outputs(sequence_output)
|
1600 |
+
start_logits, end_logits = logits.split(1, dim=-1)
|
1601 |
+
start_logits = start_logits.squeeze(-1)
|
1602 |
+
end_logits = end_logits.squeeze(-1)
|
1603 |
+
|
1604 |
+
outputs = (start_logits, end_logits,) + outputs[2:]
|
1605 |
+
if start_positions is not None and end_positions is not None:
|
1606 |
+
# If we are on multi-GPU, split add a dimension
|
1607 |
+
if len(start_positions.size()) > 1:
|
1608 |
+
start_positions = start_positions.squeeze(-1)
|
1609 |
+
if len(end_positions.size()) > 1:
|
1610 |
+
end_positions = end_positions.squeeze(-1)
|
1611 |
+
# sometimes the start/end positions are outside our model inputs, we ignore these terms
|
1612 |
+
ignored_index = start_logits.size(1)
|
1613 |
+
start_positions.clamp_(0, ignored_index)
|
1614 |
+
end_positions.clamp_(0, ignored_index)
|
1615 |
+
|
1616 |
+
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
|
1617 |
+
start_loss = loss_fct(start_logits, start_positions)
|
1618 |
+
end_loss = loss_fct(end_logits, end_positions)
|
1619 |
+
total_loss = (start_loss + end_loss) / 2
|
1620 |
+
outputs = (total_loss,) + outputs
|
1621 |
+
|
1622 |
+
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
|
src/reference_code/evaluate_embeddings.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.utils.data import DataLoader
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch
|
4 |
+
import numpy
|
5 |
+
|
6 |
+
import pickle
|
7 |
+
import tqdm
|
8 |
+
|
9 |
+
from ..bert import BERT
|
10 |
+
from ..vocab import Vocab
|
11 |
+
from ..dataset import TokenizerDataset
|
12 |
+
import argparse
|
13 |
+
from itertools import combinations
|
14 |
+
|
15 |
+
def generate_subset(s):
|
16 |
+
subsets = []
|
17 |
+
for r in range(len(s) + 1):
|
18 |
+
combinations_result = combinations(s, r)
|
19 |
+
if r==1:
|
20 |
+
subsets.extend(([item] for sublist in combinations_result for item in sublist))
|
21 |
+
else:
|
22 |
+
subsets.extend((list(sublist) for sublist in combinations_result))
|
23 |
+
subsets_dict = {i:s for i, s in enumerate(subsets)}
|
24 |
+
return subsets_dict
|
25 |
+
|
26 |
+
if __name__ == "__main__":
|
27 |
+
parser = argparse.ArgumentParser()
|
28 |
+
|
29 |
+
parser.add_argument('-workspace_name', type=str, default=None)
|
30 |
+
parser.add_argument("-seq_len", type=int, default=100, help="maximum sequence length")
|
31 |
+
parser.add_argument('-pretrain', type=bool, default=False)
|
32 |
+
parser.add_argument('-masked_pred', type=bool, default=False)
|
33 |
+
parser.add_argument('-epoch', type=str, default=None)
|
34 |
+
# parser.add_argument('-set_label', type=bool, default=False)
|
35 |
+
# parser.add_argument('--label_standard', nargs='+', type=str, help='List of optional tasks')
|
36 |
+
|
37 |
+
options = parser.parse_args()
|
38 |
+
|
39 |
+
folder_path = options.workspace_name+"/" if options.workspace_name else ""
|
40 |
+
|
41 |
+
# if options.set_label:
|
42 |
+
# label_standard = generate_subset({'optional-tasks-1', 'optional-tasks-2'})
|
43 |
+
# pickle.dump(label_standard, open(f"{folder_path}pretraining/pretrain_opt_label.pkl", "wb"))
|
44 |
+
# else:
|
45 |
+
# label_standard = pickle.load(open(f"{folder_path}pretraining/pretrain_opt_label.pkl", "rb"))
|
46 |
+
# print(f"options.label _standard: {options.label_standard}")
|
47 |
+
vocab_path = f"{folder_path}check/pretraining/vocab.txt"
|
48 |
+
# vocab_path = f"{folder_path}pretraining/vocab.txt"
|
49 |
+
|
50 |
+
|
51 |
+
print("Loading Vocab", vocab_path)
|
52 |
+
vocab_obj = Vocab(vocab_path)
|
53 |
+
vocab_obj.load_vocab()
|
54 |
+
print("Vocab Size: ", len(vocab_obj.vocab))
|
55 |
+
|
56 |
+
# label_standard = list(pickle.load(open(f"dataset/CL4999_1920/{options.workspace_name}/unique_problems_list.pkl", "rb")))
|
57 |
+
# label_standard = generate_subset({'optional-tasks-1', 'optional-tasks-2', 'OptionalTask_1', 'OptionalTask_2'})
|
58 |
+
# pickle.dump(label_standard, open(f"{folder_path}pretraining/pretrain_opt_label.pkl", "wb"))
|
59 |
+
|
60 |
+
if options.masked_pred:
|
61 |
+
str_code = "masked_prediction"
|
62 |
+
output_name = f"{folder_path}output/bert_trained.seq_model.ep{options.epoch}"
|
63 |
+
else:
|
64 |
+
str_code = "masked"
|
65 |
+
output_name = f"{folder_path}output/bert_trained.seq_encoder.model.ep{options.epoch}"
|
66 |
+
|
67 |
+
folder_path = folder_path+"check/"
|
68 |
+
# folder_path = folder_path
|
69 |
+
if options.pretrain:
|
70 |
+
pretrain_file = f"{folder_path}pretraining/pretrain.txt"
|
71 |
+
pretrain_label = f"{folder_path}pretraining/pretrain_opt.pkl"
|
72 |
+
|
73 |
+
# pretrain_file = f"{folder_path}finetuning/train.txt"
|
74 |
+
# pretrain_label = f"{folder_path}finetuning/train_label.txt"
|
75 |
+
|
76 |
+
embedding_file_path = f"{folder_path}embeddings/pretrain_embeddings_{str_code}_{options.epoch}.pkl"
|
77 |
+
print("Loading Pretrain Dataset ", pretrain_file)
|
78 |
+
pretrain_dataset = TokenizerDataset(pretrain_file, pretrain_label, vocab_obj, seq_len=options.seq_len)
|
79 |
+
|
80 |
+
print("Creating Dataloader")
|
81 |
+
pretrain_data_loader = DataLoader(pretrain_dataset, batch_size=32, num_workers=4)
|
82 |
+
else:
|
83 |
+
val_file = f"{folder_path}pretraining/test.txt"
|
84 |
+
val_label = f"{folder_path}pretraining/test_opt.txt"
|
85 |
+
|
86 |
+
# val_file = f"{folder_path}finetuning/test.txt"
|
87 |
+
# val_label = f"{folder_path}finetuning/test_label.txt"
|
88 |
+
embedding_file_path = f"{folder_path}embeddings/test_embeddings_{str_code}_{options.epoch}.pkl"
|
89 |
+
|
90 |
+
print("Loading Validation Dataset ", val_file)
|
91 |
+
val_dataset = TokenizerDataset(val_file, val_label, vocab_obj, seq_len=options.seq_len)
|
92 |
+
|
93 |
+
print("Creating Dataloader")
|
94 |
+
val_data_loader = DataLoader(val_dataset, batch_size=32, num_workers=4)
|
95 |
+
|
96 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
97 |
+
print(device)
|
98 |
+
print("Load Pre-trained BERT model...")
|
99 |
+
print(output_name)
|
100 |
+
bert = torch.load(output_name, map_location=device)
|
101 |
+
# learned_parameters = model_ep0.state_dict()
|
102 |
+
for param in bert.parameters():
|
103 |
+
param.requires_grad = False
|
104 |
+
|
105 |
+
if options.pretrain:
|
106 |
+
print("Pretrain-embeddings....")
|
107 |
+
data_iter = tqdm.tqdm(enumerate(pretrain_data_loader),
|
108 |
+
desc="pre-train",
|
109 |
+
total=len(pretrain_data_loader),
|
110 |
+
bar_format="{l_bar}{r_bar}")
|
111 |
+
pretrain_embeddings = []
|
112 |
+
for i, data in data_iter:
|
113 |
+
data = {key: value.to(device) for key, value in data.items()}
|
114 |
+
hrep = bert(data["bert_input"], data["segment_label"])
|
115 |
+
# print(hrep[:,0].cpu().detach().numpy())
|
116 |
+
embeddings = [h for h in hrep[:,0].cpu().detach().numpy()]
|
117 |
+
pretrain_embeddings.extend(embeddings)
|
118 |
+
pickle.dump(pretrain_embeddings, open(embedding_file_path,"wb"))
|
119 |
+
# pickle.dump(pretrain_embeddings, open("embeddings/finetune_cfa_train_embeddings.pkl","wb"))
|
120 |
+
|
121 |
+
else:
|
122 |
+
print("Validation-embeddings....")
|
123 |
+
data_iter = tqdm.tqdm(enumerate(val_data_loader),
|
124 |
+
desc="validation",
|
125 |
+
total=len(val_data_loader),
|
126 |
+
bar_format="{l_bar}{r_bar}")
|
127 |
+
val_embeddings = []
|
128 |
+
for i, data in data_iter:
|
129 |
+
data = {key: value.to(device) for key, value in data.items()}
|
130 |
+
hrep = bert(data["bert_input"], data["segment_label"])
|
131 |
+
# print(,hrep[:,0].shape)
|
132 |
+
embeddings = [h for h in hrep[:,0].cpu().detach().numpy()]
|
133 |
+
val_embeddings.extend(embeddings)
|
134 |
+
pickle.dump(val_embeddings, open(embedding_file_path,"wb"))
|
135 |
+
# pickle.dump(val_embeddings, open("embeddings/finetune_cfa_test_embeddings.pkl","wb"))
|
136 |
+
|
src/reference_code/metrics.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from scipy.special import softmax
|
3 |
+
|
4 |
+
|
5 |
+
class CELoss(object):
|
6 |
+
|
7 |
+
def compute_bin_boundaries(self, probabilities = np.array([])):
|
8 |
+
|
9 |
+
#uniform bin spacing
|
10 |
+
if probabilities.size == 0:
|
11 |
+
bin_boundaries = np.linspace(0, 1, self.n_bins + 1)
|
12 |
+
self.bin_lowers = bin_boundaries[:-1]
|
13 |
+
self.bin_uppers = bin_boundaries[1:]
|
14 |
+
else:
|
15 |
+
#size of bins
|
16 |
+
bin_n = int(self.n_data/self.n_bins)
|
17 |
+
|
18 |
+
bin_boundaries = np.array([])
|
19 |
+
|
20 |
+
probabilities_sort = np.sort(probabilities)
|
21 |
+
|
22 |
+
for i in range(0,self.n_bins):
|
23 |
+
bin_boundaries = np.append(bin_boundaries,probabilities_sort[i*bin_n])
|
24 |
+
bin_boundaries = np.append(bin_boundaries,1.0)
|
25 |
+
|
26 |
+
self.bin_lowers = bin_boundaries[:-1]
|
27 |
+
self.bin_uppers = bin_boundaries[1:]
|
28 |
+
|
29 |
+
|
30 |
+
def get_probabilities(self, output, labels, logits):
|
31 |
+
#If not probabilities apply softmax!
|
32 |
+
if logits:
|
33 |
+
self.probabilities = softmax(output, axis=1)
|
34 |
+
else:
|
35 |
+
self.probabilities = output
|
36 |
+
|
37 |
+
self.labels = np.argmax(labels, axis=1)
|
38 |
+
self.confidences = np.max(self.probabilities, axis=1)
|
39 |
+
self.predictions = np.argmax(self.probabilities, axis=1)
|
40 |
+
self.accuracies = np.equal(self.predictions, self.labels)
|
41 |
+
|
42 |
+
def binary_matrices(self):
|
43 |
+
idx = np.arange(self.n_data)
|
44 |
+
#make matrices of zeros
|
45 |
+
pred_matrix = np.zeros([self.n_data,self.n_class])
|
46 |
+
label_matrix = np.zeros([self.n_data,self.n_class])
|
47 |
+
#self.acc_matrix = np.zeros([self.n_data,self.n_class])
|
48 |
+
pred_matrix[idx,self.predictions] = 1
|
49 |
+
label_matrix[idx,self.labels] = 1
|
50 |
+
|
51 |
+
self.acc_matrix = np.equal(pred_matrix, label_matrix)
|
52 |
+
|
53 |
+
|
54 |
+
def compute_bins(self, index = None):
|
55 |
+
self.bin_prop = np.zeros(self.n_bins)
|
56 |
+
self.bin_acc = np.zeros(self.n_bins)
|
57 |
+
self.bin_conf = np.zeros(self.n_bins)
|
58 |
+
self.bin_score = np.zeros(self.n_bins)
|
59 |
+
|
60 |
+
if index == None:
|
61 |
+
confidences = self.confidences
|
62 |
+
accuracies = self.accuracies
|
63 |
+
else:
|
64 |
+
confidences = self.probabilities[:,index]
|
65 |
+
accuracies = self.acc_matrix[:,index]
|
66 |
+
|
67 |
+
|
68 |
+
for i, (bin_lower, bin_upper) in enumerate(zip(self.bin_lowers, self.bin_uppers)):
|
69 |
+
# Calculated |confidence - accuracy| in each bin
|
70 |
+
in_bin = np.greater(confidences,bin_lower.item()) * np.less_equal(confidences,bin_upper.item())
|
71 |
+
self.bin_prop[i] = np.mean(in_bin)
|
72 |
+
|
73 |
+
if self.bin_prop[i].item() > 0:
|
74 |
+
self.bin_acc[i] = np.mean(accuracies[in_bin])
|
75 |
+
self.bin_conf[i] = np.mean(confidences[in_bin])
|
76 |
+
self.bin_score[i] = np.abs(self.bin_conf[i] - self.bin_acc[i])
|
77 |
+
|
78 |
+
class MaxProbCELoss(CELoss):
|
79 |
+
def loss(self, output, labels, n_bins = 15, logits = True):
|
80 |
+
self.n_bins = n_bins
|
81 |
+
super().compute_bin_boundaries()
|
82 |
+
super().get_probabilities(output, labels, logits)
|
83 |
+
super().compute_bins()
|
84 |
+
|
85 |
+
#http://people.cs.pitt.edu/~milos/research/AAAI_Calibration.pdf
|
86 |
+
class ECELoss(MaxProbCELoss):
|
87 |
+
|
88 |
+
def loss(self, output, labels, n_bins = 15, logits = True):
|
89 |
+
super().loss(output, labels, n_bins, logits)
|
90 |
+
return np.dot(self.bin_prop,self.bin_score)
|
91 |
+
|
92 |
+
class MCELoss(MaxProbCELoss):
|
93 |
+
|
94 |
+
def loss(self, output, labels, n_bins = 15, logits = True):
|
95 |
+
super().loss(output, labels, n_bins, logits)
|
96 |
+
return np.max(self.bin_score)
|
97 |
+
|
98 |
+
#https://arxiv.org/abs/1905.11001
|
99 |
+
#Overconfidence Loss (Good in high risk applications where confident but wrong predictions can be especially harmful)
|
100 |
+
class OELoss(MaxProbCELoss):
|
101 |
+
|
102 |
+
def loss(self, output, labels, n_bins = 15, logits = True):
|
103 |
+
super().loss(output, labels, n_bins, logits)
|
104 |
+
return np.dot(self.bin_prop,self.bin_conf * np.maximum(self.bin_conf-self.bin_acc,np.zeros(self.n_bins)))
|
105 |
+
|
106 |
+
|
107 |
+
#https://arxiv.org/abs/1904.01685
|
108 |
+
class SCELoss(CELoss):
|
109 |
+
|
110 |
+
def loss(self, output, labels, n_bins = 15, logits = True):
|
111 |
+
sce = 0.0
|
112 |
+
self.n_bins = n_bins
|
113 |
+
self.n_data = len(output)
|
114 |
+
self.n_class = len(output[0])
|
115 |
+
|
116 |
+
super().compute_bin_boundaries()
|
117 |
+
super().get_probabilities(output, labels, logits)
|
118 |
+
super().binary_matrices()
|
119 |
+
|
120 |
+
for i in range(self.n_class):
|
121 |
+
super().compute_bins(i)
|
122 |
+
sce += np.dot(self.bin_prop,self.bin_score)
|
123 |
+
|
124 |
+
return sce/self.n_class
|
125 |
+
|
126 |
+
class TACELoss(CELoss):
|
127 |
+
|
128 |
+
def loss(self, output, labels, threshold = 0.01, n_bins = 15, logits = True):
|
129 |
+
tace = 0.0
|
130 |
+
self.n_bins = n_bins
|
131 |
+
self.n_data = len(output)
|
132 |
+
self.n_class = len(output[0])
|
133 |
+
|
134 |
+
super().get_probabilities(output, labels, logits)
|
135 |
+
self.probabilities[self.probabilities < threshold] = 0
|
136 |
+
super().binary_matrices()
|
137 |
+
|
138 |
+
for i in range(self.n_class):
|
139 |
+
super().compute_bin_boundaries(self.probabilities[:,i])
|
140 |
+
super().compute_bins(i)
|
141 |
+
tace += np.dot(self.bin_prop,self.bin_score)
|
142 |
+
|
143 |
+
return tace/self.n_class
|
144 |
+
|
145 |
+
#create TACELoss with threshold fixed at 0
|
146 |
+
class ACELoss(TACELoss):
|
147 |
+
|
148 |
+
def loss(self, output, labels, n_bins = 15, logits = True):
|
149 |
+
return super().loss(output, labels, 0.0 , n_bins, logits)
|
src/reference_code/pretrainer-old.py
ADDED
@@ -0,0 +1,696 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from torch.nn import functional as F
|
4 |
+
from torch.optim import Adam, SGD
|
5 |
+
from torch.utils.data import DataLoader
|
6 |
+
import pickle
|
7 |
+
|
8 |
+
from ..bert import BERT
|
9 |
+
from ..seq_model import BERTSM
|
10 |
+
from ..classifier_model import BERTForClassification
|
11 |
+
from ..optim_schedule import ScheduledOptim
|
12 |
+
|
13 |
+
import tqdm
|
14 |
+
import sys
|
15 |
+
import time
|
16 |
+
|
17 |
+
import numpy as np
|
18 |
+
# import visualization
|
19 |
+
|
20 |
+
from sklearn.metrics import precision_score, recall_score, f1_score
|
21 |
+
|
22 |
+
import matplotlib.pyplot as plt
|
23 |
+
import seaborn as sns
|
24 |
+
import pandas as pd
|
25 |
+
from collections import defaultdict
|
26 |
+
import os
|
27 |
+
|
28 |
+
class ECE(nn.Module):
|
29 |
+
|
30 |
+
def __init__(self, n_bins=15):
|
31 |
+
"""
|
32 |
+
n_bins (int): number of confidence interval bins
|
33 |
+
"""
|
34 |
+
super(ECE, self).__init__()
|
35 |
+
bin_boundaries = torch.linspace(0, 1, n_bins + 1)
|
36 |
+
self.bin_lowers = bin_boundaries[:-1]
|
37 |
+
self.bin_uppers = bin_boundaries[1:]
|
38 |
+
|
39 |
+
def forward(self, logits, labels):
|
40 |
+
softmaxes = F.softmax(logits, dim=1)
|
41 |
+
confidences, predictions = torch.max(softmaxes, 1)
|
42 |
+
labels = torch.argmax(labels,1)
|
43 |
+
accuracies = predictions.eq(labels)
|
44 |
+
|
45 |
+
ece = torch.zeros(1, device=logits.device)
|
46 |
+
for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
|
47 |
+
# Calculated |confidence - accuracy| in each bin
|
48 |
+
in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
|
49 |
+
prop_in_bin = in_bin.float().mean()
|
50 |
+
if prop_in_bin.item() > 0:
|
51 |
+
accuracy_in_bin = accuracies[in_bin].float().mean()
|
52 |
+
avg_confidence_in_bin = confidences[in_bin].mean()
|
53 |
+
ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
|
54 |
+
|
55 |
+
return ece
|
56 |
+
|
57 |
+
def accurate_nb(preds, labels):
|
58 |
+
pred_flat = np.argmax(preds, axis=1).flatten()
|
59 |
+
labels_flat = np.argmax(labels, axis=1).flatten()
|
60 |
+
labels_flat = labels.flatten()
|
61 |
+
return np.sum(pred_flat == labels_flat)
|
62 |
+
|
63 |
+
class BERTTrainer:
|
64 |
+
"""
|
65 |
+
BERTTrainer pretrains BERT model on input sequence of strategies.
|
66 |
+
BERTTrainer make the pretrained BERT model with one training method objective.
|
67 |
+
1. Masked Strategy Modelling : 3.3.1 Task #1: Masked SM
|
68 |
+
"""
|
69 |
+
|
70 |
+
def __init__(self, bert: BERT, vocab_size: int,
|
71 |
+
train_dataloader: DataLoader, val_dataloader: DataLoader = None, test_dataloader: DataLoader = None,
|
72 |
+
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=5000,
|
73 |
+
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, same_student_prediction = False,
|
74 |
+
workspace_name=None, code=None):
|
75 |
+
"""
|
76 |
+
:param bert: BERT model which you want to train
|
77 |
+
:param vocab_size: total word vocab size
|
78 |
+
:param train_dataloader: train dataset data loader
|
79 |
+
:param test_dataloader: test dataset data loader [can be None]
|
80 |
+
:param lr: learning rate of optimizer
|
81 |
+
:param betas: Adam optimizer betas
|
82 |
+
:param weight_decay: Adam optimizer weight decay param
|
83 |
+
:param with_cuda: traning with cuda
|
84 |
+
:param log_freq: logging frequency of the batch iteration
|
85 |
+
"""
|
86 |
+
|
87 |
+
cuda_condition = torch.cuda.is_available() and with_cuda
|
88 |
+
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
89 |
+
print(cuda_condition, " Device used = ", self.device)
|
90 |
+
|
91 |
+
available_gpus = list(range(torch.cuda.device_count()))
|
92 |
+
|
93 |
+
# This BERT model will be saved every epoch
|
94 |
+
self.bert = bert.to(self.device)
|
95 |
+
# Initialize the BERT Language Model, with BERT model
|
96 |
+
self.model = BERTSM(bert, vocab_size).to(self.device)
|
97 |
+
|
98 |
+
# Distributed GPU training if CUDA can detect more than 1 GPU
|
99 |
+
if with_cuda and torch.cuda.device_count() > 1:
|
100 |
+
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
101 |
+
self.model = nn.DataParallel(self.model, device_ids=available_gpus)
|
102 |
+
|
103 |
+
# Setting the train and test data loader
|
104 |
+
self.train_data = train_dataloader
|
105 |
+
self.val_data = val_dataloader
|
106 |
+
self.test_data = test_dataloader
|
107 |
+
|
108 |
+
# Setting the Adam optimizer with hyper-param
|
109 |
+
self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
|
110 |
+
self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
|
111 |
+
|
112 |
+
# Using Negative Log Likelihood Loss function for predicting the masked_token
|
113 |
+
self.criterion = nn.NLLLoss(ignore_index=0)
|
114 |
+
|
115 |
+
self.log_freq = log_freq
|
116 |
+
self.same_student_prediction = same_student_prediction
|
117 |
+
self.workspace_name = workspace_name
|
118 |
+
self.save_model = False
|
119 |
+
self.code = code
|
120 |
+
self.avg_loss = 10000
|
121 |
+
self.start_time = time.time()
|
122 |
+
|
123 |
+
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
124 |
+
|
125 |
+
def train(self, epoch):
|
126 |
+
self.iteration(epoch, self.train_data)
|
127 |
+
|
128 |
+
def val(self, epoch):
|
129 |
+
self.iteration(epoch, self.val_data, phase="val")
|
130 |
+
|
131 |
+
def test(self, epoch):
|
132 |
+
self.iteration(epoch, self.test_data, phase="test")
|
133 |
+
|
134 |
+
def iteration(self, epoch, data_loader, phase="train"):
|
135 |
+
"""
|
136 |
+
loop over the data_loader for training or testing
|
137 |
+
if on train status, backward operation is activated
|
138 |
+
and also auto save the model every peoch
|
139 |
+
|
140 |
+
:param epoch: current epoch index
|
141 |
+
:param data_loader: torch.utils.data.DataLoader for iteration
|
142 |
+
:param train: boolean value of is train or test
|
143 |
+
:return: None
|
144 |
+
"""
|
145 |
+
# str_code = "train" if train else "test"
|
146 |
+
# code = "masked_prediction" if self.same_student_prediction else "masked"
|
147 |
+
|
148 |
+
self.log_file = f"{self.workspace_name}/logs/{self.code}/log_{phase}_pretrained.txt"
|
149 |
+
# bert_hidden_representations = []
|
150 |
+
if epoch == 0:
|
151 |
+
f = open(self.log_file, 'w')
|
152 |
+
f.close()
|
153 |
+
if phase == "val":
|
154 |
+
self.avg_loss = 10000
|
155 |
+
# Setting the tqdm progress bar
|
156 |
+
data_iter = tqdm.tqdm(enumerate(data_loader),
|
157 |
+
desc="EP_%s:%d" % (phase, epoch),
|
158 |
+
total=len(data_loader),
|
159 |
+
bar_format="{l_bar}{r_bar}")
|
160 |
+
|
161 |
+
avg_loss_mask = 0.0
|
162 |
+
total_correct_mask = 0
|
163 |
+
total_element_mask = 0
|
164 |
+
|
165 |
+
avg_loss_pred = 0.0
|
166 |
+
total_correct_pred = 0
|
167 |
+
total_element_pred = 0
|
168 |
+
|
169 |
+
avg_loss = 0.0
|
170 |
+
|
171 |
+
if phase == "train":
|
172 |
+
self.model.train()
|
173 |
+
else:
|
174 |
+
self.model.eval()
|
175 |
+
with open(self.log_file, 'a') as f:
|
176 |
+
sys.stdout = f
|
177 |
+
for i, data in data_iter:
|
178 |
+
# 0. batch_data will be sent into the device(GPU or cpu)
|
179 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
180 |
+
# if i == 0:
|
181 |
+
# print(f"data : {data[0]}")
|
182 |
+
# 1. forward the next_sentence_prediction and masked_lm model
|
183 |
+
# next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])
|
184 |
+
if self.same_student_prediction:
|
185 |
+
bert_hidden_rep, mask_lm_output, same_student_output = self.model.forward(data["bert_input"], data["segment_label"], self.same_student_prediction)
|
186 |
+
else:
|
187 |
+
bert_hidden_rep, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"], self.same_student_prediction)
|
188 |
+
|
189 |
+
# embeddings = [h for h in bert_hidden_rep.cpu().detach().numpy()]
|
190 |
+
# bert_hidden_representations.extend(embeddings)
|
191 |
+
|
192 |
+
|
193 |
+
# 2-2. NLLLoss of predicting masked token word
|
194 |
+
mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
|
195 |
+
|
196 |
+
# 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
|
197 |
+
if self.same_student_prediction:
|
198 |
+
# 2-1. NLL(negative log likelihood) loss of is_next classification result
|
199 |
+
same_student_loss = self.criterion(same_student_output, data["is_same_student"])
|
200 |
+
loss = same_student_loss + mask_loss
|
201 |
+
else:
|
202 |
+
loss = mask_loss
|
203 |
+
|
204 |
+
# 3. backward and optimization only in train
|
205 |
+
if phase == "train":
|
206 |
+
self.optim_schedule.zero_grad()
|
207 |
+
loss.backward()
|
208 |
+
self.optim_schedule.step_and_update_lr()
|
209 |
+
|
210 |
+
|
211 |
+
# print(f"mask_lm_output : {mask_lm_output}")
|
212 |
+
# non_zero_mask = (data["bert_label"] != 0).float()
|
213 |
+
# print(f"bert_label : {data['bert_label']}")
|
214 |
+
non_zero_mask = (data["bert_label"] != 0).float()
|
215 |
+
predictions = torch.argmax(mask_lm_output, dim=-1)
|
216 |
+
# print(f"predictions : {predictions}")
|
217 |
+
predicted_masked = predictions*non_zero_mask
|
218 |
+
# print(f"predicted_masked : {predicted_masked}")
|
219 |
+
mask_correct = ((data["bert_label"] == predicted_masked)*non_zero_mask).sum().item()
|
220 |
+
# print(f"mask_correct : {mask_correct}")
|
221 |
+
# print(f"non_zero_mask.sum().item() : {non_zero_mask.sum().item()}")
|
222 |
+
|
223 |
+
avg_loss_mask += loss.item()
|
224 |
+
total_correct_mask += mask_correct
|
225 |
+
total_element_mask += non_zero_mask.sum().item()
|
226 |
+
# total_element_mask += data["bert_label"].sum().item()
|
227 |
+
|
228 |
+
torch.cuda.empty_cache()
|
229 |
+
post_fix = {
|
230 |
+
"epoch": epoch,
|
231 |
+
"iter": i,
|
232 |
+
"avg_loss": avg_loss_mask / (i + 1),
|
233 |
+
"avg_acc_mask": (total_correct_mask / total_element_mask * 100) if total_element_mask != 0 else 0,
|
234 |
+
"loss": loss.item()
|
235 |
+
}
|
236 |
+
|
237 |
+
# next sentence prediction accuracy
|
238 |
+
if self.same_student_prediction:
|
239 |
+
correct = same_student_output.argmax(dim=-1).eq(data["is_same_student"]).sum().item()
|
240 |
+
avg_loss_pred += loss.item()
|
241 |
+
total_correct_pred += correct
|
242 |
+
total_element_pred += data["is_same_student"].nelement()
|
243 |
+
# correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
|
244 |
+
post_fix["avg_loss"] = avg_loss_pred / (i + 1)
|
245 |
+
post_fix["avg_acc_pred"] = total_correct_pred / total_element_pred * 100
|
246 |
+
post_fix["loss"] = loss.item()
|
247 |
+
|
248 |
+
avg_loss += loss.item()
|
249 |
+
|
250 |
+
if i % self.log_freq == 0:
|
251 |
+
data_iter.write(str(post_fix))
|
252 |
+
# if not train and epoch > 20 :
|
253 |
+
# pickle.dump(mask_lm_output.cpu().detach().numpy(), open(f"logs/mask/mask_out_e{epoch}_{i}.pkl","wb"))
|
254 |
+
# pickle.dump(data["bert_label"].cpu().detach().numpy(), open(f"logs/mask/label_e{epoch}_{i}.pkl","wb"))
|
255 |
+
end_time = time.time()
|
256 |
+
final_msg = {
|
257 |
+
"epoch": f"EP{epoch}_{phase}",
|
258 |
+
"avg_loss": avg_loss / len(data_iter),
|
259 |
+
"total_masked_acc": total_correct_mask * 100.0 / total_element_mask if total_element_mask != 0 else 0,
|
260 |
+
"time_taken_from_start": end_time - self.start_time
|
261 |
+
}
|
262 |
+
|
263 |
+
if self.same_student_prediction:
|
264 |
+
final_msg["total_prediction_acc"] = total_correct_pred * 100.0 / total_element_pred
|
265 |
+
|
266 |
+
print(final_msg)
|
267 |
+
|
268 |
+
f.close()
|
269 |
+
sys.stdout = sys.__stdout__
|
270 |
+
|
271 |
+
if phase == "val":
|
272 |
+
self.save_model = False
|
273 |
+
if self.avg_loss > (avg_loss / len(data_iter)):
|
274 |
+
self.save_model = True
|
275 |
+
self.avg_loss = (avg_loss / len(data_iter))
|
276 |
+
|
277 |
+
# pickle.dump(bert_hidden_representations, open(f"embeddings/{code}/{str_code}_embeddings_{epoch}.pkl","wb"))
|
278 |
+
|
279 |
+
|
280 |
+
|
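As a quick illustration of the masked-token accuracy bookkeeping in iteration() above, here is a minimal, self-contained sketch on toy tensors (the label and logit values are made up for illustration and are not repository data): positions whose label is 0 count as unmasked and are excluded from both the numerator and the denominator.

```python
import torch

# Toy illustration of the masked-accuracy bookkeeping in iteration():
# a label of 0 marks an unmasked position and is excluded from the statistics.
bert_label = torch.tensor([[0, 5, 0, 7],
                           [3, 0, 0, 0]])        # hypothetical masked-LM labels
mask_lm_output = torch.randn(2, 4, 10)           # hypothetical (batch, seq, vocab) logits

non_zero_mask = (bert_label != 0).float()
predictions = torch.argmax(mask_lm_output, dim=-1)
predicted_masked = predictions * non_zero_mask
mask_correct = ((bert_label == predicted_masked) * non_zero_mask).sum().item()

masked_acc = mask_correct / non_zero_mask.sum().item()
print("accuracy over masked positions only:", masked_acc)
```

Because only masked positions contribute, the reported accuracy is not inflated by the trivially "correct" unmasked tokens.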
281 |
+
def save(self, epoch, file_path="output/bert_trained.model"):
|
282 |
+
"""
|
283 |
+
Saving the current BERT model on file_path
|
284 |
+
|
285 |
+
:param epoch: current epoch number
|
286 |
+
:param file_path: model output path; the saved file will be file_path + ".ep%d" % epoch
|
287 |
+
:return: final_output_path
|
288 |
+
"""
|
289 |
+
# if self.code:
|
290 |
+
# fpath = file_path.split("/")
|
291 |
+
# # output_path = fpath[0]+ "/"+ fpath[1]+f"/{self.code}/" + fpath[2] + ".ep%d" % epoch
|
292 |
+
# output_path = "/",join(fpath[0]+ "/"+ fpath[1]+f"/{self.code}/" + fpath[-1] + ".ep%d" % epoch
|
293 |
+
|
294 |
+
# else:
|
295 |
+
output_path = file_path + ".ep%d" % epoch
|
296 |
+
|
297 |
+
torch.save(self.bert.cpu(), output_path)
|
298 |
+
self.bert.to(self.device)
|
299 |
+
print("EP:%d Model Saved on:" % epoch, output_path)
|
300 |
+
return output_path
|
301 |
+
|
302 |
+
|
303 |
+
class BERTFineTuneTrainer:
|
304 |
+
|
305 |
+
def __init__(self, bert: BERT, vocab_size: int,
|
306 |
+
train_dataloader: DataLoader, test_dataloader: DataLoader = None,
|
307 |
+
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
308 |
+
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None,
|
309 |
+
num_labels=2, finetune_task=""):
|
310 |
+
"""
|
311 |
+
:param bert: BERT model which you want to train
|
312 |
+
:param vocab_size: total word vocab size
|
313 |
+
:param train_dataloader: train dataset data loader
|
314 |
+
:param test_dataloader: test dataset data loader [can be None]
|
315 |
+
:param lr: learning rate of optimizer
|
316 |
+
:param betas: Adam optimizer betas
|
317 |
+
:param weight_decay: Adam optimizer weight decay param
|
318 |
+
:param with_cuda: training with CUDA
|
319 |
+
:param log_freq: logging frequency of the batch iteration
|
320 |
+
"""
|
321 |
+
|
322 |
+
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
323 |
+
cuda_condition = torch.cuda.is_available() and with_cuda
|
324 |
+
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
325 |
+
print(with_cuda, cuda_condition, " Device used = ", self.device)
|
326 |
+
|
327 |
+
# This BERT model will be saved every epoch
|
328 |
+
self.bert = bert
|
329 |
+
for param in self.bert.parameters():
|
330 |
+
param.requires_grad = False
|
331 |
+
# Initialize the BERT Language Model, with BERT model
|
332 |
+
self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
|
333 |
+
|
334 |
+
# Distributed GPU training if CUDA can detect more than 1 GPU
|
335 |
+
if with_cuda and torch.cuda.device_count() > 1:
|
336 |
+
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
337 |
+
self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
|
338 |
+
|
339 |
+
# Setting the train and test data loader
|
340 |
+
self.train_data = train_dataloader
|
341 |
+
self.test_data = test_dataloader
|
342 |
+
|
343 |
+
self.optim = Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay) #, eps=1e-9
|
344 |
+
# self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
|
345 |
+
|
346 |
+
if num_labels == 1:
|
347 |
+
self.criterion = nn.MSELoss()
|
348 |
+
elif num_labels == 2:
|
349 |
+
self.criterion = nn.BCEWithLogitsLoss()
|
350 |
+
# self.criterion = nn.CrossEntropyLoss()
|
351 |
+
elif num_labels > 2:
|
352 |
+
self.criterion = nn.CrossEntropyLoss()
|
353 |
+
# self.criterion = nn.BCEWithLogitsLoss()
|
354 |
+
|
355 |
+
# self.ece_criterion = ECE().to(self.device)
|
356 |
+
|
357 |
+
self.log_freq = log_freq
|
358 |
+
self.workspace_name = workspace_name
|
359 |
+
self.finetune_task = finetune_task
|
360 |
+
self.save_model = False
|
361 |
+
self.avg_loss = 10000
|
362 |
+
self.start_time = time.time()
|
363 |
+
self.probability_list = []
|
364 |
+
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
365 |
+
|
366 |
+
def train(self, epoch):
|
367 |
+
self.iteration(epoch, self.train_data)
|
368 |
+
|
369 |
+
def test(self, epoch):
|
370 |
+
self.iteration(epoch, self.test_data, train=False)
|
371 |
+
|
372 |
+
def iteration(self, epoch, data_loader, train=True):
|
373 |
+
"""
|
374 |
+
loop over the data_loader for training or testing
|
375 |
+
if on train status, backward operation is activated
|
376 |
+
and the model is auto-saved every epoch
|
377 |
+
|
378 |
+
:param epoch: current epoch index
|
379 |
+
:param data_loader: torch.utils.data.DataLoader for iteration
|
380 |
+
:param train: boolean value of is train or test
|
381 |
+
:return: None
|
382 |
+
"""
|
383 |
+
str_code = "train" if train else "test"
|
384 |
+
|
385 |
+
self.log_file = f"{self.workspace_name}/logs/{self.finetune_task}/log_{str_code}_finetuned.txt"
|
386 |
+
|
387 |
+
if epoch == 0:
|
388 |
+
f = open(self.log_file, 'w')
|
389 |
+
f.close()
|
390 |
+
if not train:
|
391 |
+
self.avg_loss = 10000
|
392 |
+
|
393 |
+
# Setting the tqdm progress bar
|
394 |
+
data_iter = tqdm.tqdm(enumerate(data_loader),
|
395 |
+
desc="EP_%s:%d" % (str_code, epoch),
|
396 |
+
total=len(data_loader),
|
397 |
+
bar_format="{l_bar}{r_bar}")
|
398 |
+
|
399 |
+
avg_loss = 0.0
|
400 |
+
total_correct = 0
|
401 |
+
total_element = 0
|
402 |
+
plabels = []
|
403 |
+
tlabels = []
|
404 |
+
|
405 |
+
eval_accurate_nb = 0
|
406 |
+
nb_eval_examples = 0
|
407 |
+
logits_list = []
|
408 |
+
labels_list = []
|
409 |
+
|
410 |
+
if train:
|
411 |
+
self.model.train()
|
412 |
+
else:
|
413 |
+
self.model.eval()
|
414 |
+
self.probability_list = []
|
415 |
+
with open(self.log_file, 'a') as f:
|
416 |
+
sys.stdout = f
|
417 |
+
|
418 |
+
for i, data in data_iter:
|
419 |
+
# 0. batch_data will be sent into the device(GPU or cpu)
|
420 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
421 |
+
if train:
|
422 |
+
h_rep, logits = self.model.forward(data["bert_input"], data["segment_label"])
|
423 |
+
else:
|
424 |
+
with torch.no_grad():
|
425 |
+
h_rep, logits = self.model.forward(data["bert_input"], data["segment_label"])
|
426 |
+
# print(logits, logits.shape)
|
427 |
+
logits_list.append(logits.cpu())
|
428 |
+
labels_list.append(data["progress_status"].cpu())
|
429 |
+
# print(">>>>>>>>>>>>", progress_output)
|
430 |
+
# print(f"{epoch}---nelement--- {data['progress_status'].nelement()}")
|
431 |
+
# print(data["progress_status"].shape, logits.shape)
|
432 |
+
progress_loss = self.criterion(logits, data["progress_status"])
|
433 |
+
loss = progress_loss
|
434 |
+
|
435 |
+
if torch.cuda.device_count() > 1:
|
436 |
+
loss = loss.mean()
|
437 |
+
|
438 |
+
# 3. backward and optimization only in train
|
439 |
+
if train:
|
440 |
+
self.optim.zero_grad()
|
441 |
+
loss.backward()
|
442 |
+
# torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
|
443 |
+
self.optim.step()
|
444 |
+
|
445 |
+
# progress prediction accuracy
|
446 |
+
# correct = progress_output.argmax(dim=-1).eq(data["progress_status"]).sum().item()
|
447 |
+
probs = nn.LogSoftmax(dim=-1)(logits)
|
448 |
+
self.probability_list.append(probs)
|
449 |
+
predicted_labels = torch.argmax(probs, dim=-1)
|
450 |
+
true_labels = torch.argmax(data["progress_status"], dim=-1)
|
451 |
+
plabels.extend(predicted_labels.cpu().numpy())
|
452 |
+
tlabels.extend(true_labels.cpu().numpy())
|
453 |
+
|
454 |
+
# Compare predicted labels to true labels and calculate accuracy
|
455 |
+
correct = (predicted_labels == true_labels).sum().item()
|
456 |
+
avg_loss += loss.item()
|
457 |
+
total_correct += correct
|
458 |
+
# total_element += true_labels.nelement()
|
459 |
+
total_element += data["progress_status"].nelement()
|
460 |
+
# print(">>>>>>>>>>>>>>", predicted_labels, true_labels, correct, total_correct, total_element)
|
461 |
+
|
462 |
+
# if train:
|
463 |
+
post_fix = {
|
464 |
+
"epoch": epoch,
|
465 |
+
"iter": i,
|
466 |
+
"avg_loss": avg_loss / (i + 1),
|
467 |
+
"avg_acc": total_correct / total_element * 100,
|
468 |
+
"loss": loss.item()
|
469 |
+
}
|
470 |
+
# else:
|
471 |
+
# logits = logits.detach().cpu().numpy()
|
472 |
+
# label_ids = data["progress_status"].to('cpu').numpy()
|
473 |
+
# tmp_eval_nb = accurate_nb(logits, label_ids)
|
474 |
+
|
475 |
+
# eval_accurate_nb += tmp_eval_nb
|
476 |
+
# nb_eval_examples += label_ids.shape[0]
|
477 |
+
|
478 |
+
# # total_element += data["progress_status"].nelement()
|
479 |
+
# # avg_loss += loss.item()
|
480 |
+
|
481 |
+
# post_fix = {
|
482 |
+
# "epoch": epoch,
|
483 |
+
# "iter": i,
|
484 |
+
# "avg_loss": avg_loss / (i + 1),
|
485 |
+
# "avg_acc": tmp_eval_nb / total_element * 100,
|
486 |
+
# "loss": loss.item()
|
487 |
+
# }
|
488 |
+
|
489 |
+
|
490 |
+
if i % self.log_freq == 0:
|
491 |
+
data_iter.write(str(post_fix))
|
492 |
+
|
493 |
+
# precisions = precision_score(plabels, tlabels, average="weighted")
|
494 |
+
# recalls = recall_score(plabels, tlabels, average="weighted")
|
495 |
+
f1_scores = f1_score(plabels, tlabels, average="weighted")
|
496 |
+
# if train:
|
497 |
+
end_time = time.time()
|
498 |
+
final_msg = {
|
499 |
+
"epoch": f"EP{epoch}_{str_code}",
|
500 |
+
"avg_loss": avg_loss / len(data_iter),
|
501 |
+
"total_acc": total_correct * 100.0 / total_element,
|
502 |
+
# "precisions": precisions,
|
503 |
+
# "recalls": recalls,
|
504 |
+
"f1_scores": f1_scores,
|
505 |
+
"time_taken_from_start": end_time - self.start_time
|
506 |
+
}
|
507 |
+
# else:
|
508 |
+
# eval_accuracy = eval_accurate_nb/nb_eval_examples
|
509 |
+
|
510 |
+
# logits_ece = torch.cat(logits_list)
|
511 |
+
# labels_ece = torch.cat(labels_list)
|
512 |
+
# ece = self.ece_criterion(logits_ece, labels_ece).item()
|
513 |
+
# end_time = time.time()
|
514 |
+
# final_msg = {
|
515 |
+
# "epoch": f"EP{epoch}_{str_code}",
|
516 |
+
# "eval_accuracy": eval_accuracy,
|
517 |
+
# "ece": ece,
|
518 |
+
# "avg_loss": avg_loss / len(data_iter),
|
519 |
+
# "precisions": precisions,
|
520 |
+
# "recalls": recalls,
|
521 |
+
# "f1_scores": f1_scores,
|
522 |
+
# "time_taken_from_start": end_time - self.start_time
|
523 |
+
# }
|
524 |
+
# if self.save_model:
|
525 |
+
# conf_hist = visualization.ConfidenceHistogram()
|
526 |
+
# plt_test = conf_hist.plot(np.array(logits_ece), np.array(labels_ece), title= f"Confidence Histogram {epoch}")
|
527 |
+
# plt_test.savefig(f"{self.workspace_name}/plots/confidence_histogram/{self.finetune_task}/conf_histogram_test_{epoch}.png",bbox_inches='tight')
|
528 |
+
# plt_test.close()
|
529 |
+
|
530 |
+
# rel_diagram = visualization.ReliabilityDiagram()
|
531 |
+
# plt_test_2 = rel_diagram.plot(np.array(logits_ece), np.array(labels_ece),title=f"Reliability Diagram {epoch}")
|
532 |
+
# plt_test_2.savefig(f"{self.workspace_name}/plots/confidence_histogram/{self.finetune_task}/rel_diagram_test_{epoch}.png",bbox_inches='tight')
|
533 |
+
# plt_test_2.close()
|
534 |
+
print(final_msg)
|
535 |
+
|
536 |
+
# print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", total_correct * 100.0 / total_element)
|
537 |
+
f.close()
|
538 |
+
sys.stdout = sys.__stdout__
|
539 |
+
self.save_model = False
|
540 |
+
if self.avg_loss > (avg_loss / len(data_iter)):
|
541 |
+
self.save_model = True
|
542 |
+
self.avg_loss = (avg_loss / len(data_iter))
|
543 |
+
|
544 |
+
def iteration_1(self, epoch_idx, data):
|
545 |
+
try:
|
546 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
547 |
+
logits = self.model(data['input_ids'], data['segment_label'])
|
548 |
+
# Ensure logits is a tensor, not a tuple
|
549 |
+
loss_fct = nn.CrossEntropyLoss()
|
550 |
+
loss = loss_fct(logits, data['labels'])
|
551 |
+
|
552 |
+
# Backpropagation and optimization
|
553 |
+
self.optim.zero_grad()
|
554 |
+
loss.backward()
|
555 |
+
self.optim.step()
|
556 |
+
|
557 |
+
if self.log_freq > 0 and epoch_idx % self.log_freq == 0:
|
558 |
+
print(f"Epoch {epoch_idx}: Loss = {loss.item()}")
|
559 |
+
|
560 |
+
return loss
|
561 |
+
|
562 |
+
except Exception as e:
|
563 |
+
print(f"Error during iteration: {e}")
|
564 |
+
raise
|
565 |
+
|
566 |
+
|
567 |
+
|
568 |
+
|
569 |
+
|
570 |
+
# plt_test.show()
|
571 |
+
# print("EP%d_%s, " % (epoch, str_code))
|
572 |
+
|
573 |
+
def save(self, epoch, file_path="output/bert_fine_tuned_trained.model"):
|
574 |
+
"""
|
575 |
+
Saving the current BERT model on file_path
|
576 |
+
|
577 |
+
:param epoch: current epoch number
|
578 |
+
:param file_path: model output path; the saved file will be file_path + ".ep%d" % epoch
|
579 |
+
:return: final_output_path
|
580 |
+
"""
|
581 |
+
if self.finetune_task:
|
582 |
+
fpath = file_path.split("/")
|
583 |
+
output_path = fpath[0]+ "/"+ fpath[1]+f"/{self.finetune_task}/" + fpath[2] + ".ep%d" % epoch
|
584 |
+
else:
|
585 |
+
output_path = file_path + ".ep%d" % epoch
|
586 |
+
torch.save(self.model.cpu(), output_path)
|
587 |
+
self.model.to(self.device)
|
588 |
+
print("EP:%d Model Saved on:" % epoch, output_path)
|
589 |
+
return output_path
|
590 |
+
|
591 |
+
|
592 |
+
class BERTAttention:
|
593 |
+
def __init__(self, bert: BERT, vocab_obj, train_dataloader: DataLoader, workspace_name=None, code=None, finetune_task=None, with_cuda=True):
|
594 |
+
|
595 |
+
# available_gpus = list(range(torch.cuda.device_count()))
|
596 |
+
|
597 |
+
cuda_condition = torch.cuda.is_available() and with_cuda
|
598 |
+
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
599 |
+
print(with_cuda, cuda_condition, " Device used = ", self.device)
|
600 |
+
self.bert = bert.to(self.device)
|
601 |
+
|
602 |
+
# if with_cuda and torch.cuda.device_count() > 1:
|
603 |
+
# print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
604 |
+
# self.bert = nn.DataParallel(self.bert, device_ids=available_gpus)
|
605 |
+
|
606 |
+
self.train_dataloader = train_dataloader
|
607 |
+
self.workspace_name = workspace_name
|
608 |
+
self.code = code
|
609 |
+
self.finetune_task = finetune_task
|
610 |
+
self.vocab_obj = vocab_obj
|
611 |
+
|
612 |
+
def getAttention(self):
|
613 |
+
# self.log_file = f"{self.workspace_name}/logs/{self.code}/log_attention.txt"
|
614 |
+
|
615 |
+
|
616 |
+
labels = ['PercentChange', 'NumeratorQuantity2', 'NumeratorQuantity1', 'DenominatorQuantity1',
|
617 |
+
'OptionalTask_1', 'EquationAnswer', 'NumeratorFactor', 'DenominatorFactor',
|
618 |
+
'OptionalTask_2', 'FirstRow1:1', 'FirstRow1:2', 'FirstRow2:1', 'FirstRow2:2', 'SecondRow',
|
619 |
+
'ThirdRow', 'FinalAnswer','FinalAnswerDirection']
|
620 |
+
df_all = pd.DataFrame(0.0, index=labels, columns=labels)
|
621 |
+
# Setting the tqdm progress bar
|
622 |
+
data_iter = tqdm.tqdm(enumerate(self.train_dataloader),
|
623 |
+
desc="attention",
|
624 |
+
total=len(self.train_dataloader),
|
625 |
+
bar_format="{l_bar}{r_bar}")
|
626 |
+
count = 0
|
627 |
+
for i, data in data_iter:
|
628 |
+
data = {key: value.to(self.device) for key, value in data.items()}
|
629 |
+
a = self.bert.forward(data["bert_input"], data["segment_label"])
|
630 |
+
non_zero = np.sum(data["segment_label"].cpu().detach().numpy())
|
631 |
+
|
632 |
+
# Last Transformer Layer
|
633 |
+
last_layer = self.bert.attention_values[-1].transpose(1,0,2,3)
|
634 |
+
# print(last_layer.shape)
|
635 |
+
head, d_model, s, s = last_layer.shape
|
636 |
+
|
637 |
+
for d in range(d_model):
|
638 |
+
seq_labels = self.vocab_obj.to_sentence(data["bert_input"].cpu().detach().numpy().tolist()[d])[1:non_zero-1]
|
639 |
+
# df_all = pd.DataFrame(0.0, index=seq_labels, columns=seq_labels)
|
640 |
+
indices_to_choose = defaultdict(int)
|
641 |
+
|
642 |
+
for k,s in enumerate(seq_labels):
|
643 |
+
if s in labels:
|
644 |
+
indices_to_choose[s] = k
|
645 |
+
indices_chosen = list(indices_to_choose.values())
|
646 |
+
selected_seq_labels = [s for l,s in enumerate(seq_labels) if l in indices_chosen]
|
647 |
+
# print(len(seq_labels), len(selected_seq_labels))
|
648 |
+
for h in range(head):
|
649 |
+
# fig, ax = plt.subplots(figsize=(12, 12))
|
650 |
+
# seq_labels = self.vocab_obj.to_sentence(data["bert_input"].cpu().detach().numpy().tolist()[d])#[1:non_zero-1]
|
651 |
+
# seq_labels = self.vocab_obj.to_sentence(data["bert_input"].cpu().detach().numpy().tolist()[d])[1:non_zero-1]
|
652 |
+
# indices_to_choose = defaultdict(int)
|
653 |
+
|
654 |
+
# for k,s in enumerate(seq_labels):
|
655 |
+
# if s in labels:
|
656 |
+
# indices_to_choose[s] = k
|
657 |
+
# indices_chosen = list(indices_to_choose.values())
|
658 |
+
# selected_seq_labels = [s for l,s in enumerate(seq_labels) if l in indices_chosen]
|
659 |
+
# print(f"Chosen index: {seq_labels, indices_to_choose, indices_chosen, selected_seq_labels}")
|
660 |
+
|
661 |
+
df_cm = pd.DataFrame(last_layer[h][d][indices_chosen,:][:,indices_chosen], index = selected_seq_labels, columns = selected_seq_labels)
|
662 |
+
df_all = df_all.add(df_cm, fill_value=0)
|
663 |
+
count += 1
|
664 |
+
|
665 |
+
# df_cm = pd.DataFrame(last_layer[h][d][1:non_zero-1,:][:,1:non_zero-1], index=seq_labels, columns=seq_labels)
|
666 |
+
# df_all = df_all.add(df_cm, fill_value=0)
|
667 |
+
|
668 |
+
# df_all = df_all.reindex(index=seq_labels, columns=seq_labels)
|
669 |
+
# sns.heatmap(df_all, annot=False)
|
670 |
+
# plt.title("Attentions") #Probabilities
|
671 |
+
# plt.xlabel("Steps")
|
672 |
+
# plt.ylabel("Steps")
|
673 |
+
# plt.grid(True)
|
674 |
+
# plt.tick_params(axis='x', bottom=False, top=True, labelbottom=False, labeltop=True, labelrotation=90)
|
675 |
+
# plt.savefig(f"{self.workspace_name}/plots/{self.code}/{self.finetune_task}_attention_scores_over_[{h}]_head_n_data[{d}].png", bbox_inches='tight')
|
676 |
+
# plt.show()
|
677 |
+
# plt.close()
|
678 |
+
|
679 |
+
|
680 |
+
|
681 |
+
print(f"Count of total : {count, head * self.train_dataloader.dataset.len}")
|
682 |
+
df_all = df_all.div(count) # head * self.train_dataloader.dataset.len
|
683 |
+
df_all = df_all.reindex(index=labels, columns=labels)
|
684 |
+
sns.heatmap(df_all, annot=False)
|
685 |
+
plt.title("Attentions") #Probabilities
|
686 |
+
plt.xlabel("Steps")
|
687 |
+
plt.ylabel("Steps")
|
688 |
+
plt.grid(True)
|
689 |
+
plt.tick_params(axis='x', bottom=False, top=True, labelbottom=False, labeltop=True, labelrotation=90)
|
690 |
+
plt.savefig(f"{self.workspace_name}/plots/{self.code}/{self.finetune_task}_attention_scores.png", bbox_inches='tight')
|
691 |
+
plt.show()
|
692 |
+
plt.close()
|
693 |
+
|
694 |
+
|
695 |
+
|
696 |
+
|
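A minimal driver sketch for the fine-tuning trainer defined above, under stated assumptions: the vocab size, dataloaders, workspace path, task name, and save path below are hypothetical placeholders, and the dataloaders are assumed to yield dicts with "bert_input", "segment_label", and "progress_status" keys as iteration() expects (the {workspace}/logs/{task}/ directory must also exist).

```python
# Hypothetical driver for BERTFineTuneTrainer; the vocab size, dataloaders,
# workspace and task name are placeholders, not values from this repository.
from src.bert import BERT

vocab_size = 34                                   # placeholder
bert = BERT(vocab_size=vocab_size, hidden=768, n_layers=4, attn_heads=8, dropout=0.1)

trainer = BERTFineTuneTrainer(bert, vocab_size,
                              train_dataloader=train_loader,   # placeholder DataLoader
                              test_dataloader=val_loader,      # placeholder DataLoader
                              lr=1e-4, num_labels=2,
                              workspace_name="workspace",      # placeholder directory
                              finetune_task="progress")        # placeholder task name

for epoch in range(10):
    trainer.train(epoch)
    trainer.test(epoch)
    if trainer.save_model:       # set by iteration() when the test loss improves
        # save() splices finetune_task into the directory, so pass a path with
        # at least three components (placeholder path shown here).
        trainer.save(epoch, file_path="output/bert/fine_tuned.model")
```

save_model is flipped inside iteration() whenever the running loss improves on the previous best, which is why the epoch loop checks it before calling save().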
src/reference_code/test.py
ADDED
@@ -0,0 +1,493 @@
1 |
+
import torch
|
2 |
+
from torch import nn, optim
|
3 |
+
from torch.nn import functional as F
|
4 |
+
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
5 |
+
import numpy as np
|
6 |
+
from keras.preprocessing.sequence import pad_sequences
|
7 |
+
from transformers import BertTokenizer
|
8 |
+
from transformers import BertForSequenceClassification
|
9 |
+
import random
|
10 |
+
from sklearn.metrics import f1_score
|
11 |
+
from utils import *
|
12 |
+
import os
|
13 |
+
import argparse
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
import warnings
|
18 |
+
warnings.filterwarnings("ignore")
|
19 |
+
|
20 |
+
class ModelWithTemperature(nn.Module):
|
21 |
+
"""
|
22 |
+
A thin decorator, which wraps a model with temperature scaling
|
23 |
+
model (nn.Module):
|
24 |
+
A classification neural network
|
25 |
+
NB: Output of the neural network should be the classification logits,
|
26 |
+
NOT the softmax (or log softmax)!
|
27 |
+
"""
|
28 |
+
def __init__(self, model):
|
29 |
+
super(ModelWithTemperature, self).__init__()
|
30 |
+
self.model = model
|
31 |
+
self.temperature = nn.Parameter(torch.ones(1) * 1.5)
|
32 |
+
|
33 |
+
def forward(self, input_ids, token_type_ids, attention_mask):
|
34 |
+
logits = self.model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
|
35 |
+
return self.temperature_scale(logits)
|
36 |
+
|
37 |
+
def temperature_scale(self, logits):
|
38 |
+
"""
|
39 |
+
Perform temperature scaling on logits
|
40 |
+
"""
|
41 |
+
# Expand temperature to match the size of logits
|
42 |
+
temperature = self.temperature.unsqueeze(1).expand(logits.size(0), logits.size(1))
|
43 |
+
return logits / temperature
|
44 |
+
|
45 |
+
# This function probably should live outside of this class, but whatever
|
46 |
+
def set_temperature(self, valid_loader, args):
|
47 |
+
"""
|
48 |
+
Tune the temperature of the model (using the validation set).
|
49 |
+
We're going to set it to optimize NLL.
|
50 |
+
valid_loader (DataLoader): validation set loader
|
51 |
+
"""
|
52 |
+
nll_criterion = nn.CrossEntropyLoss()
|
53 |
+
ece_criterion = ECE().to(args.device)
|
54 |
+
|
55 |
+
# First: collect all the logits and labels for the validation set
|
56 |
+
logits_list = []
|
57 |
+
labels_list = []
|
58 |
+
with torch.no_grad():
|
59 |
+
for step, batch in enumerate(valid_loader):
|
60 |
+
batch = tuple(t.to(args.device) for t in batch)
|
61 |
+
b_input_ids, b_input_mask, b_labels = batch
|
62 |
+
logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
|
63 |
+
logits_list.append(logits)
|
64 |
+
labels_list.append(b_labels)
|
65 |
+
logits = torch.cat(logits_list)
|
66 |
+
labels = torch.cat(labels_list)
|
67 |
+
|
68 |
+
# Calculate NLL and ECE before temperature scaling
|
69 |
+
before_temperature_nll = nll_criterion(logits, labels).item()
|
70 |
+
before_temperature_ece = ece_criterion(logits, labels).item()
|
71 |
+
print('Before temperature - NLL: %.3f, ECE: %.3f' % (before_temperature_nll, before_temperature_ece))
|
72 |
+
|
73 |
+
# Next: optimize the temperature w.r.t. NLL
|
74 |
+
optimizer = optim.LBFGS([self.temperature], lr=0.01, max_iter=50)
|
75 |
+
|
76 |
+
def eval():
|
77 |
+
loss = nll_criterion(self.temperature_scale(logits), labels)
|
78 |
+
loss.backward()
|
79 |
+
return loss
|
80 |
+
optimizer.step(eval)
|
81 |
+
|
82 |
+
# Calculate NLL and ECE after temperature scaling
|
83 |
+
after_temperature_nll = nll_criterion(self.temperature_scale(logits), labels).item()
|
84 |
+
after_temperature_ece = ece_criterion(self.temperature_scale(logits), labels).item()
|
85 |
+
print('Optimal temperature: %.3f' % self.temperature.item())
|
86 |
+
print('After temperature - NLL: %.3f, ECE: %.3f' % (after_temperature_nll, after_temperature_ece))
|
87 |
+
|
88 |
+
return self
|
89 |
+
|
90 |
+
class ECE(nn.Module):
|
91 |
+
|
92 |
+
def __init__(self, n_bins=15):
|
93 |
+
"""
|
94 |
+
n_bins (int): number of confidence interval bins
|
95 |
+
"""
|
96 |
+
super(ECE, self).__init__()
|
97 |
+
bin_boundaries = torch.linspace(0, 1, n_bins + 1)
|
98 |
+
self.bin_lowers = bin_boundaries[:-1]
|
99 |
+
self.bin_uppers = bin_boundaries[1:]
|
100 |
+
|
101 |
+
def forward(self, logits, labels):
|
102 |
+
softmaxes = F.softmax(logits, dim=1)
|
103 |
+
confidences, predictions = torch.max(softmaxes, 1)
|
104 |
+
accuracies = predictions.eq(labels)
|
105 |
+
|
106 |
+
ece = torch.zeros(1, device=logits.device)
|
107 |
+
for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
|
108 |
+
# Calculate |confidence - accuracy| in each bin
|
109 |
+
in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
|
110 |
+
prop_in_bin = in_bin.float().mean()
|
111 |
+
if prop_in_bin.item() > 0:
|
112 |
+
accuracy_in_bin = accuracies[in_bin].float().mean()
|
113 |
+
avg_confidence_in_bin = confidences[in_bin].mean()
|
114 |
+
ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
|
115 |
+
|
116 |
+
return ece
|
117 |
+
|
118 |
+
|
119 |
+
class ECE_v2(nn.Module):
|
120 |
+
def __init__(self, n_bins=15):
|
121 |
+
"""
|
122 |
+
n_bins (int): number of confidence interval bins
|
123 |
+
"""
|
124 |
+
super(ECE_v2, self).__init__()
|
125 |
+
bin_boundaries = torch.linspace(0, 1, n_bins + 1)
|
126 |
+
self.bin_lowers = bin_boundaries[:-1]
|
127 |
+
self.bin_uppers = bin_boundaries[1:]
|
128 |
+
|
129 |
+
def forward(self, softmaxes, labels):
|
130 |
+
confidences, predictions = torch.max(softmaxes, 1)
|
131 |
+
accuracies = predictions.eq(labels)
|
132 |
+
ece = torch.zeros(1, device=softmaxes.device)
|
133 |
+
|
134 |
+
for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
|
135 |
+
# Calculate |confidence - accuracy| in each bin
|
136 |
+
in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
|
137 |
+
prop_in_bin = in_bin.float().mean()
|
138 |
+
if prop_in_bin.item() > 0:
|
139 |
+
accuracy_in_bin = accuracies[in_bin].float().mean()
|
140 |
+
avg_confidence_in_bin = confidences[in_bin].mean()
|
141 |
+
ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
|
142 |
+
return ece
|
143 |
+
|
144 |
+
def accurate_nb(preds, labels):
|
145 |
+
pred_flat = np.argmax(preds, axis=1).flatten()
|
146 |
+
labels_flat = labels.flatten()
|
147 |
+
return np.sum(pred_flat == labels_flat)
|
148 |
+
|
149 |
+
|
150 |
+
def set_seed(args):
|
151 |
+
random.seed(args.seed)
|
152 |
+
np.random.seed(args.seed)
|
153 |
+
torch.manual_seed(args.seed)
|
154 |
+
|
155 |
+
def apply_dropout(m):
|
156 |
+
if type(m) == nn.Dropout:
|
157 |
+
m.train()
|
158 |
+
|
159 |
+
|
160 |
+
def main():
|
161 |
+
|
162 |
+
parser = argparse.ArgumentParser(description='Test code - measure the detection performance')
|
163 |
+
parser.add_argument('--eva_iter', default=1, type=int, help='number of passes for mc-dropout when evaluation')
|
164 |
+
parser.add_argument('--model', type=str, choices=['base', 'manifold-smoothing', 'mc-dropout','temperature'], default='base')
|
165 |
+
parser.add_argument('--seed', type=int, default=0, help='random seed for test')
|
166 |
+
parser.add_argument("--epochs", default=10, type=int, help="Number of epochs for training.")
|
167 |
+
parser.add_argument('--index', type=int, default=0, help='random seed you used during training')
|
168 |
+
parser.add_argument('--in_dataset', required=True, help='target dataset: 20news')
|
169 |
+
parser.add_argument('--out_dataset', required=True, help='out-of-dist dataset')
|
170 |
+
parser.add_argument('--eval_batch_size', type=int, default=32)
|
171 |
+
parser.add_argument('--saved_dataset', type=str, default='n')
|
172 |
+
parser.add_argument('--eps_out', default=0.001, type=float, help="Perturbation size of out-of-domain adversarial training")
|
173 |
+
parser.add_argument("--eps_y", default=0.1, type=float, help="Perturbation size of label")
|
174 |
+
parser.add_argument('--eps_in', default=0.0001, type=float, help="Perturbation size of in-domain adversarial training")
|
175 |
+
|
176 |
+
args = parser.parse_args()
|
177 |
+
|
178 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
179 |
+
args.device = device
|
180 |
+
set_seed(args)
|
181 |
+
|
182 |
+
outf = 'test/'+args.model+'-'+str(args.index)
|
183 |
+
if not os.path.isdir(outf):
|
184 |
+
os.makedirs(outf)
|
185 |
+
|
186 |
+
if args.model == 'base':
|
187 |
+
dirname = '{}/BERT-base-{}'.format(args.in_dataset, args.index)
|
188 |
+
pretrained_dir = './model_save/{}'.format(dirname)
|
189 |
+
# Load a trained model and vocabulary that you have fine-tuned
|
190 |
+
model = BertForSequenceClassification.from_pretrained(pretrained_dir)
|
191 |
+
model.to(args.device)
|
192 |
+
print('Load Tokenizer')
|
193 |
+
|
194 |
+
elif args.model == 'mc-dropout':
|
195 |
+
dirname = '{}/BERT-base-{}'.format(args.in_dataset, args.index)
|
196 |
+
pretrained_dir = './model_save/{}'.format(dirname)
|
197 |
+
# Load a trained model and vocabulary that you have fine-tuned
|
198 |
+
model = BertForSequenceClassification.from_pretrained(pretrained_dir)
|
199 |
+
model.to(args.device)
|
200 |
+
|
201 |
+
elif args.model == 'temperature':
|
202 |
+
dirname = '{}/BERT-base-{}'.format(args.in_dataset, args.index)
|
203 |
+
pretrained_dir = './model_save/{}'.format(dirname)
|
204 |
+
orig_model = BertForSequenceClassification.from_pretrained(pretrained_dir)
|
205 |
+
orig_model.to(args.device)
|
206 |
+
model = ModelWithTemperature(orig_model)
|
207 |
+
model.to(args.device)
|
208 |
+
|
209 |
+
elif args.model == 'manifold-smoothing':
|
210 |
+
dirname = '{}/BERT-mf-{}-{}-{}-{}'.format(args.in_dataset, args.index, args.eps_in, args.eps_y, args.eps_out)
|
211 |
+
print(dirname)
|
212 |
+
pretrained_dir = './model_save/{}'.format(dirname)
|
213 |
+
model = BertForSequenceClassification.from_pretrained(pretrained_dir)
|
214 |
+
model.to(args.device)
|
215 |
+
|
216 |
+
|
217 |
+
if args.saved_dataset == 'n':
|
218 |
+
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
|
219 |
+
train_sentences, val_sentences, test_sentences, train_labels, val_labels, test_labels = load_dataset(args.in_dataset)
|
220 |
+
_, _, nt_test_sentences, _, _, nt_test_labels = load_dataset(args.out_dataset)
|
221 |
+
|
222 |
+
val_input_ids = []
|
223 |
+
test_input_ids = []
|
224 |
+
nt_test_input_ids = []
|
225 |
+
|
226 |
+
if args.in_dataset == '20news' or args.in_dataset == '20news-15':
|
227 |
+
MAX_LEN = 150
|
228 |
+
else:
|
229 |
+
MAX_LEN = 256
|
230 |
+
|
231 |
+
for sent in val_sentences:
|
232 |
+
encoded_sent = tokenizer.encode(
|
233 |
+
sent, # Sentence to encode.
|
234 |
+
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
|
235 |
+
truncation= True,
|
236 |
+
max_length = MAX_LEN, # Truncate all sentences.
|
237 |
+
#return_tensors = 'pt', # Return pytorch tensors.
|
238 |
+
)
|
239 |
+
# Add the encoded sentence to the list.
|
240 |
+
val_input_ids.append(encoded_sent)
|
241 |
+
|
242 |
+
|
243 |
+
for sent in test_sentences:
|
244 |
+
encoded_sent = tokenizer.encode(
|
245 |
+
sent, # Sentence to encode.
|
246 |
+
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
|
247 |
+
truncation= True,
|
248 |
+
max_length = MAX_LEN, # Truncate all sentences.
|
249 |
+
#return_tensors = 'pt', # Return pytorch tensors.
|
250 |
+
)
|
251 |
+
# Add the encoded sentence to the list.
|
252 |
+
test_input_ids.append(encoded_sent)
|
253 |
+
|
254 |
+
for sent in nt_test_sentences:
|
255 |
+
encoded_sent = tokenizer.encode(
|
256 |
+
sent,
|
257 |
+
add_special_tokens = True,
|
258 |
+
truncation= True,
|
259 |
+
max_length = MAX_LEN,
|
260 |
+
)
|
261 |
+
nt_test_input_ids.append(encoded_sent)
|
262 |
+
|
263 |
+
# Pad our input tokens
|
264 |
+
val_input_ids = pad_sequences(val_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
|
265 |
+
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
|
266 |
+
nt_test_input_ids = pad_sequences(nt_test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
|
267 |
+
|
268 |
+
val_attention_masks = []
|
269 |
+
test_attention_masks = []
|
270 |
+
nt_test_attention_masks = []
|
271 |
+
|
272 |
+
for seq in val_input_ids:
|
273 |
+
seq_mask = [float(i>0) for i in seq]
|
274 |
+
val_attention_masks.append(seq_mask)
|
275 |
+
for seq in test_input_ids:
|
276 |
+
seq_mask = [float(i>0) for i in seq]
|
277 |
+
test_attention_masks.append(seq_mask)
|
278 |
+
for seq in nt_test_input_ids:
|
279 |
+
seq_mask = [float(i>0) for i in seq]
|
280 |
+
nt_test_attention_masks.append(seq_mask)
|
281 |
+
|
282 |
+
|
283 |
+
val_inputs = torch.tensor(val_input_ids)
|
284 |
+
val_labels = torch.tensor(val_labels)
|
285 |
+
val_masks = torch.tensor(val_attention_masks)
|
286 |
+
|
287 |
+
test_inputs = torch.tensor(test_input_ids)
|
288 |
+
test_labels = torch.tensor(test_labels)
|
289 |
+
test_masks = torch.tensor(test_attention_masks)
|
290 |
+
|
291 |
+
nt_test_inputs = torch.tensor(nt_test_input_ids)
|
292 |
+
nt_test_labels = torch.tensor(nt_test_labels)
|
293 |
+
nt_test_masks = torch.tensor(nt_test_attention_masks)
|
294 |
+
|
295 |
+
val_data = TensorDataset(val_inputs, val_masks, val_labels)
|
296 |
+
test_data = TensorDataset(test_inputs, test_masks, test_labels)
|
297 |
+
nt_test_data = TensorDataset(nt_test_inputs, nt_test_masks, nt_test_labels)
|
298 |
+
|
299 |
+
dataset_dir = 'dataset/test'
|
300 |
+
if not os.path.exists(dataset_dir):
|
301 |
+
os.makedirs(dataset_dir)
|
302 |
+
torch.save(val_data, dataset_dir+'/{}_val_in_domain.pt'.format(args.in_dataset))
|
303 |
+
torch.save(test_data, dataset_dir+'/{}_test_in_domain.pt'.format(args.in_dataset))
|
304 |
+
torch.save(nt_test_data, dataset_dir+'/{}_test_out_of_domain.pt'.format(args.out_dataset))
|
305 |
+
|
306 |
+
else:
|
307 |
+
dataset_dir = 'dataset/test'
|
308 |
+
val_data = torch.load(dataset_dir+'/{}_val_in_domain.pt'.format(args.in_dataset))
|
309 |
+
test_data = torch.load(dataset_dir+'/{}_test_in_domain.pt'.format(args.in_dataset))
|
310 |
+
nt_test_data = torch.load(dataset_dir+'/{}_test_out_of_domain.pt'.format(args.out_dataset))
|
311 |
+
|
312 |
+
|
313 |
+
|
314 |
+
|
315 |
+
|
316 |
+
######## saved dataset
|
317 |
+
test_sampler = SequentialSampler(test_data)
|
318 |
+
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)
|
319 |
+
|
320 |
+
nt_test_sampler = SequentialSampler(nt_test_data)
|
321 |
+
nt_test_dataloader = DataLoader(nt_test_data, sampler=nt_test_sampler, batch_size=args.eval_batch_size)
|
322 |
+
val_sampler = SequentialSampler(val_data)
|
323 |
+
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=args.eval_batch_size)
|
324 |
+
|
325 |
+
if args.model == 'temperature':
|
326 |
+
model.set_temperature(val_dataloader, args)
|
327 |
+
|
328 |
+
model.eval()
|
329 |
+
|
330 |
+
if args.model == 'mc-dropout':
|
331 |
+
model.apply(apply_dropout)
|
332 |
+
|
333 |
+
correct = 0
|
334 |
+
total = 0
|
335 |
+
output_list = []
|
336 |
+
labels_list = []
|
337 |
+
|
338 |
+
##### validation data
|
339 |
+
with torch.no_grad():
|
340 |
+
for step, batch in enumerate(val_dataloader):
|
341 |
+
batch = tuple(t.to(args.device) for t in batch)
|
342 |
+
b_input_ids, b_input_mask, b_labels = batch
|
343 |
+
total += b_labels.shape[0]
|
344 |
+
batch_output = 0
|
345 |
+
for j in range(args.eva_iter):
|
346 |
+
if args.model == 'temperature':
|
347 |
+
current_batch = model(input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_mask) #logits
|
348 |
+
else:
|
349 |
+
current_batch = model(input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0] #logits
|
350 |
+
batch_output = batch_output + F.softmax(current_batch, dim=1)
|
351 |
+
batch_output = batch_output/args.eva_iter
|
352 |
+
output_list.append(batch_output)
|
353 |
+
labels_list.append(b_labels)
|
354 |
+
score, predicted = batch_output.max(1)
|
355 |
+
correct += predicted.eq(b_labels).sum().item()
|
356 |
+
|
357 |
+
###calculate accuracy and ECE
|
358 |
+
val_eval_accuracy = correct/total
|
359 |
+
print("Val Accuracy: {}".format(val_eval_accuracy))
|
360 |
+
ece_criterion = ECE_v2().to(args.device)
|
361 |
+
softmaxes_ece = torch.cat(output_list)
|
362 |
+
labels_ece = torch.cat(labels_list)
|
363 |
+
val_ece = ece_criterion(softmaxes_ece, labels_ece).item()
|
364 |
+
print('ECE on Val data: {}'.format(val_ece))
|
365 |
+
|
366 |
+
#### Test data
|
367 |
+
correct = 0
|
368 |
+
total = 0
|
369 |
+
output_list = []
|
370 |
+
labels_list = []
|
371 |
+
predict_list = []
|
372 |
+
true_list = []
|
373 |
+
true_list_ood = []
|
374 |
+
predict_mis = []
|
375 |
+
predict_in = []
|
376 |
+
score_list = []
|
377 |
+
correct_index_all = []
|
378 |
+
## test on in-distribution test set
|
379 |
+
with torch.no_grad():
|
380 |
+
for step, batch in enumerate(test_dataloader):
|
381 |
+
batch = tuple(t.to(args.device) for t in batch)
|
382 |
+
b_input_ids, b_input_mask, b_labels = batch
|
383 |
+
total += b_labels.shape[0]
|
384 |
+
batch_output = 0
|
385 |
+
for j in range(args.eva_iter):
|
386 |
+
if args.model == 'temperature':
|
387 |
+
current_batch = model(input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_mask) #logits
|
388 |
+
else:
|
389 |
+
current_batch = model(input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0] #logits
|
390 |
+
batch_output = batch_output + F.softmax(current_batch, dim=1)
|
391 |
+
batch_output = batch_output/args.eva_iter
|
392 |
+
output_list.append(batch_output)
|
393 |
+
labels_list.append(b_labels)
|
394 |
+
score, predicted = batch_output.max(1)
|
395 |
+
|
396 |
+
correct += predicted.eq(b_labels).sum().item()
|
397 |
+
|
398 |
+
correct_index = (predicted == b_labels)
|
399 |
+
correct_index_all.append(correct_index)
|
400 |
+
score_list.append(score)
|
401 |
+
|
402 |
+
### calculate accuracy
|
403 |
+
eval_accuracy = correct/total
|
404 |
+
print("Test Accuracy: {}".format(eval_accuracy))
|
405 |
+
|
406 |
+
##calculate ece
|
407 |
+
ece_criterion = ECE_v2().to(args.device)
|
408 |
+
softmaxes_ece = torch.cat(output_list)
|
409 |
+
labels_ece = torch.cat(labels_list)
|
410 |
+
ece = ece_criterion(softmaxes_ece, labels_ece).item()
|
411 |
+
print('ECE on Test data: {}'.format(ece))
|
412 |
+
|
413 |
+
#confidence for in-distribution data
|
414 |
+
score_in_array = torch.cat(score_list)
|
415 |
+
#indices of data that are classified correctly
|
416 |
+
correct_array = torch.cat(correct_index_all)
|
417 |
+
label_array = torch.cat(labels_list)
|
418 |
+
|
419 |
+
### test on out-of-distribution data
|
420 |
+
predict_ood = []
|
421 |
+
score_ood_list = []
|
422 |
+
true_list_ood = []
|
423 |
+
with torch.no_grad():
|
424 |
+
for step, batch in enumerate(nt_test_dataloader):
|
425 |
+
batch = tuple(t.to(args.device) for t in batch)
|
426 |
+
b_input_ids, b_input_mask, b_labels = batch
|
427 |
+
batch_output = 0
|
428 |
+
for j in range(args.eva_iter):
|
429 |
+
if args.model == 'temperature':
|
430 |
+
current_batch = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
|
431 |
+
else:
|
432 |
+
current_batch = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
|
433 |
+
batch_output = batch_output + F.softmax(current_batch, dim=1)
|
434 |
+
batch_output = batch_output/args.eva_iter
|
435 |
+
score_out, _ = batch_output.max(1)
|
436 |
+
|
437 |
+
score_ood_list.append(score_out)
|
438 |
+
|
439 |
+
score_ood_array = torch.cat(score_ood_list)
|
440 |
+
|
441 |
+
|
442 |
+
|
443 |
+
label_array = label_array.cpu().numpy()
|
444 |
+
score_ood_array = score_ood_array.cpu().numpy()
|
445 |
+
score_in_array = score_in_array.cpu().numpy()
|
446 |
+
correct_array = correct_array.cpu().numpy()
|
447 |
+
|
448 |
+
|
449 |
+
|
450 |
+
|
451 |
+
####### calculate NBAUCC for detection task
|
452 |
+
predict_o = np.zeros(len(score_in_array)+len(score_ood_array))
|
453 |
+
true_o = np.ones(len(score_in_array)+len(score_ood_array))
|
454 |
+
true_o[:len(score_in_array)] = 0 ## in-distribution data as false, ood data as positive
|
455 |
+
true_mis = np.ones(len(score_in_array))
|
456 |
+
true_mis[correct_array] = 0 ##true instances as false, misclassified instances as positive
|
457 |
+
predict_mis = np.zeros(len(score_in_array))
|
458 |
+
|
459 |
+
|
460 |
+
|
461 |
+
ood_sum = 0
|
462 |
+
mis_sum = 0
|
463 |
+
|
464 |
+
ood_sum_list = []
|
465 |
+
mis_sum_list = []
|
466 |
+
|
467 |
+
#### upper bound of the threshold tau for NBAUCC
|
468 |
+
stop_points = [0.50, 1.]
|
469 |
+
|
470 |
+
for threshold in np.arange(0., 1.01, 0.02):
|
471 |
+
predict_ood_index1 = (score_in_array < threshold)
|
472 |
+
predict_ood_index2 = (score_ood_array < threshold)
|
473 |
+
predict_ood_index = np.concatenate((predict_ood_index1, predict_ood_index2), axis=0)
|
474 |
+
predict_o[predict_ood_index] = 1
|
475 |
+
predict_mis[score_in_array<threshold] = 1
|
476 |
+
|
477 |
+
ood = f1_score(true_o, predict_o, average='binary') ##### detection f1 score for a specific threshold
|
478 |
+
mis = f1_score(true_mis, predict_mis, average='binary')
|
479 |
+
|
480 |
+
|
481 |
+
ood_sum += ood*0.02
|
482 |
+
mis_sum += mis*0.02
|
483 |
+
|
484 |
+
if threshold in stop_points:
|
485 |
+
ood_sum_list.append(ood_sum)
|
486 |
+
mis_sum_list.append(mis_sum)
|
487 |
+
|
488 |
+
for i in range(len(stop_points)):
|
489 |
+
print('OOD detection, NBAUCC {}: {}'.format(stop_points[i], ood_sum_list[i]/stop_points[i]))
|
490 |
+
print('misclassification detection, NBAUCC {}: {}'.format(stop_points[i], mis_sum_list[i]/stop_points[i]))
|
491 |
+
|
492 |
+
if __name__ == "__main__":
|
493 |
+
main()
|
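To make the calibration pieces above concrete, here is a minimal sketch that exercises the ECE module defined in this file on random toy logits (the shapes and values are made up; it assumes this file has been imported so that ECE is in scope):

```python
import torch

# Random toy logits/labels; 15 confidence bins as in the default constructor.
logits = torch.randn(128, 4)            # hypothetical (batch, num_classes) logits
labels = torch.randint(0, 4, (128,))    # hypothetical ground-truth class indices

ece_criterion = ECE(n_bins=15)
print("ECE on toy data: %.3f" % ece_criterion(logits, labels).item())
```

ECE_v2 takes already-softmaxed probabilities rather than logits, which is why main() feeds it the averaged softmax outputs collected over the evaluation passes.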
src/reference_code/utils.py
ADDED
@@ -0,0 +1,369 @@
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
from torch.nn import functional as F
|
4 |
+
import pandas as pd
|
5 |
+
from collections import Counter
|
6 |
+
import numpy as np
|
7 |
+
from sklearn.datasets import fetch_20newsgroups
|
8 |
+
from collections import Counter, defaultdict
|
9 |
+
from nltk.corpus import stopwords
|
10 |
+
from sklearn.model_selection import train_test_split
|
11 |
+
import re
|
12 |
+
from sklearn.utils import shuffle
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
def cos_dist(x, y):
|
17 |
+
## cosine distance function
|
18 |
+
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
|
19 |
+
batch_size = x.size(0)
|
20 |
+
c = torch.clamp(1 - cos(x.view(batch_size, -1), y.view(batch_size, -1)),
|
21 |
+
min=0)
|
22 |
+
return c.mean()
|
23 |
+
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
def tag_mapping(tags):
|
28 |
+
"""
|
29 |
+
Create a dictionary and a mapping of tags, sorted by frequency.
|
30 |
+
"""
|
31 |
+
#tags = [s[1] for s in dataset]
|
32 |
+
dico = Counter(tags)
|
33 |
+
tag_to_id, id_to_tag = create_mapping(dico)
|
34 |
+
print("Found %i unique named entity tags" % len(dico))
|
35 |
+
return dico, tag_to_id, id_to_tag
|
36 |
+
|
37 |
+
|
38 |
+
def create_mapping(dico):
|
39 |
+
"""
|
40 |
+
Create a mapping (item to ID / ID to item) from a dictionary.
|
41 |
+
Items are ordered by decreasing frequency.
|
42 |
+
"""
|
43 |
+
sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
|
44 |
+
id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
|
45 |
+
item_to_id = {v: k for k, v in id_to_item.items()}
|
46 |
+
return item_to_id, id_to_item
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
def clean_str(string):
|
52 |
+
"""
|
53 |
+
Tokenization/string cleaning for all datasets except for SST.
|
54 |
+
Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
|
55 |
+
"""
|
56 |
+
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
|
57 |
+
string = re.sub(r"\'s", " \'s", string)
|
58 |
+
string = re.sub(r"\'ve", " \'ve", string)
|
59 |
+
string = re.sub(r"n\'t", " n\'t", string)
|
60 |
+
string = re.sub(r"\'re", " \'re", string)
|
61 |
+
string = re.sub(r"\'d", " \'d", string)
|
62 |
+
string = re.sub(r"\'ll", " \'ll", string)
|
63 |
+
string = re.sub(r",", " , ", string)
|
64 |
+
string = re.sub(r"!", " ! ", string)
|
65 |
+
string = re.sub(r"\(", " \( ", string)
|
66 |
+
string = re.sub(r"\)", " \) ", string)
|
67 |
+
string = re.sub(r"\?", " \? ", string)
|
68 |
+
string = re.sub(r"\s{2,}", " ", string)
|
69 |
+
return string.strip().lower()
|
70 |
+
|
71 |
+
|
72 |
+
def clean_doc(x, word_freq):
|
73 |
+
stop_words = set(stopwords.words('english'))
|
74 |
+
clean_docs = []
|
75 |
+
most_commons = dict(word_freq.most_common(min(len(word_freq), 50000)))
|
76 |
+
for doc_content in x:
|
77 |
+
doc_words = []
|
78 |
+
cleaned = clean_str(doc_content.strip())
|
79 |
+
for word in cleaned.split():
|
80 |
+
if word not in stop_words and word_freq[word] >= 5:
|
81 |
+
if word in most_commons:
|
82 |
+
doc_words.append(word)
|
83 |
+
else:
|
84 |
+
doc_words.append("<UNK>")
|
85 |
+
doc_str = ' '.join(doc_words).strip()
|
86 |
+
clean_docs.append(doc_str)
|
87 |
+
return clean_docs
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
def load_dataset(dataset):
|
92 |
+
|
93 |
+
if dataset == 'sst':
|
94 |
+
df_train = pd.read_csv("./dataset/sst/SST-2/train.tsv", delimiter='\t', header=0)
|
95 |
+
|
96 |
+
df_val = pd.read_csv("./dataset/sst/SST-2/dev.tsv", delimiter='\t', header=0)
|
97 |
+
|
98 |
+
df_test = pd.read_csv("./dataset/sst/SST-2/sst-test.tsv", delimiter='\t', header=None, names=['sentence', 'label'])
|
99 |
+
|
100 |
+
train_sentences = df_train.sentence.values
|
101 |
+
val_sentences = df_val.sentence.values
|
102 |
+
test_sentences = df_test.sentence.values
|
103 |
+
train_labels = df_train.label.values
|
104 |
+
val_labels = df_val.label.values
|
105 |
+
test_labels = df_test.label.values
|
106 |
+
|
107 |
+
|
108 |
+
if dataset == '20news':
|
109 |
+
|
110 |
+
VALIDATION_SPLIT = 0.8
|
111 |
+
newsgroups_train = fetch_20newsgroups('dataset/20news', subset='train', shuffle=True, random_state=0)
|
112 |
+
print(newsgroups_train.target_names)
|
113 |
+
print(len(newsgroups_train.data))
|
114 |
+
|
115 |
+
newsgroups_test = fetch_20newsgroups('dataset/20news', subset='test', shuffle=False)
|
116 |
+
|
117 |
+
print(len(newsgroups_test.data))
|
118 |
+
|
119 |
+
train_len = int(VALIDATION_SPLIT * len(newsgroups_train.data))
|
120 |
+
|
121 |
+
train_sentences = newsgroups_train.data[:train_len]
|
122 |
+
val_sentences = newsgroups_train.data[train_len:]
|
123 |
+
test_sentences = newsgroups_test.data
|
124 |
+
train_labels = newsgroups_train.target[:train_len]
|
125 |
+
val_labels = newsgroups_train.target[train_len:]
|
126 |
+
test_labels = newsgroups_test.target
|
127 |
+
|
128 |
+
|
129 |
+
|
130 |
+
if dataset == '20news-15':
|
131 |
+
VALIDATION_SPLIT = 0.8
|
132 |
+
cats = ['alt.atheism',
|
133 |
+
'comp.graphics',
|
134 |
+
'comp.os.ms-windows.misc',
|
135 |
+
'comp.sys.ibm.pc.hardware',
|
136 |
+
'comp.sys.mac.hardware',
|
137 |
+
'comp.windows.x',
|
138 |
+
'rec.autos',
|
139 |
+
'rec.motorcycles',
|
140 |
+
'rec.sport.baseball',
|
141 |
+
'rec.sport.hockey',
|
142 |
+
'misc.forsale',
|
143 |
+
'sci.crypt',
|
144 |
+
'sci.electronics',
|
145 |
+
'sci.med',
|
146 |
+
'sci.space']
|
147 |
+
newsgroups_train = fetch_20newsgroups('dataset/20news', subset='train', shuffle=True, categories=cats, random_state=0)
|
148 |
+
print(newsgroups_train.target_names)
|
149 |
+
print(len(newsgroups_train.data))
|
150 |
+
|
151 |
+
newsgroups_test = fetch_20newsgroups('dataset/20news', subset='test', shuffle=False, categories=cats)
|
152 |
+
|
153 |
+
print(len(newsgroups_test.data))
|
154 |
+
|
155 |
+
train_len = int(VALIDATION_SPLIT * len(newsgroups_train.data))
|
156 |
+
|
157 |
+
train_sentences = newsgroups_train.data[:train_len]
|
158 |
+
val_sentences = newsgroups_train.data[train_len:]
|
159 |
+
test_sentences = newsgroups_test.data
|
160 |
+
train_labels = newsgroups_train.target[:train_len]
|
161 |
+
val_labels = newsgroups_train.target[train_len:]
|
162 |
+
test_labels = newsgroups_test.target
|
163 |
+
|
164 |
+
|
165 |
+
if dataset == '20news-5':
|
166 |
+
cats = [
|
167 |
+
'soc.religion.christian',
|
168 |
+
'talk.politics.guns',
|
169 |
+
'talk.politics.mideast',
|
170 |
+
'talk.politics.misc',
|
171 |
+
'talk.religion.misc']
|
172 |
+
|
173 |
+
newsgroups_test = fetch_20newsgroups('dataset/20news', subset='test', shuffle=False, categories=cats)
|
174 |
+
print(newsgroups_test.target_names)
|
175 |
+
print(len(newsgroups_test.data))
|
176 |
+
|
177 |
+
train_sentences = None
|
178 |
+
val_sentences = None
|
179 |
+
test_sentences = newsgroups_test.data
|
180 |
+
train_labels = None
|
181 |
+
val_labels = None
|
182 |
+
test_labels = newsgroups_test.target
|
183 |
+
|
184 |
+
if dataset == 'wos':
|
185 |
+
TESTING_SPLIT = 0.6
|
186 |
+
VALIDATION_SPLIT = 0.8
|
187 |
+
file_path = './dataset/WebOfScience/WOS46985/X.txt'
|
188 |
+
with open(file_path, 'r') as read_file:
|
189 |
+
x_temp = read_file.readlines()
|
190 |
+
x_all = []
|
191 |
+
for x in x_temp:
|
192 |
+
x_all.append(str(x))
|
193 |
+
|
194 |
+
print(len(x_all))
|
195 |
+
|
196 |
+
file_path = './dataset/WebOfScience/WOS46985/Y.txt'
|
197 |
+
with open(file_path, 'r') as read_file:
|
198 |
+
y_temp= read_file.readlines()
|
199 |
+
y_all = []
|
200 |
+
for y in y_temp:
|
201 |
+
y_all.append(int(y))
|
202 |
+
print(len(y_all))
|
203 |
+
print(max(y_all), min(y_all))
|
204 |
+
|
205 |
+
|
206 |
+
x_in = []
|
207 |
+
y_in = []
|
208 |
+
for i in range(len(x_all)):
|
209 |
+
x_in.append(x_all[i])
|
210 |
+
y_in.append(y_all[i])
|
211 |
+
|
212 |
+
|
213 |
+
train_val_len = int(TESTING_SPLIT * len(x_in))
|
214 |
+
train_len = int(VALIDATION_SPLIT * train_val_len)
|
215 |
+
|
216 |
+
train_sentences = x_in[:train_len]
|
217 |
+
val_sentences = x_in[train_len:train_val_len]
|
218 |
+
test_sentences = x_in[train_val_len:]
|
219 |
+
|
220 |
+
train_labels = y_in[:train_len]
|
221 |
+
val_labels = y_in[train_len:train_val_len]
|
222 |
+
        test_labels = y_in[train_val_len:]

        print(len(train_labels))
        print(len(val_labels))
        print(len(test_labels))


    if dataset == 'wos-100':
        TESTING_SPLIT = 0.6
        VALIDATION_SPLIT = 0.8
        file_path = './dataset/WebOfScience/WOS46985/X.txt'
        with open(file_path, 'r') as read_file:
            x_temp = read_file.readlines()
        x_all = []
        for x in x_temp:
            x_all.append(str(x))

        print(len(x_all))

        file_path = './dataset/WebOfScience/WOS46985/Y.txt'
        with open(file_path, 'r') as read_file:
            y_temp = read_file.readlines()
        y_all = []
        for y in y_temp:
            y_all.append(int(y))
        print(len(y_all))
        print(max(y_all), min(y_all))

        x_in = []
        y_in = []
        for i in range(len(x_all)):
            if y_all[i] in range(100):
                x_in.append(x_all[i])
                y_in.append(y_all[i])

        for i in range(133):
            num = 0
            for y in y_in:
                if y == i:
                    num = num + 1
            # print(num)

        train_val_len = int(TESTING_SPLIT * len(x_in))
        train_len = int(VALIDATION_SPLIT * train_val_len)

        train_sentences = x_in[:train_len]
        val_sentences = x_in[train_len:train_val_len]
        test_sentences = x_in[train_val_len:]

        train_labels = y_in[:train_len]
        val_labels = y_in[train_len:train_val_len]
        test_labels = y_in[train_val_len:]

        print(len(train_labels))
        print(len(val_labels))
        print(len(test_labels))

    if dataset == 'wos-34':
        TESTING_SPLIT = 0.6
        VALIDATION_SPLIT = 0.8
        file_path = './dataset/WebOfScience/WOS46985/X.txt'
        with open(file_path, 'r') as read_file:
            x_temp = read_file.readlines()
        x_all = []
        for x in x_temp:
            x_all.append(str(x))

        print(len(x_all))

        file_path = './dataset/WebOfScience/WOS46985/Y.txt'
        with open(file_path, 'r') as read_file:
            y_temp = read_file.readlines()
        y_all = []
        for y in y_temp:
            y_all.append(int(y))
        print(len(y_all))
        print(max(y_all), min(y_all))

        x_in = []
        y_in = []
        for i in range(len(x_all)):
            if y_all[i] not in range(100):
                x_in.append(x_all[i])
                y_in.append(y_all[i])

        for i in range(133):
            num = 0
            for y in y_in:
                if y == i:
                    num = num + 1
            # print(num)

        train_val_len = int(TESTING_SPLIT * len(x_in))
        train_len = int(VALIDATION_SPLIT * train_val_len)

        train_sentences = None
        val_sentences = None
        test_sentences = x_in[train_val_len:]

        train_labels = None
        val_labels = None
        test_labels = y_in[train_val_len:]

        print(len(test_labels))

    if dataset == 'agnews':

        VALIDATION_SPLIT = 0.8
        labels_in_domain = [1, 2]

        train_df = pd.read_csv('./dataset/agnews/train.csv', header=None)
        train_df.rename(columns={0: 'label', 1: 'title', 2: 'sentence'}, inplace=True)
        # train_df = pd.concat([train_df, pd.get_dummies(train_df['label'], prefix='label')], axis=1)
        print(train_df.dtypes)
        train_in_df_sentence = []
        train_in_df_label = []

        for i in range(len(train_df.sentence.values)):
            sentence_temp = ''.join(str(train_df.sentence.values[i]))
            train_in_df_sentence.append(sentence_temp)
            train_in_df_label.append(train_df.label.values[i] - 1)

        test_df = pd.read_csv('./dataset/agnews/test.csv', header=None)
        test_df.rename(columns={0: 'label', 1: 'title', 2: 'sentence'}, inplace=True)
        # test_df = pd.concat([test_df, pd.get_dummies(test_df['label'], prefix='label')], axis=1)
        test_in_df_sentence = []
        test_in_df_label = []
        for i in range(len(test_df.sentence.values)):
            test_in_df_sentence.append(str(test_df.sentence.values[i]))
            test_in_df_label.append(test_df.label.values[i] - 1)

        train_len = int(VALIDATION_SPLIT * len(train_in_df_sentence))

        train_sentences = train_in_df_sentence[:train_len]
        val_sentences = train_in_df_sentence[train_len:]
        test_sentences = test_in_df_sentence
        train_labels = train_in_df_label[:train_len]
        val_labels = train_in_df_label[train_len:]
        test_labels = test_in_df_label
        print(len(train_sentences))
        print(len(val_sentences))
        print(len(test_sentences))


    return train_sentences, val_sentences, test_sentences, train_labels, val_labels, test_labels
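For context, a minimal sketch of how the returned splits could be consumed downstream. The wrapper name `load_data` is a hypothetical stand-in for the helper defined earlier in this file; only the return contract shown above (three sentence lists plus three label lists, with `None` for missing splits) is assumed.

# Hypothetical usage sketch; `load_data` stands in for the data-loading helper above.
train_sentences, val_sentences, test_sentences, \
    train_labels, val_labels, test_labels = load_data('agnews')

for name, labels in [("train", train_labels), ("val", val_labels), ("test", test_labels)]:
    if labels is not None:  # the wos-34 branch returns None for train/val
        print(f"{name}: {len(labels)} examples, {len(set(labels))} classes")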
src/reference_code/visualization.py
ADDED
@@ -0,0 +1,78 @@
import numpy as np
#import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt

import metrics

class ConfidenceHistogram(metrics.MaxProbCELoss):

    def plot(self, output, labels, n_bins=15, logits=True, title=None):
        super().loss(output, labels, n_bins, logits)
        # scale each datapoint
        n = len(labels)
        w = np.ones(n) / n

        plt.rcParams["font.family"] = "serif"
        # size and axis limits
        plt.figure(figsize=(3, 3))
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.xticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0'])
        plt.yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0'])
        # plot grid
        plt.grid(color='tab:grey', linestyle=(0, (1, 5)), linewidth=1, zorder=0)
        # plot histogram
        plt.hist(self.confidences, n_bins, weights=w, color='b', range=(0.0, 1.0), edgecolor='k')

        # plot vertical dashed lines
        acc = np.mean(self.accuracies)
        conf = np.mean(self.confidences)
        plt.axvline(x=acc, color='tab:grey', linestyle='--', linewidth=3)
        plt.axvline(x=conf, color='tab:grey', linestyle='--', linewidth=3)
        if acc > conf:
            plt.text(acc + 0.03, 0.9, 'Accuracy', rotation=90, fontsize=11)
            plt.text(conf - 0.07, 0.9, 'Avg. Confidence', rotation=90, fontsize=11)
        else:
            plt.text(acc - 0.07, 0.9, 'Accuracy', rotation=90, fontsize=11)
            plt.text(conf + 0.03, 0.9, 'Avg. Confidence', rotation=90, fontsize=11)

        plt.ylabel('% of Samples', fontsize=13)
        plt.xlabel('Confidence', fontsize=13)
        plt.tight_layout()
        if title is not None:
            plt.title(title, fontsize=16)
        return plt

class ReliabilityDiagram(metrics.MaxProbCELoss):

    def plot(self, output, labels, n_bins=15, logits=True, title=None):
        super().loss(output, labels, n_bins, logits)

        # computations
        delta = 1.0 / n_bins
        x = np.arange(0, 1, delta)
        mid = np.linspace(delta / 2, 1 - delta / 2, n_bins)
        error = np.abs(np.subtract(mid, self.bin_acc))

        plt.rcParams["font.family"] = "serif"
        # size and axis limits
        plt.figure(figsize=(3, 3))
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        # plot grid
        plt.grid(color='tab:grey', linestyle=(0, (1, 5)), linewidth=1, zorder=0)
        # plot bars and identity line
        plt.bar(x, self.bin_acc, color='b', width=delta, align='edge', edgecolor='k', label='Outputs', zorder=5)
        plt.bar(x, error, bottom=np.minimum(self.bin_acc, mid), color='mistyrose', alpha=0.5, width=delta, align='edge', edgecolor='r', hatch='/', label='Gap', zorder=10)
        ident = [0.0, 1.0]
        plt.plot(ident, ident, linestyle='--', color='tab:grey', zorder=15)
        # labels and legend
        plt.ylabel('Accuracy', fontsize=13)
        plt.xlabel('Confidence', fontsize=13)
        plt.legend(loc='upper left', framealpha=1.0, fontsize='medium')
        if title is not None:
            plt.title(title, fontsize=16)
        plt.tight_layout()

        return plt
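A short usage sketch for the two plot classes above. It assumes the script is run from `src/reference_code/` so that the sibling `metrics` module resolves, that `metrics.MaxProbCELoss` needs no constructor arguments, and that `plot` accepts an (N, n_classes) array of logits plus integer class ids (as the `logits=True` flag suggests); the random data is purely illustrative.

import numpy as np
from visualization import ConfidenceHistogram, ReliabilityDiagram  # run from src/reference_code/

rng = np.random.default_rng(0)
logits = rng.normal(size=(256, 2))      # illustrative (N, n_classes) logits
labels = rng.integers(0, 2, size=256)   # illustrative integer class ids

ConfidenceHistogram().plot(logits, labels, n_bins=15, logits=True,
                           title="Confidence").savefig("confidence_histogram.png")
ReliabilityDiagram().plot(logits, labels, n_bins=15, logits=True,
                          title="Reliability").savefig("reliability_diagram.png")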
src/seq_model.py
CHANGED
@@ -1,6 +1,10 @@
 import torch.nn as nn
 
+<<<<<<< HEAD
+from .bert import BERT
+=======
 from bert import BERT
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 
 class BERTSM(nn.Module):
@@ -18,6 +22,12 @@ class BERTSM(nn.Module):
         super().__init__()
         self.bert = bert
         self.mask_lm = MaskedSequenceModel(self.bert.hidden, vocab_size)
+<<<<<<< HEAD
+
+    def forward(self, x, segment_label):
+        x = self.bert(x, segment_label)
+        return self.mask_lm(x), x[:, 0]
+=======
         self.same_student = SameStudentPrediction(self.bert.hidden)
 
     def forward(self, x, segment_label, pred=False):
@@ -28,6 +38,7 @@ class BERTSM(nn.Module):
             return x[:, 0], self.mask_lm(x), self.same_student(x)
         else:
             return x[:, 0], self.mask_lm(x)
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 
 class MaskedSequenceModel(nn.Module):
@@ -46,6 +57,9 @@ class MaskedSequenceModel(nn.Module):
         self.softmax = nn.LogSoftmax(dim=-1)
 
     def forward(self, x):
+<<<<<<< HEAD
+        return self.softmax(self.linear(x))
+=======
         return self.softmax(self.linear(x))
 
 
@@ -62,3 +76,4 @@ class SameStudentPrediction(nn.Module):
     def forward(self, x):
         return self.softmax(self.linear(x[:, 0]))
 
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
src/transformer.py
CHANGED
@@ -1,7 +1,12 @@
 import torch.nn as nn
 
+<<<<<<< HEAD
+from .attention import MultiHeadedAttention
+from .transformer_component import SublayerConnection, PositionwiseFeedForward
+=======
 from attention import MultiHeadedAttention
 from transformer_component import SublayerConnection, PositionwiseFeedForward
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 class TransformerBlock(nn.Module):
     """
@@ -25,6 +30,12 @@ class TransformerBlock(nn.Module):
         self.dropout = nn.Dropout(p=dropout)
 
     def forward(self, x, mask):
+<<<<<<< HEAD
+        attn_output, p_attn = self.attention.forward(x, x, x, mask=mask)
+        self.p_attn = p_attn.cpu().detach().numpy()
+        x = self.input_sublayer(x, lambda _x: attn_output)
+=======
         x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
         x = self.output_sublayer(x, self.feed_forward)
         return self.dropout(x)
src/vocab.py
CHANGED
@@ -1,9 +1,22 @@
 import collections
 import tqdm
+<<<<<<< HEAD
+import os
+from pathlib import Path
+
+head_directory = Path(__file__).resolve().parent.parent
+# print(head_directory)
+os.chdir(head_directory)
+=======
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 class Vocab(object):
     """
     Special tokens predefined in the vocab file are:
+<<<<<<< HEAD
+    -[PAD]
+=======
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
     -[UNK]
     -[MASK]
     -[CLS]
@@ -35,7 +48,11 @@ class Vocab(object):
         words = [self.invocab[index] if index < len(self.invocab)
                  else "[%d]" % index for index in seq]
 
+<<<<<<< HEAD
+        return words  # " ".join(words)
+=======
         return " ".join(words)
+>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 
 # if __init__ == "__main__":
test.py
ADDED
@@ -0,0 +1,8 @@
import subprocess

subprocess.run([
    "python", "new_test_saved_finetuned_model.py",
    "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
    "-finetune_task", "highGRschool10",
    "-finetuned_bert_classifier_checkpoint",
    "ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42"
])
test.txt
ADDED
The diff for this file is too large to render.
See raw diff
test_hint_fine_tuned.py
ADDED
@@ -0,0 +1,45 @@
import torch
from torch.utils.data import DataLoader
from src.vocab import Vocab
from src.dataset import TokenizerDataset
from hint_fine_tuning import CustomBERTModel
import argparse

def test_model(opt):
    print(f"Loading Vocab {opt.vocab_path}")
    vocab = Vocab(opt.vocab_path)
    vocab.load_vocab()

    print(f"Vocab Size: {len(vocab.vocab)}")

    test_dataset = TokenizerDataset(opt.test_dataset, opt.test_label, vocab, seq_len=50)  # Using sequence length 50
    print("Creating Dataloader")
    test_data_loader = DataLoader(test_dataset, batch_size=32, num_workers=4)

    # Load the entire fine-tuned model (including both architecture and weights)
    print(f"Loading Model from {opt.finetuned_bert_checkpoint}")
    model = torch.load(opt.finetuned_bert_checkpoint, map_location="cpu")

    print(f"Number of Labels: {opt.num_labels}")

    model.eval()
    for batch_idx, data in enumerate(test_data_loader):
        inputs = data["input"].to("cpu")
        segment_info = data["segment_label"].to("cpu")

        with torch.no_grad():
            logits = model(inputs, segment_info)

        print(f"Batch {batch_idx} logits: {logits}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("-t", "--test_dataset", type=str, default="/home/jupyter/bert/dataset/hint_based/ratio_proportion_change_3/er/er_test_dataset.csv", help="test set for evaluating fine-tuned model")
    parser.add_argument("-tlabel", "--test_label", type=str, default="/home/jupyter/bert/dataset/hint_based/ratio_proportion_change_3/er/test_infos_only.csv", help="label set for evaluating fine-tuned model")
    parser.add_argument("-c", "--finetuned_bert_checkpoint", type=str, default="/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/output/hint_classification/fine_tuned_model_2.pth", help="checkpoint of the saved fine-tuned BERT model")
    parser.add_argument("-v", "--vocab_path", type=str, default="/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/pretraining/vocab.txt", help="built vocab model path")
    parser.add_argument("-num_labels", type=int, default=2, help="Number of labels")

    opt = parser.parse_args()
    test_model(opt)
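Mirroring the subprocess pattern used in test.py above, a hedged sketch of how this script might be invoked; the angle-bracket paths are placeholders, not paths from this repository.

import subprocess

# Placeholder paths; substitute real dataset, label, checkpoint, and vocab locations.
subprocess.run([
    "python", "test_hint_fine_tuned.py",
    "-t", "<path to er_test_dataset.csv>",
    "-tlabel", "<path to test_infos_only.csv>",
    "-c", "<path to fine_tuned_model_2.pth>",
    "-v", "<path to vocab.txt>",
    "-num_labels", "2",
])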
test_saved_model.py
ADDED
@@ -0,0 +1,234 @@
# import torch.nn as nn
# import torch

import argparse

from torch.utils.data import DataLoader
import torch.nn as nn
from torch.optim import Adam, SGD
import torch
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

from src.pretrainer import BERTFineTuneTrainer1
from src.dataset import TokenizerDataset
from src.vocab import Vocab

import tqdm
import numpy as np

import time
from src.bert import BERT
from hint_fine_tuning import CustomBERTModel

# from vocab import Vocab

# class BERTForSequenceClassification(nn.Module):
#     """
#     Since its classification,
#     n_labels = 2
#     """

#     def __init__(self, vocab_size, n_labels, layers=None, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
#         super().__init__()
#         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#         print(device)
#         # model_ep0 = torch.load("output_1/bert_trained.model.ep0", map_location=device)
#         self.bert = torch.load("output_1/bert_trained.model.ep0", map_location=device)
#         self.dropout = nn.Dropout(dropout)
#         # add an output layer
#         self.

#     def forward(self, x, segment_info):
#         return x


class BERTFineTunedTrainer:

    def __init__(self, bert: CustomBERTModel, vocab_size: int,
                 train_dataloader: DataLoader = None, test_dataloader: DataLoader = None,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None, num_labels=2):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """
        self.device = "cpu"
        self.model = bert
        self.test_data = test_dataloader

        self.log_freq = log_freq
        self.workspace_name = workspace_name
        # print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto save the model every epoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        plabels = []
        tlabels = []
        logits_list = []
        labels_list = []
        positive_class_probs = []
        self.model.eval()

        for i, data in data_iter:
            data = {key: value.to(self.device) for key, value in data.items()}

            with torch.no_grad():
                h_rep, logits = self.model.forward(data["input"], data["segment_label"])
                # print(logits, logits.shape)
                logits_list.append(logits.cpu())
                labels_list.append(data["label"].cpu())

                probs = nn.Softmax(dim=-1)(logits)
                predicted_labels = torch.argmax(probs, dim=-1)
                true_labels = torch.argmax(data["label"], dim=-1)
                positive_class_probs.extend(probs[:, 1].cpu().numpy())
                plabels.extend(predicted_labels.cpu().numpy())
                tlabels.extend(true_labels.cpu().numpy())

                # print(">>>>>>>>>>>>>>", predicted_labels, true_labels)
                # Compare predicted labels to true labels and calculate accuracy
                correct = (predicted_labels == true_labels).sum().item()
                total_correct += correct
                total_element += data["label"].nelement()

        precisions = precision_score(tlabels, plabels, average="weighted")
        recalls = recall_score(tlabels, plabels, average="weighted")
        f1_scores = f1_score(tlabels, plabels, average="weighted")
        accuracy = total_correct * 100.0 / total_element
        auc_score = roc_auc_score(tlabels, positive_class_probs)

        final_msg = {
            "epoch": f"EP{epoch}_{str_code}",
            "accuracy": accuracy,
            "avg_loss": avg_loss / len(data_iter),
            "precisions": precisions,
            "recalls": recalls,
            "f1_scores": f1_scores,
            "auc_score": auc_score
        }

        print(final_msg)

        # print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", total_correct * 100.0 / total_element)


if __name__ == "__main__":
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # print(device)
    # is_model = torch.load("ratio_proportion_change4/output/bert_fine_tuned.IS.model.ep40", map_location=device)
    # learned_parameters = model_ep0.state_dict()

    # for param_name, param_tensor in learned_parameters.items():
    #     print(param_name)
    #     print(param_tensor)
    # # print(model_ep0.state_dict())
    # # model_ep0.add_module("out", nn.Linear(10,2))
    # # print(model_ep0)
    # seq_vocab = Vocab("pretraining/vocab_file.txt")
    # seq_vocab.load_vocab()
    # classifier = BERTForSequenceClassification(len(seq_vocab.vocab), 2)

    parser = argparse.ArgumentParser()

    parser.add_argument('-workspace_name', type=str, default="ratio_proportion_change3_1920")
    # parser.add_argument("-t", "--test_dataset", type=str, default="finetuning/before_June/train_in.txt", help="test set for evaluate fine tune train set")
    # parser.add_argument("-tlabel", "--test_label", type=str, default="finetuning/before_June/train_in_label.txt", help="test set for evaluate fine tune train set")
    # ##### change Checkpoint
    # parser.add_argument("-c", "--finetuned_bert_checkpoint", type=str, default="ratio_proportion_change3/output/before_June/bert_fine_tuned.FS.model.ep30", help="checkpoint of saved pretrained bert model")
    # parser.add_argument("-v", "--vocab_path", type=str, default="pretraining/vocab.txt", help="built vocab model path with bert-vocab")
    parser.add_argument("-t", "--test_dataset", type=str, default="/home/jupyter/bert/dataset/hint_based/ratio_proportion_change_3/er/er_test_dataset.csv", help="test set for evaluating fine-tuned model")
    parser.add_argument("-tlabel", "--test_label", type=str, default="/home/jupyter/bert/dataset/hint_based/ratio_proportion_change_3/er/test_infos_only.csv", help="label set for evaluating fine-tuned model")
    ##### change Checkpoint
    parser.add_argument("-c", "--finetuned_bert_checkpoint", type=str, default="/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/output/hint_classification/fine_tuned_model_2.pth", help="checkpoint of saved pretrained bert model")
    parser.add_argument("-v", "--vocab_path", type=str, default="/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/pretraining/vocab.txt", help="built vocab model path with bert-vocab")
    parser.add_argument("-num_labels", type=int, default=2, help="Number of labels")

    parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=100, help="maximum sequence length")

    parser.add_argument("-b", "--batch_size", type=int, default=32, help="number of batch_size")
    parser.add_argument("-e", "--epochs", type=int, default=1, help="number of epochs")
    # Use 50 for pretrain, and 10 for fine tune
    parser.add_argument("-w", "--num_workers", type=int, default=4, help="dataloader worker size")

    # Later run with cuda
    parser.add_argument("--with_cuda", type=bool, default=False, help="training with CUDA: true, or false")
    parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false")

    parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
    parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")

    args = parser.parse_args()
    for k, v in vars(args).items():
        if ('dataset' in k) or ('path' in k) or ('label' in k):
            if v:
                # setattr(args, f"{k}", args.workspace_name+"/"+v)
                print(f"args.{k} : {getattr(args, f'{k}')}")

    print("Loading Vocab", args.vocab_path)
    vocab_obj = Vocab(args.vocab_path)
    vocab_obj.load_vocab()
    print("Vocab Size: ", len(vocab_obj.vocab))
    print("Loading Test Dataset", args.test_dataset)
    test_dataset = TokenizerDataset(args.test_dataset, args.test_label, vocab_obj, seq_len=args.seq_len)
    print("Creating Dataloader")
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    bert = torch.load(args.finetuned_bert_checkpoint, map_location="cpu")
    num_labels = 2
    print(f"Number of Labels : {num_labels}")
    print("Creating BERT Fine Tune Trainer")
    trainer = BERTFineTuneTrainer1(bert, len(vocab_obj.vocab), train_dataloader=None, test_dataloader=test_data_loader,
                                   lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
                                   with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq,
                                   workspace_name=args.workspace_name, num_labels=args.num_labels)

    print("Testing Start....")
    start_time = time.time()
    for epoch in range(args.epochs):
        trainer.test(epoch)

    end_time = time.time()

    print("Time Taken to fine tune dataset = ", end_time - start_time)


# bert/ratio_proportion_change3_2223/sch_largest_100-coded/output/Opts/bert_fine_tuned.model.ep22
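The `logits_list` and `labels_list` collected in the evaluation loop above could feed the calibration plots defined in visualization.py below. A hedged sketch, assuming one-hot label tensors (as the `torch.argmax(data["label"], dim=-1)` call above suggests) and that the plot classes accept stacked numpy arrays of logits and integer labels.

import torch
from visualization import ReliabilityDiagram  # top-level module added below

# Assumes logits_list holds (batch, n_classes) tensors and labels_list holds
# one-hot label tensors, as collected in the iteration() method above.
all_logits = torch.cat(logits_list).numpy()
all_labels = torch.argmax(torch.cat(labels_list), dim=-1).numpy()

plt = ReliabilityDiagram().plot(all_logits, all_labels, n_bins=15, logits=True, title="Test calibration")
plt.savefig("reliability.png")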
visualization.py
ADDED
@@ -0,0 +1,78 @@
import numpy as np
#import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt

import metrics

class ConfidenceHistogram(metrics.MaxProbCELoss):

    def plot(self, output, labels, n_bins=15, logits=True, title=None):
        super().loss(output, labels, n_bins, logits)
        # scale each datapoint
        n = len(labels)
        w = np.ones(n) / n

        plt.rcParams["font.family"] = "serif"
        # size and axis limits
        plt.figure(figsize=(4, 3))
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.xticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0'])
        plt.yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0'])
        # plot grid
        plt.grid(color='tab:grey', linestyle=(0, (1, 5)), linewidth=1, zorder=0)
        # plot histogram
        plt.hist(self.confidences, n_bins, weights=w, color='b', range=(0.0, 1.0), edgecolor='k')

        # plot vertical dashed lines
        acc = np.mean(self.accuracies)
        conf = np.mean(self.confidences)
        plt.axvline(x=acc, color='tab:grey', linestyle='--', linewidth=3)
        plt.axvline(x=conf, color='tab:grey', linestyle='--', linewidth=3)
        if acc > conf:
            plt.text(acc + 0.03, 0.4, 'Accuracy', rotation=90, fontsize=11)
            plt.text(conf - 0.07, 0.4, 'Avg. Confidence', rotation=90, fontsize=11)
        else:
            plt.text(acc - 0.07, 0.4, 'Accuracy', rotation=90, fontsize=11)
            plt.text(conf + 0.03, 0.4, 'Avg. Confidence', rotation=90, fontsize=11)

        plt.ylabel('% of Samples', fontsize=13)
        plt.xlabel('Confidence', fontsize=13)
        plt.tight_layout()
        if title is not None:
            plt.title(title, fontsize=16)
        return plt

class ReliabilityDiagram(metrics.MaxProbCELoss):

    def plot(self, output, labels, n_bins=15, logits=True, title=None):
        super().loss(output, labels, n_bins, logits)

        # computations
        delta = 1.0 / n_bins
        x = np.arange(0, 1, delta)
        mid = np.linspace(delta / 2, 1 - delta / 2, n_bins)
        error = np.concatenate((np.zeros(shape=7), np.abs(np.subtract(mid[7:], self.bin_acc[7:]))))

        plt.rcParams["font.family"] = "serif"
        # size and axis limits
        plt.figure(figsize=(4, 4))
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        # plot grid
        plt.grid(color='tab:grey', linestyle=(0, (1, 5)), linewidth=1, zorder=0)
        # plot bars and identity line
        plt.bar(x, self.bin_acc, color='b', width=delta, align='edge', edgecolor='k', label='Outputs', zorder=5)
        plt.bar(x, error, bottom=np.minimum(self.bin_acc, mid), color='mistyrose', alpha=0.5, width=delta, align='edge', edgecolor='r', hatch='/', label='Gap', zorder=10)
        ident = [0.0, 1.0]
        plt.plot(ident, ident, linestyle='--', color='tab:grey', zorder=15)
        # labels and legend
        plt.ylabel('Accuracy', fontsize=13)
        plt.xlabel('Confidence', fontsize=13)
        plt.legend(loc='upper left', framealpha=1.0, fontsize='medium')
        if title is not None:
            plt.title(title, fontsize=16)
        plt.tight_layout()

        return plt