"""Predict document-level labels for Croatian news with the fine-tuned multitask model."""
import datasets
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset, DatasetDict
from config import epochs, batch_size, learning_rate
from model import tokenizer, multitask_model
from mtm import MultitaskTrainer, NLPDataCollator, DataLoaderWithTaskname
# from data_5_LT23 import features_dict, extra_feature_dict
from data_predict import convert_to_stsb_features, convert_to_features

features_dict = {}
extra_feature_dict = {}
sentinews_location = ""

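# Read the Croatian test set (tab-separated); only the raw text column
# "content" is needed for prediction.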
df_document_croatian_test = pd.read_csv(sentinews_location + "textlabel.tsv", sep="\t")
df_document_croatian_test = df_document_croatian_test[["content"]]

# Gather the splits into a single DatasetDict; only the test split is used here.
document = DatasetDict({
    # "train": Dataset.from_pandas(df_document_sl_hr_train),
    # "valid": Dataset.from_pandas(df_document_sl_hr_valid),
    "test": Dataset.from_pandas(df_document_croatian_test),
})

dataset_dict = {
    "document": document,
}

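# Sanity check: print the first test example of every task.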
for task_name, dataset in dataset_dict.items():
    print(task_name)
    print(dataset_dict[task_name]["test"][0])
    print()


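# Map every task to the function that converts raw examples into model features.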
convert_func_dict = {
    "document": convert_to_stsb_features,
    # "paragraph": convert_to_stsb_features,
    # "sentence": convert_to_stsb_features,
}

features_dict = convert_to_features(dataset_dict, convert_func_dict)

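# Fetch the fine-tuned checkpoint from the Hugging Face Hub (cached locally).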
from huggingface_hub import snapshot_download

snapshot_download(repo_id="FFZG-cleopatra/Croatian-News-Classifier")

# multitask_model.from_pretrained(, config="/media/gaurish/angela/projects/CroatianSlovenEnglishBert/i-got-u-brother-cleopatra-workshop/src/models/multitask_model_3ep/config.json")
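# Restore the fine-tuned multitask weights, then move the model to the GPU
# when one is available.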
multitask_model.load_state_dict(torch.load(
    "/home/gaurishthakkar/projects/i-got-u-brother-cleopatra-workshop/src/models/multitask_model_3ep/pytorch_model.bin",
    map_location="cpu",
))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multitask_model.to(device)
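# Single-example inference: run every test document through the
# document-classification head and collect the predicted class indices.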
predictions = []
multitask_model.eval()
for batch in features_dict["document"]["test"]:
    for key in batch:
        batch[key] = batch[key].to(device)

    task_model = multitask_model.get_model("document")
    with torch.no_grad():
        classifier_output = task_model(
            torch.unsqueeze(batch["input_ids"], 0),
            torch.unsqueeze(batch["attention_mask"], 0),
        )

    print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))
    prediction = torch.argmax(classifier_output.logits, axis=1)
    predictions.append(prediction.item())

pd.DataFrame({"original_predictions": predictions}).to_csv("eacl_slavic.tsv", sep="\t", index=False)


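# Evaluate again through the MultitaskTrainer so predictions are batched
# and collated per task.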
trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        learning_rate=learning_rate,
        output_dir="/tmp",
        do_train=False,
        do_eval=True,
        # evaluation_strategy="steps",
        # num_train_epochs=epochs,
        fp16=torch.cuda.is_available(),  # mixed precision only on GPU
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_steps=3000,
        # eval_steps=50,
        # load_best_model_at_end=True,  # not needed: no training is performed
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[],
)
print(features_dict["document"]["test"])
tests_dict = {}
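# Wrap each eval dataloader so every batch is tagged with its task name.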
for task_name in ["document"]: # "paragraph", "sentence"
    test_dataloader = DataLoaderWithTaskname(
        task_name,
        trainer.get_eval_dataloader(features_dict[task_name]["test"])
    )
    print(len(trainer.get_eval_dataloader(features_dict[task_name]["test"])))
    print(test_dataloader.data_loader.collate_fn)
    print(len(test_dataloader.data_loader))
    tests_dict[task_name] = trainer.prediction_loop(
        test_dataloader,
        description=f"Testing: {task_name}"
    )
print(tests_dict)
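# Macro-averaged precision, recall and F1 on the test split.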
for task_name in ["document",  ]: #"paragraph","sentence"
    for metric in ["precision", "recall", "f1"]:
        print("test {} {}:".format(metric, task_name),
              datasets.load_metric(metric,
                                   name="dev {} {}".format(metric, task_name)).compute(
                  predictions=np.argmax(
                      tests_dict[task_name].predictions, axis=1),
                  references=tests_dict[task_name].label_ids, average="macro"
              ))
print()