calculating metrics
- src/classifier.ipynb +120 -1
- src/utils.py +19 -0
src/classifier.ipynb
CHANGED
@@ -35,7 +35,9 @@
     "from sklearn.preprocessing import LabelEncoder\n",
     "import torch\n",
     "from transformers import AutoModel, AutoTokenizer\n",
-    "import xgboost as xgb"
+    "import xgboost as xgb\n",
+    "\n",
+    "from utils import evaluate_predictions"
    ]
   },
   {
@@ -910,6 +912,123 @@
     " learning_rate=0.1)\n",
     "xgb_model.fit(X_train, y_train_encoded)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's have a look at some metrics for our models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Logistic Regression\n",
+      "\n",
+      "Train set:\n",
+      "Accuracy: 0.3448095238095238\n",
+      "F1 macro average: 0.30283202516650803\n",
+      "F1 weighted average: 0.35980803167526537\n",
+      "--------------------------------------------------\n",
+      "Test set:\n",
+      "Accuracy: 0.2324\n",
+      "F1 macro average: 0.15894661492139023\n",
+      "F1 weighted average: 0.2680459740545796\n"
+     ]
+    }
+   ],
+   "source": [
+    "lr_train_preds = lr_model.predict(X_train)\n",
+    "lr_test_preds = lr_model.predict(X_test)\n",
+    "\n",
+    "evaluate_predictions(model=\"Logistic Regression\", \n",
+    " train_preds= lr_train_preds, y_train=y_train,\n",
+    " test_preds=lr_test_preds, y_test=y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Random Forest\n",
+      "\n",
+      "Train set:\n",
+      "Accuracy: 0.5817619047619048\n",
+      "F1 macro average: 0.6302920544396868\n",
+      "F1 weighted average: 0.5817320656440126\n",
+      "--------------------------------------------------\n",
+      "Test set:\n",
+      "Accuracy: 0.2188\n",
+      "F1 macro average: 0.11020842424166737\n",
+      "F1 weighted average: 0.2054551695522176\n"
+     ]
+    }
+   ],
+   "source": [
+    "rf_train_preds = rf_model.predict(X_train)\n",
+    "rf_test_preds = rf_model.predict(X_test)\n",
+    "\n",
+    "evaluate_predictions(model=\"Random Forest\", \n",
+    " train_preds= rf_train_preds, y_train=y_train,\n",
+    " test_preds=rf_test_preds, y_test=y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/mehdi/miniconda3/envs/adc/lib/python3.10/site-packages/xgboost/core.py:160: UserWarning: [12:38:31] WARNING: /home/conda/feedstock_root/build_artifacts/xgboost-split_1705650282415/work/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n",
+      "Potential solutions:\n",
+      "- Use a data structure that matches the device ordinal in the booster.\n",
+      "- Set the device for booster before call to inplace_predict.\n",
+      "\n",
+      "This warning will only be shown once.\n",
+      "\n",
+      " warnings.warn(smsg, UserWarning)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "XgBoost\n",
+      "\n",
+      "Train set:\n",
+      "Accuracy: 0.9998571428571429\n",
+      "F1 macro average: 0.9998485323510583\n",
+      "F1 weighted average: 0.9998571499183782\n",
+      "--------------------------------------------------\n",
+      "Test set:\n",
+      "Accuracy: 0.3552\n",
+      "F1 macro average: 0.13665190979200587\n",
+      "F1 weighted average: 0.288613804297705\n"
+     ]
+    }
+   ],
+   "source": [
+    "xgb_train_preds = xgb_model.predict(X_train)\n",
+    "xgb_test_preds = xgb_model.predict(X_test)\n",
+    "\n",
+    "evaluate_predictions(model=\"XgBoost\", \n",
+    " train_preds= xgb_train_preds, y_train=y_train_encoded,\n",
+    " test_preds=xgb_test_preds, y_test=y_test_encoded)"
+   ]
   }
  ],
  "metadata": {
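Note on the stderr output in the XGBoost cell above: the booster lives on cuda:0 while X_train/X_test are CPU arrays, so XGBoost falls back to a DMatrix-based prediction and warns about it, exactly as the "Potential solutions" text suggests. Below is a minimal sketch of one way to avoid the warning; it assumes xgb_model is this notebook's xgboost.XGBClassifier and that device="cuda" was set at training time (the training parameters are not visible in this hunk), so treat it as illustrative rather than part of the commit.

# Illustrative sketch, not part of this commit: avoiding the mismatched-devices
# warning seen in the XGBoost cell. Assumes xgb_model is the notebook's
# xgboost.XGBClassifier (trained with device="cuda") and that X_train / X_test
# are CPU NumPy arrays.

# Option 1: move the booster to the CPU for inference so it matches the inputs.
xgb_model.set_params(device="cpu")
xgb_train_preds = xgb_model.predict(X_train)
xgb_test_preds = xgb_model.predict(X_test)

# Option 2 (keeps inference on the GPU): hand XGBoost device-resident data,
# e.g. CuPy arrays, so no fallback is needed. Requires cupy to be installed.
# import cupy as cp
# xgb_model.set_params(device="cuda")
# xgb_test_preds = xgb_model.predict(cp.asarray(X_test))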
src/utils.py
CHANGED
@@ -0,0 +1,19 @@
+from sklearn.metrics import accuracy_score, f1_score
+
+
+def get_metrics(y_true, y_preds):
+    accuracy = accuracy_score(y_true, y_preds)
+    f1_macro = f1_score(y_true, y_preds, average="macro")
+    f1_weighted = f1_score(y_true, y_preds, average="weighted")
+    print(f"Accuracy: {accuracy}")
+    print(f"F1 macro average: {f1_macro}")
+    print(f"F1 weighted average: {f1_weighted}")
+
+
+def evaluate_predictions(model:str, train_preds, y_train, test_preds, y_test):
+    print(model)
+    print("\nTrain set:")
+    get_metrics(y_train, train_preds)
+    print("-"*50)
+    print("Test set:")
+    get_metrics(y_test, test_preds)
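For a quick sanity check of the new helpers outside the notebook, they can be exercised with placeholder labels. The snippet below is illustrative only; the label values are made up and are not data from this repo.

# Toy usage of the helpers added in src/utils.py; the labels are placeholders.
from utils import evaluate_predictions

y_train = ["news", "sport", "sport", "tech"]
train_preds = ["news", "sport", "tech", "tech"]
y_test = ["news", "sport", "tech"]
test_preds = ["news", "tech", "sport"]

# Prints accuracy, macro F1 and weighted F1 for the train and test splits.
evaluate_predictions(model="Toy example",
                     train_preds=train_preds, y_train=y_train,
                     test_preds=test_preds, y_test=y_test)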