Pranjal2041 committed · Commit 4014562 · Parent(s): 353ec7a

Initial Commit

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +4 -0
- .vscode/launch.json +13 -0
- ExamplesCreator.ipynb +116 -0
- Scrape.ipynb +0 -0
- amzn_examples.json +0 -0
- cleaned_code/Demo.ipynb +0 -0
- cleaned_code/DemoFast.ipynb +875 -0
- cleaned_code/bert_coil_map_dict_lemma255K_isotropic.json +0 -0
- cleaned_code/ckpt/Amzn13K/amzn_main_model.bin +3 -0
- cleaned_code/configs/PredsRemover.ipynb +149 -0
- cleaned_code/configs/ablation_amzn_1_coil.yml +85 -0
- cleaned_code/configs/ablation_amzn_1_descs.yml +89 -0
- cleaned_code/configs/ablation_amzn_1_hier.yml +85 -0
- cleaned_code/configs/ablation_amzn_1_relax.yml +86 -0
- cleaned_code/configs/ablation_amzn_eda.yml +81 -0
- cleaned_code/configs/ablation_amzn_eda_base.yml +85 -0
- cleaned_code/configs/ablation_amzn_eda_base2.yml +84 -0
- cleaned_code/configs/ablation_eurlex_1_base.yml +85 -0
- cleaned_code/configs/ablation_eurlex_1_coil.yml +88 -0
- cleaned_code/configs/ablation_eurlex_1_descs.yml +91 -0
- cleaned_code/configs/ablation_eurlex_1_hier_descs.yml +91 -0
- cleaned_code/configs/ablation_eurlex_1_hierarchy.yml +88 -0
- cleaned_code/configs/ablation_eurlex_1_relax.yml +86 -0
- cleaned_code/configs/ablation_eurlex_eda.yml +82 -0
- cleaned_code/configs/amzn13k_active_hfwnet.yml +79 -0
- cleaned_code/configs/amzn13k_active_highfreq.yml +87 -0
- cleaned_code/configs/amzn13k_active_random.yml +81 -0
- cleaned_code/configs/amzn13k_active_wnet.yml +79 -0
- cleaned_code/configs/amzn13k_active_wnet2.yml +86 -0
- cleaned_code/configs/amzn13k_baseline.yml +73 -0
- cleaned_code/configs/amzn13k_baseline_descs.yml +81 -0
- cleaned_code/configs/amzn13k_baseline_descs_edaaug.yml +75 -0
- cleaned_code/configs/amzn13k_baseline_descs_fullsup.yml +74 -0
- cleaned_code/configs/amzn13k_baseline_descs_masked_0.0.yml +75 -0
- cleaned_code/configs/amzn13k_baseline_descs_masked_0.2.yml +75 -0
- cleaned_code/configs/amzn13k_baseline_descs_masked_0.5.yml +75 -0
- cleaned_code/configs/amzn13k_baseline_descs_masked_0.9.yml +75 -0
- cleaned_code/configs/amzn13k_baseline_descs_merge.yml +76 -0
- cleaned_code/configs/amzn13k_baseline_fs.yml +80 -0
- cleaned_code/configs/amzn13k_baseline_fs2.yml +80 -0
- cleaned_code/configs/amzn13k_baseline_fs5.yml +80 -0
- cleaned_code/configs/amzn13k_baseline_hierdescs.yml +84 -0
- cleaned_code/configs/amzn13k_baseline_hierdescs_seen.yml +82 -0
- cleaned_code/configs/baseline.yml +52 -0
- cleaned_code/configs/eurlex4.3k_baseline.yml +87 -0
- cleaned_code/configs/eurlex4.3k_baseline2.yml +84 -0
- cleaned_code/configs/eurlex4.3k_baseline_fs.yml +90 -0
- cleaned_code/configs/eurlex4.3k_baseline_fs20.yml +90 -0
- cleaned_code/configs/eurlex4.3k_baseline_fs5.yml +78 -0
- cleaned_code/configs/eurlex4.3k_baseline_nl.yml +88 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+cleaned_code/temp_file.pkl
+cleaned_code/precomputed/Amzn13K/amzn_base_labels_data2.pkl
+cleaned_code/precomputed/Amzn13K/amzn_base_labels_data3.pkl
+__pycache__
.vscode/launch.json ADDED
@@ -0,0 +1,13 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": true
+        }
+    ]
+}
ExamplesCreator.ipynb ADDED
@@ -0,0 +1,116 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/n/fs/nlp-pranjal\n"
+     ]
+    }
+   ],
+   "source": [
+    "%cd ../../../"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/n/fs/nlp-pranjal\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K\n"
+     ]
+    }
+   ],
+   "source": [
+    "%cd SemSup-LMLC/training/datasets/Amzn13K"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "td = [json.loads(x) for x in open('test.jsonl')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "examples = np.random.choice(td, 100, replace=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json.dump(list(examples), open('amzn_examples.json','w'), indent=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "90fcbf6f06d9a30c70fdaff45e14c5534421a599dc22a7267c486c9cb67dea6d"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
Scrape.ipynb ADDED
The diff for this file is too large to render. See raw diff

amzn_examples.json ADDED
The diff for this file is too large to render. See raw diff

cleaned_code/Demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
cleaned_code/DemoFast.ipynb ADDED
@@ -0,0 +1,875 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import numpy as np\n",
+    "import pickle\n",
+    "import h5py\n",
+    "from tqdm import tqdm\n",
+    "from transformers import AutoTokenizer\n",
+    "from scipy.special import expit "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_tok_score_cart(doc_reps, doc_input_ids, qry_reps, qry_input_ids, qry_attention_mask):\n",
+    "    qry_input_ids = qry_input_ids.unsqueeze(2).unsqueeze(3)  # Q * LQ * 1 * 1\n",
+    "    doc_input_ids = doc_input_ids.unsqueeze(0).unsqueeze(1)  # 1 * 1 * D * LD\n",
+    "    exact_match = doc_input_ids == qry_input_ids  # Q * LQ * D * LD\n",
+    "    exact_match = exact_match.float()\n",
+    "    scores_no_masking = torch.matmul(\n",
+    "        qry_reps.view(-1, 16),  # (Q * LQ) * d\n",
+    "        doc_reps.view(-1, 16).transpose(0, 1)  # d * (D * LD)\n",
+    "    )\n",
+    "    scores_no_masking = scores_no_masking.view(\n",
+    "        *qry_reps.shape[:2], *doc_reps.shape[:2])  # Q * LQ * D * LD\n",
+    "    scores, _ = (scores_no_masking * exact_match).max(dim=3)  # Q * LQ * D\n",
+    "    tok_scores = (scores * qry_attention_mask.reshape(-1, qry_attention_mask.shape[-1]).unsqueeze(2))[:, 1:].sum(1)\n",
+    "\n",
+    "    return tok_scores\n",
+    "\n",
+    "import torch\n",
+    "from typing import Optional\n",
+    "def coil_fast_eval_forward(\n",
+    "    input_ids: Optional[torch.Tensor] = None,\n",
+    "    doc_reps = None,\n",
+    "    logits: Optional[torch.Tensor] = None,\n",
+    "    desc_input_ids = None,\n",
+    "    desc_attention_mask = None,\n",
+    "    lab_reps = None,\n",
+    "    label_embeddings = None\n",
+    "):\n",
+    "    tok_scores = compute_tok_score_cart(\n",
+    "        doc_reps, input_ids,\n",
+    "        lab_reps, desc_input_ids.reshape(-1, desc_input_ids.shape[-1]), desc_attention_mask\n",
+    "    )\n",
+    "    logits = (logits.unsqueeze(0) @ label_embeddings.T)\n",
+    "    new_tok_scores = torch.zeros(logits.shape, device = logits.device)\n",
+    "    for i in range(tok_scores.shape[1]):\n",
+    "        stride = tok_scores.shape[0]//tok_scores.shape[1]\n",
+    "        new_tok_scores[i] = tok_scores[i*stride: i*stride + stride ,i]\n",
+    "    return (logits + new_tok_scores).squeeze()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_list = [x.strip() for x in open('datasets/Amzn13K/all_labels.txt')]\n",
+    "unseen_label_list = [x.strip() for x in open('datasets/Amzn13K/unseen_labels_split6500_2.txt')]\n",
+    "num_labels = len(label_list)\n",
+    "label_list.sort()  # For consistency\n",
+    "l2i = {v: i for i, v in enumerate(label_list)}\n",
+    "unseen_label_indexes = [l2i[x] for x in unseen_label_list]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "coil_cluster_map = json.load(open('bert_coil_map_dict_lemma255K_isotropic.json')) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_preds = pickle.load(open('/n/fs/nlp-pranjal/SemSup-LMLC/training/ablation_amzn_1_main_labels_zsl.pkl','rb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_preds = pickle.load(open('/n/fs/scratch/pranjal/seed_experiments/ablation_amzn_eda_labels_zsl_seed2.pkl','rb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 13330/13330 [00:00<00:00, 64680.71it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_lab_reps, all_label_embeddings, all_desc_input_ids, all_desc_attention_mask = [], [], [], []\n",
+    "for l in tqdm(label_list):\n",
+    "    ll = label_preds[l]\n",
+    "    lab_reps, label_embeddings, desc_input_ids, desc_attention_mask = ll[np.random.randint(len(ll))] \n",
+    "    all_lab_reps.append(lab_reps.squeeze())\n",
+    "    all_label_embeddings.append(label_embeddings.squeeze())\n",
+    "    all_desc_input_ids.append(desc_input_ids.squeeze())\n",
+    "    all_desc_attention_mask.append(desc_attention_mask.squeeze())\n",
+    "all_lab_reps = torch.stack(all_lab_reps).cpu()\n",
+    "all_label_embeddings = torch.stack(all_label_embeddings).cpu()\n",
+    "all_desc_input_ids = torch.stack(all_desc_input_ids).cpu()\n",
+    "all_desc_attention_mask = torch.stack(all_desc_attention_mask).cpu()\n",
+    "all_desc_input_ids_clus = torch.tensor([[coil_cluster_map[str(x.item())] for x in xx] for xx in all_desc_input_ids])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickle.dump([all_lab_reps, all_label_embeddings, all_desc_input_ids, all_desc_input_ids_clus, all_desc_attention_mask], open('precomputed/Amzn13K/amzn_base_labels_data1_4.pkl','wb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_lab_reps1, all_label_embeddings1, _, all_desc_input_ids1, all_desc_attention_mask1 = pickle.load(open('precomputed/Amzn13K/amzn_base_labels_data1.pkl','rb'))\n",
+    "all_lab_reps2, all_label_embeddings2, _, all_desc_input_ids2, all_desc_attention_mask2 = pickle.load(open('precomputed/Amzn13K/amzn_base_labels_data2.pkl','rb'))\n",
+    "all_lab_reps3, all_label_embeddings3, _, all_desc_input_ids3, all_desc_attention_mask3 = pickle.load(open('precomputed/Amzn13K/amzn_base_labels_data3.pkl','rb'))\n",
+    "\n",
+    "\n",
+    "all_lab_reps = [all_lab_reps1.to(device), all_lab_reps2.to(device), all_lab_reps3.to(device)]\n",
+    "all_label_embeddings = [all_label_embeddings1.to(device), all_label_embeddings2.to(device), all_label_embeddings3.to(device)]\n",
+    "all_desc_input_ids = [all_desc_input_ids1.to(device), all_desc_input_ids2.to(device), all_desc_input_ids3.to(device)]\n",
+    "all_desc_attention_mask = [all_desc_attention_mask1.to(device), all_desc_attention_mask2.to(device), all_desc_attention_mask3.to(device)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Yaml Config is:\n",
+      "--------------------------------------------------------------------------------\n",
+      "{'task_name': 'amazon13k', 'dataset_name': 'amazon13k', 'dataset_config_name': None, 'max_seq_length': 160, 'overwrite_output_dir': False, 'overwrite_cache': False, 'pad_to_max_length': True, 'load_from_local': True, 'max_train_samples': None, 'max_eval_samples': 15000, 'max_predict_samples': None, 'train_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/train_split6500_2.jsonl', 'validation_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl', 'test_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl', 'label_max_seq_length': 160, 'descriptions_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_edaaug.json', 'test_descriptions_file': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3.json', 'all_labels': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/all_labels.txt', 'test_labels': '/n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/unseen_labels_split6500_2.txt', 'contrastive_learning_samples': 1000, 'cl_min_positive_descs': 1, 'coil_cluster_mapping_path': 'bert_coil_map_dict_lemma255K_isotropic.json', 'model_name_or_path': 'bert-base-uncased', 'config_name': None, 'tokenizer_name': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'model_revision': 'main', 'use_auth_token': False, 'ignore_mismatched_sizes': False, 'negative_sampling': 'none', 'semsup': True, 'label_model_name_or_path': 'prajjwal1/bert-small', 'encoder_model_type': 'bert', 'use_custom_optimizer': 'adamw', 'output_learning_rate': 0.0001, 'arch_type': 2, 'add_label_name': True, 'normalize_embeddings': False, 'tie_weights': False, 'coil': True, 'colbert': False, 'token_dim': 16, 'label_frozen_layers': 2, 'do_train': True, 'do_eval': True, 'do_predict': False, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 8, 'per_device_eval_batch_size': 1, 'learning_rate': 5e-05, 'num_train_epochs': 2, 'save_steps': 4900, 'evaluation_strategy': 'steps', 'eval_steps': 3000000, 'fp16': True, 'fp16_opt_level': 'O1', 'lr_scheduler_type': 'linear', 'dataloader_num_workers': 16, 'label_names': ['labels'], 'scenario': 'unseen_labels', 'ddp_find_unused_parameters': False, 'ignore_data_skip': True, 'seed': -1, 'EXP_NAME': 'semsup_descs_100ep_newds_cosine', 'EXP_DESC': 'SemSup Descriptions ran for 100 epochs', 'output_dir': 'demo_tmp'}\n",
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Config is BertConfig {\n",
+      "  \"_name_or_path\": \"bert-base-uncased\",\n",
+      "  \"arch_type\": 2,\n",
+      "  \"architectures\": [\n",
+      "    \"BertForMaskedLM\"\n",
+      "  ],\n",
+      "  \"attention_probs_dropout_prob\": 0.1,\n",
+      "  \"classifier_dropout\": null,\n",
+      "  \"coil\": true,\n",
+      "  \"colbert\": false,\n",
+      "  \"encoder_model_type\": \"bert\",\n",
+      "  \"finetuning_task\": \"amazon13k\",\n",
+      "  \"gradient_checkpointing\": false,\n",
+      "  \"hidden_act\": \"gelu\",\n",
+      "  \"hidden_dropout_prob\": 0.1,\n",
+      "  \"hidden_size\": 768,\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"intermediate_size\": 3072,\n",
+      "  \"label_hidden_size\": 512,\n",
+      "  \"layer_norm_eps\": 1e-12,\n",
+      "  \"max_position_embeddings\": 512,\n",
+      "  \"model_name_or_path\": \"bert-base-uncased\",\n",
+      "  \"model_type\": \"bert\",\n",
+      "  \"negative_sampling\": \"none\",\n",
+      "  \"num_attention_heads\": 12,\n",
+      "  \"num_hidden_layers\": 12,\n",
+      "  \"pad_token_id\": 0,\n",
+      "  \"position_embedding_type\": \"absolute\",\n",
+      "  \"problem_type\": \"multi_label_classification\",\n",
+      "  \"semsup\": true,\n",
+      "  \"token_dim\": 16,\n",
+      "  \"transformers_version\": \"4.20.0\",\n",
+      "  \"type_vocab_size\": 2,\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 30522\n",
+      "}\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src import BertForSemanticEmbedding, getLabelModel\n",
+    "from src import DataTrainingArguments, ModelArguments, CustomTrainingArguments, read_yaml_config\n",
+    "from src import dataset_classification_type\n",
+    "from src import SemSupDataset\n",
+    "from transformers import AutoConfig, HfArgumentParser, AutoTokenizer\n",
+    "import torch\n",
+    "\n",
+    "import json\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "ARGS_FILE = 'configs/ablation_amzn_eda.yml'\n",
+    "\n",
+    "parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments))\n",
+    "model_args, data_args, training_args = parser.parse_dict(read_yaml_config(ARGS_FILE, output_dir = 'demo_tmp', extra_args = {}))\n",
+    "\n",
+    "config = AutoConfig.from_pretrained(\n",
+    "    model_args.config_name if model_args.config_name else model_args.model_name_or_path,\n",
+    "    finetuning_task=data_args.task_name,\n",
+    "    cache_dir=model_args.cache_dir,\n",
+    "    revision=model_args.model_revision,\n",
+    "    use_auth_token=True if model_args.use_auth_token else None,\n",
+    ")\n",
+    "\n",
+    "config.model_name_or_path = model_args.model_name_or_path\n",
+    "config.problem_type = dataset_classification_type[data_args.task_name]\n",
+    "config.negative_sampling = model_args.negative_sampling\n",
+    "config.semsup = model_args.semsup\n",
+    "config.encoder_model_type = model_args.encoder_model_type\n",
+    "config.arch_type = model_args.arch_type\n",
+    "config.coil = model_args.coil\n",
+    "config.token_dim = model_args.token_dim\n",
+    "config.colbert = model_args.colbert\n",
+    "\n",
+    "label_model, label_tokenizer = getLabelModel(data_args, model_args)\n",
+    "config.label_hidden_size = label_model.config.hidden_size\n",
+    "model = BertForSemanticEmbedding(config)\n",
+    "model.label_model = label_model\n",
+    "model.label_tokenizer = label_tokenizer\n",
+    "model.config.label2id = {l: i for i, l in enumerate(label_list)}\n",
+    "model.config.id2label = {id: label for label, id in config.label2id.items()}\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "BertForSemanticEmbedding(\n",
+       "  (encoder): BertModel(\n",
+       "    (embeddings): BertEmbeddings(\n",
+       "      (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
+       "      (position_embeddings): Embedding(512, 768)\n",
+       "      (token_type_embeddings): Embedding(2, 768)\n",
+       "      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "      (dropout): Dropout(p=0.1, inplace=False)\n",
+       "    )\n",
+       "    (encoder): BertEncoder(\n",
+       "      (layer): ModuleList(\n",
+       "        (0): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       [repr lines for (1) through (11), eleven further BertLayer blocks identical to (0), elided]
+       "      )\n",
+       "    )\n",
+       "    (pooler): BertPooler(\n",
+       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "      (activation): Tanh()\n",
+       "    )\n",
+       "  )\n",
+       "  (tok_proj): Linear(in_features=768, out_features=16, bias=True)\n",
+       "  (dropout): Dropout(p=0.1, inplace=False)\n",
+       "  (label_projection): Linear(in_features=768, out_features=512, bias=False)\n",
+       "  (label_model): BertModel(\n",
+       "    (embeddings): BertEmbeddings(\n",
+       "      (word_embeddings): Embedding(30522, 512, padding_idx=0)\n",
+       "      (position_embeddings): Embedding(512, 512)\n",
+       "      (token_type_embeddings): Embedding(2, 512)\n",
+       "      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
+       "      (dropout): Dropout(p=0.1, inplace=False)\n",
+       "    )\n",
+       "    (encoder): BertEncoder(\n",
+       "      (layer): ModuleList(\n",
+       "        (0): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=512, out_features=512, bias=True)\n",
+       "              (key): Linear(in_features=512, out_features=512, bias=True)\n",
+       "              (value): Linear(in_features=512, out_features=512, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=512, out_features=512, bias=True)\n",
+       "              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=512, out_features=2048, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=2048, out_features=512, bias=True)\n",
+       "            (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       [repr lines for (1) through (3), three further BertLayer blocks identical to (0), elided]
+       "      )\n",
+       "    )\n",
+       "    (pooler): BertPooler(\n",
+       "      (dense): Linear(in_features=512, out_features=512, bias=True)\n",
+       "      (activation): Tanh()\n",
+       "    )\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.to(device)\n",
+    "model.eval()\n",
+    "torch.set_grad_enabled(False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<All keys matched successfully>"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.load_state_dict(torch.load('ckpt/Amzn13K/amzn_main_model.bin', map_location = device))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = '''SanDisk Cruzer Blade 32GB USB Flash Drive\\nUltra-compact and portable USB flash drive,Capless design\n",
+    "Share your photos, videos, songs and other files between computers with ease,care number:18001205899/18004195592\n",
+    "Protect your private files with included SanDisk SecureAccess software\n",
+    "Includes added protection of secure online backup (up to 2GB optionally available) offered by YuuWaa\n",
+    "Password-protect your sensitive files. Customer care:IndiaSupport@sandisk.com\n",
+    "Importer Details:Rashi Peripherals Pvt. Ltd. Rashi Complex,A Building,Survey186,Dongaripada,Poman Village,Vasai Bhiwandi Road, Dist. Thane,Maharastra 401208, India\n",
+    "Share your work files between computers with ease\n",
+    "Manufacturer Name & Address: SanDisk International LTD, C/O Unit 100, Airside Business Park, Lakeshore Drive, Swords, Co Dublin, Ireland.\n",
+    "Consumer Complaint Details: indiasupport@sandisk.com/18001022055'''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "item = tokenizer(text, padding='max_length', max_length=data_args.max_seq_length, truncation=True)\n",
+    "item = {k:torch.tensor(v, device = device).unsqueeze(0) for k,v in item.items()}\n",
+    "\n",
+    "outputs_doc, logits = model.forward_input_encoder(**item)\n",
+    "doc_reps = model.tok_proj(outputs_doc.last_hidden_state)\n",
+    "\n",
+    "input_ids = torch.tensor([coil_cluster_map[str(x.item())] for x in item['input_ids'][0]]).to(device).unsqueeze(0)\n",
+    "all_logits = []\n",
+    "for adi, ada, alr, ale in zip(all_desc_input_ids, all_desc_attention_mask, all_lab_reps, all_label_embeddings):\n",
+    "    all_logits.append(coil_fast_eval_forward(input_ids, doc_reps, logits, adi, ada, alr, ale))\n",
+    "\n",
+    "final_logits = sum([expit(x.cpu()) for x in all_logits]) / len(all_logits)\n",
+    "\n",
+    "outs = torch.topk(final_logits, k = 5)\n",
+    "preds_dic = dict()\n",
+    "for i,v in zip(outs.indices, outs.values):\n",
+    "    preds_dic[label_list[i]] = v.item()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'electronics': 0.9989226460456848,\n",
+       " 'computers & accessories': 0.981508731842041,\n",
+       " 'computer components': 0.9518740177154541,\n",
+       " 'computer accessories': 0.7639468312263489,\n",
+       " 'hardware': 0.6584190726280212}"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_dic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([13330])"
+      ]
+     },
+     "execution_count": 78,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "final_logits.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "90fcbf6f06d9a30c70fdaff45e14c5534421a599dc22a7267c486c9cb67dea6d"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
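Note: the exact-match token scorer that DemoFast.ipynb adds above (compute_tok_score_cart) can be sanity-checked in isolation. The sketch below is illustrative and not part of the commit; it assumes compute_tok_score_cart from the notebook is in scope, and all tensor shapes are made up, with the token dimension fixed at 16 to match the hard-coded .view(-1, 16) in that function.

import torch

# Toy setup: Q=2 label descriptions of length LQ=4, D=3 documents of length LD=6, token dim d=16.
Q, LQ, D, LD, d = 2, 4, 3, 6, 16
qry_reps = torch.randn(Q, LQ, d)               # per-token reps of each label description
doc_reps = torch.randn(D, LD, d)               # per-token reps of each document
qry_input_ids = torch.randint(0, 30, (Q, LQ))  # (cluster-mapped) token ids
doc_input_ids = torch.randint(0, 30, (D, LD))
qry_attention_mask = torch.ones(Q, LQ)

# For each (description, document) pair: keep only positions where token ids match exactly,
# take the max dot product over document positions, then sum over description positions
# (skipping the first, i.e. [CLS]).
tok_scores = compute_tok_score_cart(doc_reps, doc_input_ids, qry_reps, qry_input_ids, qry_attention_mask)
print(tok_scores.shape)  # torch.Size([2, 3]): one exact-match score per (label, document) pair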
cleaned_code/bert_coil_map_dict_lemma255K_isotropic.json ADDED
The diff for this file is too large to render. See raw diff
cleaned_code/ckpt/Amzn13K/amzn_main_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0899b193b49dc3e8acf2caa984fbaee1520933bbc2f61cbb3e594363a702708
+size 554726619
cleaned_code/configs/PredsRemover.ipynb
ADDED
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from os.path import join"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "OUT_DIR = 'output/'\n",
+    "EXP_DIR = join(OUT_DIR, 'semsup_descs_amzn13k_curie_nocoil', 'predictions')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/n/fs/nlp-pranjal/SemSup-LMLC/training\n"
+     ]
+    }
+   ],
+   "source": [
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = dict()\n",
+    "for file in os.listdir(EXP_DIR):\n",
+    "    t = float(file.split('_')[-1].replace('.pkl',''))\n",
+    "    if t not in files:\n",
+    "        files[t] = []\n",
+    "    files[t] += [join(EXP_DIR, file)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "21.792958695441484"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import itertools\n",
+    "tsize = 0\n",
+    "for file in itertools.chain(*files.values()):\n",
+    "    tsize += os.path.getsize(file)\n",
+    "tsize/ (1024**3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = {k:files[k] for k in sorted(files.keys())}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "10.170047391206026"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import random\n",
+    "tsize = 0\n",
+    "for k in sorted(list(files.keys()))[10:]:\n",
+    "    if random.random() > 0.6:\n",
+    "        continue\n",
+    "    for f in files[k]:\n",
+    "        tsize += os.path.getsize(f)\n",
+    "        os.remove(f)\n",
+    "tsize/ (1024**3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "90fcbf6f06d9a30c70fdaff45e14c5534421a599dc22a7267c486c9cb67dea6d"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
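To summarize the notebook just added: it groups the prediction pickles under `output/semsup_descs_amzn13k_curie_nocoil/predictions/` by the threshold encoded at the end of each filename (`*_<t>.pkl`), measures their total size (~21.8 GiB at the time of the run), keeps the 10 smallest thresholds, and deletes all files of each remaining threshold group with probability 0.6 (the run shown freed ~10.2 GiB). A script-form sketch of the same cleanup logic, condensed from the cells above:

import itertools
import os
import random
from os.path import join

OUT_DIR = 'output/'
EXP_DIR = join(OUT_DIR, 'semsup_descs_amzn13k_curie_nocoil', 'predictions')

# Group prediction pickles by the threshold suffix in their filenames.
files = {}
for name in os.listdir(EXP_DIR):
    t = float(name.split('_')[-1].replace('.pkl', ''))
    files.setdefault(t, []).append(join(EXP_DIR, name))

total = sum(os.path.getsize(f) for f in itertools.chain(*files.values()))
print(f"total size: {total / 1024**3:.2f} GiB")

# Keep the 10 smallest thresholds; each other threshold group is deleted
# with probability 0.6 (random.random() > 0.6 means "skip this group").
freed = 0
for t in sorted(files)[10:]:
    if random.random() > 0.6:
        continue
    for f in files[t]:
        freed += os.path.getsize(f)
        os.remove(f)
print(f"freed: {freed / 1024**3:.2f} GiB")

Note that the deletion is destructive and operates per threshold group, so on average about 60% of the non-retained groups disappear on each run.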
cleaned_code/configs/ablation_amzn_1_coil.yml
ADDED
@@ -0,0 +1,85 @@
+
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: amazon13k
+  dataset_name: amazon13k
+  dataset_config_name: null
+  max_seq_length: 160
+  overwrite_output_dir: false # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/Amzn13K/train_split6500_2.jsonl
+  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  label_max_seq_length: 160
+  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_final.json
+  test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+  all_labels : datasets/Amzn13K/all_labels.txt
+  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+  contrastive_learning_samples: 1000
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: true
+  normalize_embeddings: false
+  tie_weights: false
+  coil: false
+  colbert: false
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  do_predict: false
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 2
+  save_steps: 4900
+  evaluation_strategy: steps
+  eval_steps: 3000000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  max_eval_samples: 15000
+  ignore_data_skip: true
+  # one_hour_job: true
+  seed: -1
+
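This is the first of the ablation configs; all of them share the same three-section layout (DATA, MODEL, TRAINING) of flat key–value pairs. The trainer that consumes these files is not part of this diff, but as a hedged sketch of how such a config can be read (function and variable names here are illustrative, not repo code):

import yaml

def load_config(path):
    # Split a config into its three section dicts.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    return cfg["DATA"], cfg["MODEL"], cfg["TRAINING"]

data_args, model_args, training_args = load_config(
    "cleaned_code/configs/ablation_amzn_1_coil.yml")

# The dotted exponent spellings ("5.e-5", "1.e-4") are what keep PyYAML's
# YAML 1.1 resolver treating these values as floats; a plain "5e-5" would
# load as a string.
assert isinstance(training_args["learning_rate"], float)

# Effective train batch size per device: 1 x 8 accumulation steps = 8.
eff_bs = (training_args["per_device_train_batch_size"]
          * training_args["gradient_accumulation_steps"])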
cleaned_code/configs/ablation_amzn_1_descs.yml
ADDED
@@ -0,0 +1,89 @@
+
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: amazon13k
+  dataset_name: amazon13k
+  dataset_config_name: null
+  max_seq_length: 160
+  overwrite_output_dir: true # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/Amzn13K/train_split6500_2.jsonl
+  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+
+  # validation_file: datasets/Amzn13K/test.jsonl
+  # test_file: datasets/Amzn13K/test.jsonl
+  label_max_seq_length: 64
+  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+  descriptions_file: datasets/Amzn13K/heir_withoutdescriptions_v3_v3_unseen.json
+  test_descriptions_file: datasets/Amzn13K/heir_withoutdescriptions_v3_v3.json
+
+  all_labels : datasets/Amzn13K/all_labels.txt
+  # test_labels : datasets/Amzn13K/all_labels.txt
+  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+  contrastive_learning_samples: 1000
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: output/ablation_amzn_1_descs/checkpoint-21000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: true
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  colbert: false
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: false
+  do_eval: true
+  do_predict: false
+  per_device_train_batch_size: 4
+  gradient_accumulation_steps: 4
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 2
+  save_steps: 4900
+  evaluation_strategy: steps
+  eval_steps: 3000000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  max_eval_samples: 15000
+  ignore_data_skip: true
+  # one_hour_job: true
+  seed: -1
+
cleaned_code/configs/ablation_amzn_1_hier.yml
ADDED
@@ -0,0 +1,85 @@
+
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: amazon13k
+  dataset_name: amazon13k
+  dataset_config_name: null
+  max_seq_length: 160
+  overwrite_output_dir: true # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/Amzn13K/train_split6500_2.jsonl
+  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  label_max_seq_length: 96
+  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+  test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
+
+  all_labels : datasets/Amzn13K/all_labels.txt
+  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+  contrastive_learning_samples: 1000
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: true
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  colbert: false
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  do_predict: false
+  per_device_train_batch_size: 2
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 2
+  save_steps: 4900
+  evaluation_strategy: steps
+  eval_steps: 3000000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  max_eval_samples: 15000
+  ignore_data_skip: true
+  # one_hour_job: true
+  seed: -1
+
cleaned_code/configs/ablation_amzn_1_relax.yml
ADDED
@@ -0,0 +1,86 @@
+
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: amazon13k
+  dataset_name: amazon13k
+  dataset_config_name: null
+  max_seq_length: 160
+  overwrite_output_dir: false # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/Amzn13K/train_split6500_2.jsonl
+  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  label_max_seq_length: 160
+  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_final.json
+  test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+  all_labels : datasets/Amzn13K/all_labels.txt
+  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+  contrastive_learning_samples: 1000
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  # coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: true
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  colbert: false
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  do_predict: false
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 2
+  save_steps: 4900
+  evaluation_strategy: steps
+  eval_steps: 3000000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  max_eval_samples: 15000
+  ignore_data_skip: true
+  # one_hour_job: true
+  seed: -1
+
+
cleaned_code/configs/ablation_amzn_eda.yml
ADDED
@@ -0,0 +1,81 @@
+DATA:
+  task_name: amazon13k
+  dataset_name: amazon13k
+  dataset_config_name: null
+  max_seq_length: 160
+  overwrite_output_dir: false # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/train_split6500_2.jsonl
+  validation_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  test_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  label_max_seq_length: 160
+
+  descriptions_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_edaaug.json
+  test_descriptions_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+  all_labels : /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/all_labels.txt
+  test_labels: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+  contrastive_learning_samples: 1000
+  cl_min_positive_descs: 1
+  # bm_short_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: true
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  colbert: false
+  # use_precomputed_embeddings: /n/fs/nlp-pranjal/SemSup-LMLC/training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  do_predict: false
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 2
+  save_steps: 4900
+  evaluation_strategy: steps
+  eval_steps: 3000000
+  fp16: false
+  # fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  max_eval_samples: 15000
+  ignore_data_skip: true
+  # one_hour_job: true
+  seed: -1
+
cleaned_code/configs/ablation_amzn_eda_base.yml
ADDED
@@ -0,0 +1,85 @@
+
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: amazon13k
+  dataset_name: amazon13k
+  dataset_config_name: null
+  max_seq_length: 160
+  overwrite_output_dir: false # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/Amzn13K/train_split6500_2.jsonl
+  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  label_max_seq_length: 160
+  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen_final.json
+  test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+  all_labels : datasets/Amzn13K/all_labels.txt
+  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+  contrastive_learning_samples: 1000
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: true
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  colbert: false
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  do_predict: false
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 2
+  save_steps: 4900
+  evaluation_strategy: steps
+  eval_steps: 3000000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  max_eval_samples: 15000
+  ignore_data_skip: true
+  # one_hour_job: true
+  seed: -1
+
cleaned_code/configs/ablation_amzn_eda_base2.yml
ADDED
@@ -0,0 +1,84 @@
+
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: amazon13k
+  dataset_name: amazon13k
+  dataset_config_name: null
+  max_seq_length: 128
+  overwrite_output_dir: true # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/Amzn13K/train_split6500_2.jsonl
+  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
+  label_max_seq_length: 96
+  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
+  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
+  test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
+
+  all_labels : datasets/Amzn13K/all_labels.txt
+  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt
+
+  contrastive_learning_samples: 2000
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: true
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  colbert: false
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  do_predict: false
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 3
+  save_steps: 4900
+  evaluation_strategy: steps
+  eval_steps: 3000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 4
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  max_eval_samples: 15000
+  ignore_data_skip: true
+  # one_hour_job: true
+
cleaned_code/configs/ablation_eurlex_1_base.yml
ADDED
@@ -0,0 +1,85 @@
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: eurlex57k
+  dataset_name: eurlex
+  dataset_config_name: null
+  max_seq_length: 512
+  overwrite_output_dir: true
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: ../training/datasets/eurlex4.3k/train_split1057.jsonl
+  # validation_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  # test_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  validation_file: ../training/datasets/eurlex4.3k/test.jsonl
+  test_file: ../training/datasets/eurlex4.3k/test.jsonl
+
+  # validation_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
+  # test_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
+  label_max_seq_length: 128
+  # descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+  # test_descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+  descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen.json
+  test_descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+  all_labels : ../training/datasets/eurlex4.3k/all_labels.txt
+  test_labels : ../training/datasets/eurlex4.3k/all_labels.txt
+  # test_labels: ../training/datasets/eurlex4.3k/unseen_labels_split1057.txt
+  # test_labels: ../training/datasets/eurlex4.3k/unseen_labels.txt
+
+  contrastive_learning_samples: 1500
+  cl_min_positive_descs: 1
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  # pretrained_model_path: /n/fs/scratch/pranjal/seed_experiments/ablation_eurlex_1_base_web_128_seed2/checkpoint-21600/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: false
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  # use_precomputed_embeddings: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 10
+  save_steps: 5400
+  evaluation_strategy: steps
+  eval_steps: 5000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 8
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  seed: -1
+
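The eurlex ablation configs that follow differ from this base file in only a handful of keys (which description file is used, whether `coil` is on, batch sizes, and so on), so diffing the parsed YAML is the fastest way to see what each ablation changes. A small hedged sketch (not repo code) that prints the disagreeing keys between two configs:

import yaml

def flatten(cfg, prefix=""):
    # Flatten nested sections into dotted keys, e.g. "MODEL.coil".
    out = {}
    for k, v in cfg.items():
        key = prefix + str(k)
        if isinstance(v, dict):
            out.update(flatten(v, key + "."))
        else:
            out[key] = v
    return out

def diff_configs(path_a, path_b):
    with open(path_a) as fa, open(path_b) as fb:
        a, b = flatten(yaml.safe_load(fa)), flatten(yaml.safe_load(fb))
    for key in sorted(set(a) | set(b)):
        if a.get(key) != b.get(key):
            print(f"{key}: {a.get(key)!r} -> {b.get(key)!r}")

# e.g. diff_configs("cleaned_code/configs/ablation_eurlex_1_base.yml",
#                   "cleaned_code/configs/ablation_eurlex_1_coil.yml")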
cleaned_code/configs/ablation_eurlex_1_coil.yml
ADDED
@@ -0,0 +1,88 @@
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: eurlex57k
+  dataset_name: eurlex
+  dataset_config_name: null
+  max_seq_length: 512
+  overwrite_output_dir: true
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/eurlex4.3k/train_split1057.jsonl
+  validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  label_max_seq_length: 128
+  # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+  # test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+  descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen.json
+  test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+  all_labels : datasets/eurlex4.3k/all_labels.txt
+  test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+  # max_descs_per_label: 5
+  contrastive_learning_samples: 1500
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+  # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  pretrained_model_path: /n/fs/scratch/pranjal/seed_experiments/ablation_eurlex_1_coil_web_seed2/checkpoint-5400/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  label_model_name_or_path: prajjwal1/bert-small
+  # label_model_name_or_path: bert-base-uncased
+  # label_model_name_or_path: prajjwal1/bert-tiny
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: false
+  normalize_embeddings: false
+  tie_weights: false
+  coil: false
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  # num_frozen_layers: 9
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: false
+  do_eval: true
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 10
+  save_steps: 5400
+  evaluation_strategy: steps
+  eval_steps: 5000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 8
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  seed: -1
+
cleaned_code/configs/ablation_eurlex_1_descs.yml
ADDED
@@ -0,0 +1,91 @@
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: eurlex57k
+  dataset_name: eurlex
+  dataset_config_name: null
+  max_seq_length: 512
+  overwrite_output_dir: true
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/eurlex4.3k/train_split1057.jsonl
+  # validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  # test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+  validation_file: datasets/eurlex4.3k/test.jsonl
+  test_file: datasets/eurlex4.3k/test.jsonl
+
+  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  label_max_seq_length: 64
+  descriptions_file: datasets/eurlex4.3k/heir_withoutdescriptions_4.3k_web_nl_unseen.json
+  test_descriptions_file: datasets/eurlex4.3k/heir_withoutdescriptions_4.3k_web_nl.json
+  # descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+  # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+  all_labels : datasets/eurlex4.3k/all_labels.txt
+  test_labels : datasets/eurlex4.3k/all_labels.txt
+
+  # test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+  # max_descs_per_label: 5
+  contrastive_learning_samples: 1500
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+  # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  pretrained_model_path: /n/fs/scratch/pranjal/seed_experiments/ablation_eurlex_1_descs_seed3/checkpoint-27000/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  label_model_name_or_path: prajjwal1/bert-small
+  # label_model_name_or_path: bert-base-uncased
+  # label_model_name_or_path: prajjwal1/bert-tiny
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: false
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  # num_frozen_layers: 9
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: false
+  do_eval: true
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 10
+  save_steps: 5400
+  evaluation_strategy: steps
+  eval_steps: 5000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 8
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  seed: -1
+
cleaned_code/configs/ablation_eurlex_1_hier_descs.yml
ADDED
@@ -0,0 +1,91 @@
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: eurlex57k
+  dataset_name: eurlex
+  dataset_config_name: null
+  max_seq_length: 512
+  overwrite_output_dir: true
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  # train_file: datasets/eurlex4.3k/train_hr.jsonl
+  # train_file: datasets/eurlex4.3k/train.jsonl
+  # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
+  # test_file: datasets/eurlex4.3k/test_unseen.jsonl
+  # validation_file: datasets/eurlex4.3k/test.jsonl
+  # test_file: datasets/eurlex4.3k/test.jsonl
+  train_file: datasets/eurlex4.3k/train_split1057.jsonl
+  validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  label_max_seq_length: 8
+  descriptions_file: datasets/eurlex4.3k/all_names.json
+  test_descriptions_file: datasets/eurlex4.3k/all_names.json
+  # descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
+  # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+  all_labels : datasets/eurlex4.3k/all_labels.txt
+  test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+  # max_descs_per_label: 5
+  # contrastive_learning_samples: 1500
+  # cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+  # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  label_model_name_or_path: prajjwal1/bert-small
+  # label_model_name_or_path: bert-base-uncased
+  # label_model_name_or_path: prajjwal1/bert-tiny
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: false
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  # num_frozen_layers: 9
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: true
+  do_eval: true
+  per_device_train_batch_size: 4
+  gradient_accumulation_steps: 4
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 10
+  save_steps: 10000
+  evaluation_strategy: steps
+  eval_steps: 500
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 8
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+
+
cleaned_code/configs/ablation_eurlex_1_hierarchy.yml
ADDED
@@ -0,0 +1,88 @@
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: eurlex57k
+  dataset_name: eurlex
+  dataset_config_name: null
+  max_seq_length: 512
+  overwrite_output_dir: true
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/eurlex4.3k/train_split1057.jsonl
+  validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  label_max_seq_length: 96
+  # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+  # test_descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
+  descriptions_file: datasets/eurlex4.3k/eurlex_descs_refined_v3_v3.json
+  test_descriptions_file: datasets/eurlex4.3k/eurlex_descs_refined_v3_v3.json
+
+
+  all_labels : datasets/eurlex4.3k/all_labels.txt
+  test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+  max_descs_per_label: 5
+  contrastive_learning_samples: 1500
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+  # coil_cluster_mapping_path: bert_coil_map_dict_lemma.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  pretrained_model_path: seed_experiments/ablation_eurlex_1_hierarchy_web_seed3/checkpoint-5400/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  label_model_name_or_path: prajjwal1/bert-small
+  # label_model_name_or_path: bert-base-uncased
+  # label_model_name_or_path: prajjwal1/bert-tiny
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: false
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  # num_frozen_layers: 9
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: false
+  do_eval: true
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 10
+  save_steps: 5400
+  evaluation_strategy: steps
+  eval_steps: 5000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 8
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  seed: -1
+
cleaned_code/configs/ablation_eurlex_1_relax.yml
ADDED
@@ -0,0 +1,86 @@
+
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: eurlex57k
+  dataset_name: eurlex
+  dataset_config_name: null
+  max_seq_length: 512
+  overwrite_output_dir: true # Set to false, if using one_hour_job
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/eurlex4.3k/train_split1057.jsonl
+  validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  label_max_seq_length: 128
+  # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+  # test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+  descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen.json
+  test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+  all_labels : datasets/eurlex4.3k/all_labels.txt
+  test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+  contrastive_learning_samples: 1500
+  cl_min_positive_descs: 1
+  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
+  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
+  # coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  pretrained_model_path: seed_experiments/ablation_eurlex_1_relax_web_seed3/checkpoint-4900/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: false
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: false
+  do_eval: true
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 8
+  per_device_eval_batch_size: 1
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 10
+  save_steps: 5400
+  evaluation_strategy: steps
+  eval_steps: 5000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  # max_eval_samples: 15000
+  # ignore_data_skip: true
+  # one_hour_job: true
+  seed: -1
+
cleaned_code/configs/ablation_eurlex_eda.yml
ADDED
@@ -0,0 +1,82 @@
+EXP_NAME: "semsup_descs_100ep_newds_cosine"
+EXP_DESC: "SemSup Descriptions ran for 100 epochs"
+
+DATA:
+  task_name: eurlex57k
+  dataset_name: eurlex
+  dataset_config_name: null
+  max_seq_length: 512
+  overwrite_output_dir: true
+  overwrite_cache: false
+  pad_to_max_length: true
+  load_from_local: true
+  max_train_samples: null
+  max_eval_samples: null
+  max_predict_samples: null
+  train_file: datasets/eurlex4.3k/train_split1057.jsonl
+  validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+  test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
+
+  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
+  label_max_seq_length: 128
+  # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl_unseen.json
+  # test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
+  descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl_unseen_edaaug.json
+  test_descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_web_nl.json
+
+
+  all_labels : datasets/eurlex4.3k/all_labels.txt
+  test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
+  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
+
+  contrastive_learning_samples: 1500
+  cl_min_positive_descs: 1
+  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json
+
+MODEL:
+  model_name_or_path: bert-base-uncased
+  pretrained_model_path: seed_experiments/ablation_eurlex_1_eda_web_128_seed3/checkpoint-5400/pytorch_model.bin
+  config_name: null
+  tokenizer_name: null
+  cache_dir: null
+  use_fast_tokenizer: true
+  model_revision: main
+  use_auth_token: false
+  ignore_mismatched_sizes: false
+  negative_sampling: "none"
+  semsup: true
+  label_model_name_or_path: prajjwal1/bert-small
+  encoder_model_type: bert
+  use_custom_optimizer: adamw
+  output_learning_rate: 1.e-4
+  arch_type : 2
+  add_label_name: false
+  normalize_embeddings: false
+  tie_weights: false
+  coil: true
+  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
+  token_dim: 16
+  label_frozen_layers: 2
+
+TRAINING:
+  do_train: false
+  do_eval: true
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 4
+  per_device_eval_batch_size: 2
+  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
+  num_train_epochs: 10
+  save_steps: 5400
+  evaluation_strategy: steps
+  eval_steps: 5000
+  fp16: true
+  fp16_opt_level: O1
+  lr_scheduler_type: "linear" # defaults to 'linear'
+  dataloader_num_workers: 16
+  label_names: [labels]
+  scenario: "unseen_labels"
+
+  ddp_find_unused_parameters: false
+  seed: -1
+
cleaned_code/configs/amzn13k_active_hfwnet.yml
ADDED
@@ -0,0 +1,79 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split1668_hfwnet.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 20
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  # ignore_data_skip: true
  # one_hour_job: true
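Note: the amzn13k_active_* configs in this commit differ mainly in which train split they point to (hfwnet, highfreq, random, wnet, wnet2). A quick sketch for comparing the label coverage of those splits, assuming each JSONL line carries a "labels" list (an assumption consistent with label_names: [labels] above):

import json
from collections import Counter

def label_frequencies(path: str) -> Counter:
    freq = Counter()
    with open(path) as f:
        for line in f:
            freq.update(json.loads(line).get("labels", []))
    return freq

freq = label_frequencies("datasets/Amzn13K/train_split1668_hfwnet.jsonl")
print(len(freq), "distinct labels;", freq.most_common(5))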
cleaned_code/configs/amzn13k_active_highfreq.yml
ADDED
@@ -0,0 +1,87 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  # train_file: datasets/Amzn13K/train_split1668_highfreq_fs50.jsonl
  # train_file: datasets/Amzn13K/train_split1668_highfreq.jsonl
  train_file: datasets/Amzn13K/train_split1106_highfreq_bot.jsonl

  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 96
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  # descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
  test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json

  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split1668_highfreq_fs50.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 20
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 1000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  max_train_samples: 30000

  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/amzn13k_active_random.yml
ADDED
@@ -0,0 +1,81 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split1668_random.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split1668_random_fs50.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 20
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  max_train_samples: 30000

  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/amzn13k_active_wnet.yml
ADDED
@@ -0,0 +1,79 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split1228_wnet.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 20
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/amzn13k_active_wnet2.yml
ADDED
@@ -0,0 +1,86 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  # train_file: datasets/Amzn13K/train_split2807_wnet2_fs50.jsonl
  # train_file: datasets/Amzn13K/train_split2807_wnet2.jsonl
  train_file: datasets/Amzn13K/train_split1106_wnet2_bot_high.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 96
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  # descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
  test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json

  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split2807_wnet2_fs50.txt

MODEL:
  model_name_or_path: bert-base-uncased
  pretrained_label_model_path: label_model_amzn_hier_format.pt
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 20
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  max_train_samples: 10000
  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/amzn13k_baseline.yml
ADDED
@@ -0,0 +1,73 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500.jsonl
  label_max_seq_length: 8
  descriptions_file: datasets/Amzn13K/names_descriptions.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500.txt

  max_descs_per_label: 5
  contrastive_learning_samples: 6000
  cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: false
  normalize_embeddings: false
  tie_weights: false
  coil: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 1
  per_device_eval_batch_size: 4
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 1000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 20000
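Note: an illustrative sketch of the bi-encoder setup these semsup configs describe: bert-base-uncased encodes documents (max_seq_length: 128) and a small label encoder (here prajjwal1/bert-tiny, label_max_seq_length: 8) encodes label descriptions, with scores from a dot product. The linear projection below is an assumed stand-in for bridging the two hidden sizes; the real arch_type-2 / COIL model is more involved.

import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # bert-tiny shares this vocab
doc_enc = AutoModel.from_pretrained("bert-base-uncased")
lab_enc = AutoModel.from_pretrained("prajjwal1/bert-tiny")
proj = torch.nn.Linear(768, 128)  # illustrative bridge between the hidden sizes

def cls_embed(model, texts, max_len):
    batch = tok(texts, padding=True, truncation=True, max_length=max_len,
                return_tensors="pt")
    return model(**batch).last_hidden_state[:, 0]  # [CLS] embedding per text

docs = cls_embed(doc_enc, ["A handbook of home vegetable gardening."], 128)
labels = cls_embed(lab_enc, ["gardening", "computer science"], 8)
scores = proj(docs) @ labels.T  # one logit per (document, label) pair
print(scores)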
cleaned_code/configs/amzn13k_baseline_descs.yml
ADDED
@@ -0,0 +1,81 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: false # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  # descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  descriptions_file: datasets/Amzn13K/amzn_summ_descs.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  max_descs_per_label: 5
  contrastive_learning_samples: 3500
  cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt

MODEL:
  model_name_or_path: bert-base-uncased
  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 4000
  evaluation_strategy: steps
  eval_steps: 30000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/amzn13k_baseline_descs_edaaug.yml
ADDED
@@ -0,0 +1,75 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl

  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_eda_aug.json
  test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json

  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  max_descs_per_label: 1000
  contrastive_learning_samples: 3500
  cl_min_positive_descs: 1

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
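Note: the _eda_aug descriptions file above points to EDA-style augmentation (Wei & Zou, 2019) of the label descriptions. The generation script is not part of this commit, so the sketch below (synonym replacement only, one of EDA's four operations) is an assumption:

import random
from nltk.corpus import wordnet  # requires nltk plus the wordnet corpus

def synonym_replace(text: str, n: int = 2, seed: int = 0) -> str:
    rng = random.Random(seed)
    words = text.split()
    for _ in range(n):
        i = rng.randrange(len(words))
        syns = {l.name().replace("_", " ")
                for s in wordnet.synsets(words[i]) for l in s.lemmas()}
        syns.discard(words[i])
        if syns:
            words[i] = rng.choice(sorted(syns))  # swap in a random synonym
    return " ".join(words)

print(synonym_replace("books about growing plants at home"))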
cleaned_code/configs/amzn13k_baseline_descs_fullsup.yml
ADDED
@@ -0,0 +1,74 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train.jsonl
  validation_file: datasets/Amzn13K/test.jsonl
  test_file: datasets/Amzn13K/test.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  # all_labels : datasets/Amzn13K/all_labels.txt
  # test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  contrastive_learning_samples: 5000
  cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 1
  per_device_eval_batch_size: 2
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 30000
  evaluation_strategy: steps
  eval_steps: 5000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "seen"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_masked_0.0.yml
ADDED
@@ -0,0 +1,75 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl

  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.0.json
  test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json

  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  max_descs_per_label: 1000
  contrastive_learning_samples: 3500
  cl_min_positive_descs: 1

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
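Note: the masked_0.X configs (this one and the three that follow) sweep the fraction of masked description tokens, p in {0.0, 0.2, 0.5, 0.9}, while always evaluating against the unmasked descriptions. One plausible way such files could be produced (an assumption; neither the generation script nor the exact JSON layout is part of this commit):

import json, random

def mask_descriptions(in_path: str, out_path: str, p: float, seed: int = 0) -> None:
    rng = random.Random(seed)
    with open(in_path) as f:
        descs = json.load(f)  # assumed layout: {label: [description, ...]}
    masked = {
        label: [" ".join(w if rng.random() >= p else "[MASK]" for w in t.split())
                for t in texts]
        for label, texts in descs.items()
    }
    with open(out_path, "w") as f:
        json.dump(masked, f)

mask_descriptions("datasets/Amzn13K/amzn_descs_refined_v3_v3.json",
                  "datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.2.json", 0.2)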
cleaned_code/configs/amzn13k_baseline_descs_masked_0.2.yml
ADDED
@@ -0,0 +1,75 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl

  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.2.json
  test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json

  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  max_descs_per_label: 1000
  contrastive_learning_samples: 3500
  cl_min_positive_descs: 1

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_masked_0.5.yml
ADDED
@@ -0,0 +1,75 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl

  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.5.json
  test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json

  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  max_descs_per_label: 1000
  contrastive_learning_samples: 3500
  cl_min_positive_descs: 1

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_masked_0.9.yml
ADDED
@@ -0,0 +1,75 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl

  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_masked_0.9.json
  test_descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json

  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  max_descs_per_label: 1000
  contrastive_learning_samples: 3500
  cl_min_positive_descs: 1

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 2000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
cleaned_code/configs/amzn13k_baseline_descs_merge.yml
ADDED
@@ -0,0 +1,76 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 80
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3_merge3.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  contrastive_learning_samples: 5000
  cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 2
  learning_rate: 5.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 1000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
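Note: this merge config is one of the few that flips colbert: true alongside coil: true. A sketch of the ColBERT-style late interaction that the flag points to: with per-token embeddings of width token_dim (16 here), each document token takes its maximum similarity over description tokens, and the maxima are summed. Illustrative only, not the repo's exact scoring code.

import torch

def maxsim(doc_tok: torch.Tensor, desc_tok: torch.Tensor) -> torch.Tensor:
    # doc_tok: (n_doc_tokens, d), desc_tok: (n_desc_tokens, d), d = token_dim
    sim = doc_tok @ desc_tok.T           # all token-pair similarities
    return sim.max(dim=1).values.sum()   # max over description tokens, sum over document

score = maxsim(torch.randn(128, 16), torch.randn(80, 16))
print(score)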
cleaned_code/configs/amzn13k_baseline_fs.yml
ADDED
@@ -0,0 +1,80 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_fs100.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs100.txt

MODEL:
  model_name_or_path: bert-base-uncased
  pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 5.e-5
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  per_device_eval_batch_size: 1
  learning_rate: 2.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 10
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  # ignore_data_skip: true
  # one_hour_job: true
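Note: the few-shot configs (fs, fs2, fs5) warm-start from pretrained_model_path and lower the input-encoder learning rate to 2.e-5. A minimal sketch of such a warm start; the AutoModel target below is a stand-in, since an actual run would rebuild the full semsup model before loading:

import torch
from transformers import AutoModel

ckpt = "output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin"
state = torch.load(ckpt, map_location="cpu")
model = AutoModel.from_pretrained("bert-base-uncased")  # stand-in target module
missing, unexpected = model.load_state_dict(state, strict=False)
print(len(missing), "missing keys;", len(unexpected), "unexpected keys")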
cleaned_code/configs/amzn13k_baseline_fs2.yml
ADDED
@@ -0,0 +1,80 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: false # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_fs5.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_fs5.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_fs5.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_fs5.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt

MODEL:
  model_name_or_path: bert-base-uncased
  pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 5.e-5
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 2.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/amzn13k_baseline_fs5.yml
ADDED
@@ -0,0 +1,80 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_fs5.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 32
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/amzn_descs_refined_v3_v3.json
  all_labels : datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  # max_descs_per_label: 10
  # contrastive_learning_samples: 5000
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt

MODEL:
  model_name_or_path: bert-base-uncased
  pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 5.e-5
  arch_type : 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  per_device_eval_batch_size: 1
  learning_rate: 2.e-5 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 10
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/amzn13k_baseline_hierdescs.yml
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 96
  # descriptions_file: datasets/Amzn13K/amzn_curie_descsriptions.json
  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
  test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json

  all_labels: datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  contrastive_learning_samples: 2000
  cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt
  coil_cluster_mapping_path: bert_coil_map_dict_lemma255K_isotropic.json

MODEL:
  model_name_or_path: bert-base-uncased
  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  do_predict: false
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 4900
  evaluation_strategy: steps
  eval_steps: 5000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  ignore_data_skip: true
  # one_hour_job: true
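Note the split above: training reads the _unseen description file while evaluation reads the full one, so descriptions of held-out labels are only seen at test time. A hypothetical illustration of that selection logic (the real dataloader may differ, and the JSON schema is assumed):

import json

def descriptions_for(split, data_cfg):
    # Evaluation falls back to descriptions_file when no
    # test_descriptions_file is given (as in the *_seen config below).
    path = data_cfg["descriptions_file"]
    if split == "test" and data_cfg.get("test_descriptions_file"):
        path = data_cfg["test_descriptions_file"]
    with open(path) as f:
        return json.load(f)  # assumed: label -> list of description strings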
cleaned_code/configs/amzn13k_baseline_hierdescs_seen.yml
ADDED
@@ -0,0 +1,82 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: amazon13k
  dataset_name: amazon13k
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true # Set to false, if using one_hour_job
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/Amzn13K/train_split6500_2.jsonl
  validation_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  test_file: datasets/Amzn13K/test_unseen_split6500_2.jsonl
  label_max_seq_length: 96
  descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json
  # descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3_unseen.json
  # test_descriptions_file: datasets/Amzn13K/heir_withdescriptions_v3_v3.json

  all_labels: datasets/Amzn13K/all_labels.txt
  test_labels: datasets/Amzn13K/unseen_labels_split6500_2.txt

  contrastive_learning_samples: 2000
  cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt
  # ignore_pos_labels_file: datasets/Amzn13K/ignore_train_split6500_fs5.txt

MODEL:
  model_name_or_path: bert-base-uncased
  # pretrained_model_path: output/semsup_descs_amzn13k_web_6500_small/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-small
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: true
  normalize_embeddings: false
  tie_weights: false
  coil: true
  colbert: false
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 3
  save_steps: 5000
  evaluation_strategy: steps
  eval_steps: 1000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  max_eval_samples: 15000
  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/baseline.yml
ADDED
@@ -0,0 +1,52 @@
EXP_NAME: "eurlex4k_baseline_128_newds"
EXP_DESC: "Eurlex4K Baseline with len=128 on new dataset"
# Ideally would contain all the possible keys

DATA:
  task_name: eurlex4k
  dataset_name: eurlex
  dataset_config_name: null
  max_seq_length: 128
  overwrite_output_dir: true
  overwrite_cache: true
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/eurlex_raw_text_dataset/train.jsonl
  validation_file: datasets/eurlex_raw_text_dataset/test.jsonl
  test_file: datasets/eurlex_raw_text_dataset/test.jsonl

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: false
  encoder_model_type: bert
  user_custom_optimizer: null

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 8
  gradient_accumulation_steps: 1
  learning_rate: 1.e-4 # Will point to input encoder lr, if user_custom_optimizer is False
  num_train_epochs: 30
  save_steps: 20000
  evaluation_strategy: steps
  eval_steps: 10000
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "constant_with_warmup" # defaults to 'linear'
  dataloader_num_workers: 4
  label_names: [labels]
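This baseline trains at per_device_train_batch_size 8 with no accumulation, while the SemSup configs above use batch size 1 with gradient_accumulation_steps 8 (or 4); per optimizer step, both consume per_device_batch x accumulation_steps x num_gpus examples. A quick sanity check:

def effective_batch_size(per_device_bs, accum_steps, num_gpus=1):
    # Examples consumed per optimizer update.
    return per_device_bs * accum_steps * num_gpus

# batch 8 with no accumulation == batch 1 accumulated 8 times
assert effective_batch_size(8, 1) == effective_batch_size(1, 8) == 8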
cleaned_code/configs/eurlex4.3k_baseline.yml
ADDED
@@ -0,0 +1,87 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: eurlex57k
  dataset_name: eurlex
  dataset_config_name: null
  max_seq_length: 512
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  # train_file: ../training/datasets/eurlex4.3k/train_hr.jsonl
  # train_file: ../training/datasets/eurlex4.3k/train.jsonl
  # validation_file: ../training/datasets/eurlex4.3k/test_unseen.jsonl
  # test_file: ../training/datasets/eurlex4.3k/test_unseen.jsonl
  # validation_file: ../training/datasets/eurlex4.3k/test.jsonl
  # test_file: ../training/datasets/eurlex4.3k/test.jsonl
  train_file: ../training/datasets/eurlex4.3k/train_split1057_1000highfreq.jsonl
  validation_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl
  test_file: ../training/datasets/eurlex4.3k/test_unseen_split1057.jsonl

  # validation_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
  # test_file: ../training/datasets/eurlex4.3k/test_unseen_hr.jsonl
  label_max_seq_length: 96
  descriptions_file: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
  # descriptions_file: ../training/datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
  # descriptions_file: ../training/datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
  all_labels: ../training/datasets/eurlex4.3k/all_labels.txt
  test_labels: ../training/datasets/eurlex4.3k/unseen_labels_split1057.txt
  # test_labels: ../training/datasets/eurlex4.3k/unseen_labels.txt

  max_descs_per_label: 5
  # contrastive_learning_samples: 1500
  # cl_min_positive_descs: 1
  # bm_short_file: ../training/datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: prajjwal1/bert-small
  label_model_name_or_path: bert-base-uncased
  # label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: false
  normalize_embeddings: false
  tie_weights: true
  coil: true
  # use_precomputed_embeddings: ../training/datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  num_frozen_layers: 9

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 10
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
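This config freezes the first 9 layers of the input encoder (num_frozen_layers: 9), while most other configs freeze the bottom 2 layers of the label encoder (label_frozen_layers: 2). With a Hugging Face BERT model, freezing the bottom N transformer layers looks roughly like the sketch below (assuming the standard encoder.layer module layout; the repo's own freezing code may differ):

from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
for layer in model.encoder.layer[:9]:  # num_frozen_layers: 9
    for param in layer.parameters():
        param.requires_grad = False
# The embedding table is typically frozen along with the bottom layers.
for param in model.embeddings.parameters():
    param.requires_grad = False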
cleaned_code/configs/eurlex4.3k_baseline2.yml
ADDED
@@ -0,0 +1,84 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: eurlex57k
  dataset_name: eurlex
  dataset_config_name: null
  max_seq_length: 512
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  # train_file: datasets/eurlex4.3k/train_hr.jsonl
  # train_file: datasets/eurlex4.3k/train.jsonl
  # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen.jsonl
  # validation_file: datasets/eurlex4.3k/test.jsonl
  # test_file: datasets/eurlex4.3k/test.jsonl
  train_file: datasets/eurlex4.3k/train_split248_root.jsonl
  validation_file: datasets/eurlex4.3k/test_unseen_split248_root.jsonl
  test_file: datasets/eurlex4.3k/test_unseen_split248_root.jsonl

  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  label_max_seq_length: 96
  descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
  # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
  all_labels: datasets/eurlex4.3k/all_labels.txt
  test_labels: datasets/eurlex4.3k/unseen_labels_split248_root.txt
  # test_labels: datasets/eurlex4.3k/unseen_labels.txt

  max_descs_per_label: 5
  contrastive_learning_samples: 2500
  cl_min_positive_descs: 1
  bm_short_file: datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  # label_model_name_or_path: bert-base-uncased # prajjwal1/bert-small
  label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: false
  normalize_embeddings: false
  tie_weights: false
  coil: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  per_device_eval_batch_size: 2
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 10
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
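Here contrastive sampling is enabled: each step scores the document against a 2500-label subset rather than all 4.3K labels, keeping at least cl_min_positive_descs positives. A rough sketch of what such subsampling could look like — the helper and its exact policy are assumptions, not the repo's code:

import random

def sample_label_subset(all_labels, positives, n_samples=2500, min_pos=1):
    # Keep the document's positive labels (at least min_pos of them),
    # then pad with randomly drawn negatives up to n_samples.
    assert len(positives) >= min_pos
    pool = [label for label in all_labels if label not in set(positives)]
    negatives = random.sample(pool, max(0, n_samples - len(positives)))
    return list(positives) + negatives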
cleaned_code/configs/eurlex4.3k_baseline_fs.yml
ADDED
@@ -0,0 +1,90 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: eurlex57k
  dataset_name: eurlex
  dataset_config_name: null
  max_seq_length: 512
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  # train_file: datasets/eurlex4.3k/train_hr.jsonl
  # train_file: datasets/eurlex4.3k/train.jsonl
  # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen.jsonl
  # validation_file: datasets/eurlex4.3k/test.jsonl
  # test_file: datasets/eurlex4.3k/test.jsonl
  train_file: datasets/eurlex4.3k/train_split1057_fs1.jsonl
  validation_file: datasets/eurlex4.3k/test_unseen_split1057_fs1.jsonl
  test_file: datasets/eurlex4.3k/test_unseen_split1057_fs1.jsonl

  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  label_max_seq_length: 80
  # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
  descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
  # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
  all_labels: datasets/eurlex4.3k/all_labels.txt
  test_labels: datasets/eurlex4.3k/unseen_labels_split1057_fs1.txt
  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
  ignore_pos_labels_file: datasets/eurlex4.3k/ignore_train_split1057_fs1.txt

  max_descs_per_label: 5
  contrastive_learning_samples: 600
  cl_min_positive_descs: 2
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  pretrained_model_path: output/semsup_descs_100ep_4.3k_unseen_coilsmall_hier/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  # label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: false
  normalize_embeddings: false
  tie_weights: false
  coil: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 100
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 100
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 1
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  # ignore_data_skip: true
  # one_hour_job: true
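The few-shot variants (fs1, and fs20/fs5 below) warm-start from a zero-shot checkpoint via pretrained_model_path and point ignore_pos_labels_file at the positive labels to exclude from supervision. Assuming that file holds one label per line — the training code may interpret it differently — filtering would look roughly like:

def load_ignored_labels(path):
    # One label per line (assumed format).
    with open(path) as f:
        return {line.strip() for line in f if line.strip()}

def filter_positives(example_labels, ignored):
    # Drop ignored positives so only the few-shot labels supervise training.
    return [label for label in example_labels if label not in ignored]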
cleaned_code/configs/eurlex4.3k_baseline_fs20.yml
ADDED
@@ -0,0 +1,90 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: eurlex57k
  dataset_name: eurlex
  dataset_config_name: null
  max_seq_length: 512
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  # train_file: datasets/eurlex4.3k/train_hr.jsonl
  # train_file: datasets/eurlex4.3k/train.jsonl
  # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen.jsonl
  # validation_file: datasets/eurlex4.3k/test.jsonl
  # test_file: datasets/eurlex4.3k/test.jsonl
  train_file: datasets/eurlex4.3k/train_split1057_fs20.jsonl
  validation_file: datasets/eurlex4.3k/test_unseen_split1057_fs20.jsonl
  test_file: datasets/eurlex4.3k/test_unseen_split1057_fs20.jsonl

  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  label_max_seq_length: 80
  # descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1.json
  descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
  # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
  all_labels: datasets/eurlex4.3k/all_labels.txt
  test_labels: datasets/eurlex4.3k/unseen_labels_split1057_fs20.txt
  # test_labels: datasets/eurlex4.3k/unseen_labels.txt
  ignore_pos_labels_file: datasets/eurlex4.3k/ignore_train_split1057_fs20.txt

  max_descs_per_label: 5
  contrastive_learning_samples: 600
  cl_min_positive_descs: 2
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  pretrained_model_path: output/semsup_descs_100ep_4.3k_unseen_coilsmall_hier/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  # label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: false
  normalize_embeddings: false
  tie_weights: false
  coil: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 20
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 1
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/eurlex4.3k_baseline_fs5.yml
ADDED
@@ -0,0 +1,78 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: eurlex57k
  dataset_name: eurlex
  dataset_config_name: null
  max_seq_length: 512
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  train_file: datasets/eurlex4.3k/train_split1057_fs5.jsonl
  validation_file: datasets/eurlex4.3k/test_unseen_split1057_fs5.jsonl
  test_file: datasets/eurlex4.3k/test_unseen_split1057_fs5.jsonl

  label_max_seq_length: 80
  descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
  all_labels: datasets/eurlex4.3k/all_labels.txt
  test_labels: datasets/eurlex4.3k/unseen_labels_split1057_fs5.txt
  ignore_pos_labels_file: datasets/eurlex4.3k/ignore_train_split1057_fs5.txt

  max_descs_per_label: 5
  contrastive_learning_samples: 600
  cl_min_positive_descs: 2

MODEL:
  model_name_or_path: bert-base-uncased
  pretrained_model_path: output/semsup_descs_100ep_4.3k_unseen_coilsmall_hier/checkpoint-20000/pytorch_model.bin
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  # label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: false
  normalize_embeddings: false
  tie_weights: false
  coil: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 20
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 1
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false
  # ignore_data_skip: true
  # one_hour_job: true
cleaned_code/configs/eurlex4.3k_baseline_nl.yml
ADDED
@@ -0,0 +1,88 @@
EXP_NAME: "semsup_descs_100ep_newds_cosine"
EXP_DESC: "SemSup Descriptions ran for 100 epochs"

DATA:
  task_name: eurlex57k
  dataset_name: eurlex
  dataset_config_name: null
  max_seq_length: 512
  overwrite_output_dir: true
  overwrite_cache: false
  pad_to_max_length: true
  load_from_local: true
  max_train_samples: null
  max_eval_samples: null
  max_predict_samples: null
  # train_file: datasets/eurlex4.3k/train_hr.jsonl
  # train_file: datasets/eurlex4.3k/train.jsonl
  # validation_file: datasets/eurlex4.3k/test_unseen.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen.jsonl
  # validation_file: datasets/eurlex4.3k/test.jsonl
  # test_file: datasets/eurlex4.3k/test.jsonl
  train_file: datasets/eurlex4.3k/train_split1057.jsonl
  validation_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl
  test_file: datasets/eurlex4.3k/test_unseen_split1057.jsonl

  # validation_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  # test_file: datasets/eurlex4.3k/test_unseen_hr.jsonl
  label_max_seq_length: 96
  descriptions_file: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_nl.json
  # descriptions_file: datasets/eurlex4.3k/heir_descriptions_4.3k_v1.json
  # descriptions_file: datasets/eurlex4.3k/curie_descriptions_4.3k_v1.json
  all_labels: datasets/eurlex4.3k/all_labels.txt
  test_labels: datasets/eurlex4.3k/unseen_labels_split1057.txt
  # test_labels: datasets/eurlex4.3k/unseen_labels.txt

  max_descs_per_label: 5
  # contrastive_learning_samples: 1500
  # cl_min_positive_descs: 1
  # bm_short_file: datasets/eurlex4.3k/train_bmshort.txt

MODEL:
  model_name_or_path: bert-base-uncased
  config_name: null
  tokenizer_name: null
  cache_dir: null
  use_fast_tokenizer: true
  model_revision: main
  use_auth_token: false
  ignore_mismatched_sizes: false
  negative_sampling: "none"
  semsup: true
  label_model_name_or_path: prajjwal1/bert-small
  # label_model_name_or_path: bert-base-uncased
  # label_model_name_or_path: prajjwal1/bert-tiny
  encoder_model_type: bert
  use_custom_optimizer: adamw
  output_learning_rate: 1.e-4
  arch_type: 2
  add_label_name: false
  normalize_embeddings: false
  tie_weights: false
  coil: true
  # use_precomputed_embeddings: datasets/eurlex4.3k/heir_withdescriptions_4.3k_v1_embs_bert_9_96.npy
  token_dim: 16
  # num_frozen_layers: 9
  label_frozen_layers: 2

TRAINING:
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  per_device_eval_batch_size: 1
  learning_rate: 5.e-5 # Will point to input encoder lr, if use_custom_optimizer is False
  num_train_epochs: 10
  save_steps: 10000
  evaluation_strategy: steps
  eval_steps: 500
  fp16: true
  fp16_opt_level: O1
  lr_scheduler_type: "linear" # defaults to 'linear'
  dataloader_num_workers: 8
  label_names: [labels]
  scenario: "unseen_labels"

  ddp_find_unused_parameters: false