Commit e51c465a authored by Max Björkander

still on ngm

parent 028fdee9
%% Cell type:code id: tags:
``` python
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm
import json
```
%% Cell type:code id: tags:
``` python
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
```
%% Output
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 636kB/s]
c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\huggingface_hub\file_download.py:123: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\maxbj\.cache\huggingface\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
warnings.warn(message)
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.9kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 572kB/s]
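%% Cell type:markdown id: tags:
For reference, a quick look at what the tokenizer produces for the sentence-pair encoding used further down (illustrative only; the question and entity strings here are made up):
``` python
# Pair encoding: BERT inserts [CLS] at the start and [SEP] between and after the two texts.
enc = tokenizer("Who wrote Le Petit Prince?", "[SUB] dbr:Le_Petit_Prince")
print(enc["input_ids"])       # token ids, starting with [CLS] (101) and ending with [SEP] (102)
print(enc["token_type_ids"])  # 0 for the first segment, 1 for the second
print(enc["attention_mask"])  # all 1s here, since nothing is padded yet
```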
%% Cell type:code id: tags:
``` python
class NgmOne(nn.Module):
    def __init__(self):
        super(NgmOne, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.linear = nn.Linear(768, 247)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, tokenized_seq, tokenized_mask):
        x = self.bert.forward(tokenized_seq, attention_mask=tokenized_mask)
        # Take the final hidden state of the [CLS] token as the sequence representation
        x = x[0][:, 0, :]
        x = self.linear(x)
        x = self.softmax(x)
        return x
```
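%% Cell type:markdown id: tags:
One thing worth flagging in `NgmOne`: `forward` applies `nn.Softmax` to its output, but the training loop below uses `nn.CrossEntropyLoss`, which applies log-softmax internally and expects raw logits; feeding it probabilities squashes the gradients. A minimal sketch of the usual pattern (assuming the 247 outputs are relation classes):
``` python
class NgmOneLogits(nn.Module):
    """Same architecture, but returns raw logits for use with CrossEntropyLoss."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.linear = nn.Linear(768, 247)

    def forward(self, tokenized_seq, tokenized_mask):
        x = self.bert(tokenized_seq, attention_mask=tokenized_mask)
        x = x[0][:, 0, :]      # [CLS] representation
        return self.linear(x)  # logits; apply softmax only at inference time
```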
%% Cell type:code id: tags:
``` python
# def encode(batch):
#     return tokenizer(batch, padding="max_length", max_length=256, return_tensors="pt")

# def convert_to_features(example_batch):
#     input_encodings = encode(example_batch['text'])
#     target_encodings = encode(example_batch['summary'])

#     labels = target_encodings['input_ids']
#     decoder_input_ids = shift_tokens_right(
#         labels, model.config.pad_token_id, model.config.decoder_start_token_id)
#     labels[labels[:, :] == model.config.pad_token_id] = -100

#     encodings = {
#         'input_ids': input_encodings['input_ids'],
#         'attention_mask': input_encodings['attention_mask'],
#         'decoder_input_ids': decoder_input_ids,
#         'labels': labels,
#     }

#     return encodings

# def get_dataset(path):
#     df = pd.read_csv(path, sep=",", on_bad_lines='skip')
#     dataset = datasets.Dataset.from_pandas(df)
#     dataset = dataset.map(convert_to_features, batched=True)
#     columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask', ]
#     dataset.set_format(type='torch', columns=columns)
#     return dataset
```
%% Cell type:code id: tags:
``` python
def make_batch():
    """Triplet is a list of [subject entity, relation, object entity], None if not present"""

    # Load predicted data
    pred = "../data/qald-9-train-linked.json"

    # Load gold data
    gold = "../data/qald-9-train-linked.json"
    print("Beginning making batch")

    with open(pred, "r") as p, open(gold, "r") as g:
        pred = json.load(p)
        gold = json.load(g)

    inputs = []
    inputs_max_len = 0
    for d in tqdm(pred["questions"]):
        question = d["question"][0]["string"]
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        # Replace SPARQL variables (terms starting with "?") with None
        for t in triplet:
            if t.startswith("?"):
                triplet[triplet.index(t)] = None

        #seq = "[CLS] " + question + " [SEP] "
        if triplet[0] is not None:
            #seq += "[SUB] [SEP] " + triplet[0]
            # , padding=True, truncation=True)
            tokenized_seq = tokenizer(question, "[SUB]", triplet[0], padding=True, truncation=True)
        elif triplet[2] is not None:
            #seq += "[OBJ] [SEP] " + triplet[2]
            tokenized_seq = tokenizer(question, "[OBJ]", triplet[2], padding=True, truncation=True)

        if inputs_max_len < len(tokenized_seq["input_ids"]):
            inputs_max_len = len(tokenized_seq["input_ids"])
        inputs.append(list(tokenized_seq.values())[0])  # input_ids

    correct_rels_max_len = 0
    correct_rels = []
    for d in tqdm(gold["questions"]):
        question = d["question"][0]["string"]
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        tokenized = tokenizer(triplet[1], padding=True, truncation=True)
        if correct_rels_max_len < len(tokenized["input_ids"]):
            correct_rels_max_len = len(tokenized["input_ids"])
        correct_rels.append(list(tokenized.values())[0])  # input_ids

    # Pad every sequence with zeros to the batch maximum and build attention masks
    inputs_padded = np.array([i + [0]*(inputs_max_len-len(i)) for i in inputs])
    correct_rels_padded = np.array([i + [0]*(correct_rels_max_len-len(i)) for i in correct_rels])
    inputs_attention_mask = np.where(inputs_padded != 0, 1, 0)
    correct_rels_attention_mask = np.where(correct_rels_padded != 0, 1, 0)

    print("Finished with batches")
    return torch.IntTensor(inputs_padded), torch.IntTensor(inputs_attention_mask), \
        torch.IntTensor(correct_rels_padded), torch.IntTensor(correct_rels_attention_mask)
```
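%% Cell type:markdown id: tags:
A side note on the padding above: `padding=True` has no effect when the tokenizer is called on a single example (it pads to the longest sequence in the batch, and a batch of one is already its own maximum), which is why the zero-padding and attention masks are built by hand afterwards. The tokenizer can do both in one call when given the whole batch as lists; a sketch (assuming hypothetical parallel lists `questions` and `entities` collected from the dataset first):
``` python
batch = tokenizer(questions, entities, padding=True, truncation=True, return_tensors="pt")
input_ids = batch["input_ids"]            # already zero-padded to the batch maximum
attention_mask = batch["attention_mask"]  # 1 for real tokens, 0 for padding
```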
%% Cell type:code id: tags:
``` python
# training_args = Seq2SeqTrainingArguments(
#     output_dir='./models/blackbox',
#     num_train_epochs=1,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     warmup_steps=10,
#     weight_decay=0.01,
#     logging_dir='./logs',
# )
```
%% Cell type:code id: tags:
``` python
model = NgmOne()

EPOCHS = 3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train, train_mask, corr_rels, correct_rels_mask = make_batch()

for epoch in tqdm(range(EPOCHS)):
    optimizer.zero_grad()

    # Forward pass
    output = model(train, train_mask)
    loss = criterion(output, corr_rels)

    # Note: with EPOCHS = 3 this print never triggers
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    # Backward pass
    loss.backward()
    optimizer.step()
```
%% Output
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Beginning making batch
100%|██████████| 408/408 [00:00<00:00, 688.03it/s]
100%|██████████| 408/408 [00:00<00:00, 2241.79it/s]
Finished with batches
0%| | 0/3 [03:03<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In [26], line 13
     11 # Forward pass
     12 output = model(train, train_mask)
---> 13 loss = criterion(output, corr_rels)
     15 if (epoch + 1) % 10 == 0:
     16     print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\modules\loss.py:1164, in CrossEntropyLoss.forward(self, input, target)
   1163 def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1164     return F.cross_entropy(input, target, weight=self.weight,
   1165                            ignore_index=self.ignore_index, reduction=self.reduction,
   1166                            label_smoothing=self.label_smoothing)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\functional.py:3014, in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   3012 if size_average is not None or reduce is not None:
   3013     reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3014 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: 0D or 1D target tensor expected, multi-target not supported
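%% Cell type:markdown id: tags:
The `RuntimeError` is about the shape of the targets: `nn.CrossEntropyLoss` expects a 1D tensor of class indices (one integer per example), but `corr_rels` here is a 2D tensor of padded token ids, one row per relation. Since the classifier head has 247 outputs, the natural fix is to map each gold relation string to an index in a 247-entry relation vocabulary and use those indices as targets. A minimal sketch, assuming a hypothetical `gold_relations` list of relation strings extracted as in `make_batch`:
``` python
# Hypothetical: build a relation-to-index vocabulary and integer targets.
rel2idx = {rel: i for i, rel in enumerate(sorted(set(gold_relations)))}
targets = torch.LongTensor([rel2idx[rel] for rel in gold_relations])  # shape: (N,)

# CrossEntropyLoss then works on (N, 247) logits and (N,) integer targets.
loss = criterion(output, targets)
```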