Commit c6a82125 authored by Max Björkander

Kept working on NGM, but now something is broken

parent 1c8ba7a8
%% Cell type:code id: tags:
``` python
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers.models.bart.modeling_bart import shift_tokens_right  # shift_tokens_right is defined in the BART modeling file, not BERT's
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm
import json
```
%% Cell type:code id: tags:
``` python
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
```
%% Output
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 636kB/s]
c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\huggingface_hub\file_download.py:123: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\maxbj\.cache\huggingface\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
warnings.warn(message)
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.9kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 572kB/s]
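%% Cell type:markdown id: tags:

A quick sanity check of the tokenizer (added for illustration, not part of the original notebook): `make_batch()` below pairs a question with a `[SUB]`/`[OBJ]` marker and an entity. Note that `[SUB]` is not a special token in `bert-base-uncased`, so it gets split into wordpieces unless registered via `tokenizer.add_special_tokens`; the question used here is purely illustrative.

%% Cell type:code id: tags:

``` python
# Illustrative example: encode a question paired with a marked subject entity.
enc = tokenizer("Who developed Skype?", "[SUB] Skype")
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# "[SUB]" is wordpiece-split (e.g. '[', 'sub', ']') since it is not in the vocab
```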
%% Cell type:code id: tags:
``` python
class NgmOne(nn.Module):
    def __init__(self):
        super(NgmOne, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Project the 768-dim pooled BERT embedding onto the 247 relation classes
        self.linear = nn.Linear(768, 247)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, tokenized_seq):
        # Pooled [CLS] representation of the tokenized question/entity sequence
        x = self.bert(tokenized_seq).pooler_output
        x = self.linear(x)
        x = self.softmax(x)
        return x
```
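%% Cell type:markdown id: tags:

A minimal smoke test for `NgmOne` (a sketch added here, not in the original notebook): one tokenized question should map to a single probability distribution over the 247 relation classes.

%% Cell type:code id: tags:

``` python
# Hypothetical smoke test: expect a (1, 247) probability tensor.
m = NgmOne()
ids = tokenizer("Who developed Skype?", return_tensors="pt")["input_ids"]
with torch.no_grad():
    probs = m(ids)
print(probs.shape)  # torch.Size([1, 247])
```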
%% Cell type:code id: tags:
``` python
# def encode(batch):
#     return tokenizer(batch, padding="max_length", max_length=256, return_tensors="pt")

# def convert_to_features(example_batch):
#     input_encodings = encode(example_batch['text'])
#     target_encodings = encode(example_batch['summary'])
#     labels = target_encodings['input_ids']
#     decoder_input_ids = shift_tokens_right(
#         labels, model.config.pad_token_id, model.config.decoder_start_token_id)
#     labels[labels[:, :] == model.config.pad_token_id] = -100
#     encodings = {
#         'input_ids': input_encodings['input_ids'],
#         'attention_mask': input_encodings['attention_mask'],
#         'decoder_input_ids': decoder_input_ids,
#         'labels': labels,
#     }
#     return encodings

# def get_dataset(path):
#     df = pd.read_csv(path, sep=",", on_bad_lines='skip')
#     dataset = datasets.Dataset.from_pandas(df)
#     dataset = dataset.map(convert_to_features, batched=True)
#     columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask', ]
#     dataset.set_format(type='torch', columns=columns)
#     return dataset
```
%% Cell type:code id: tags:
``` python
def make_batch():
    """Build one training batch from QALD-9.
    A triplet is a list of [subject entity, relation, object entity];
    positions holding a SPARQL variable are replaced with None."""
    # Load predicted data
    pred = "../data/qald-9-train-linked.json"
    # Load gold data
    gold = "../data/qald-9-train-linked.json"
    print("Beginning making batch")

    with open(pred, "r") as p, open(gold, "r") as g:
        pred = json.load(p)
        gold = json.load(g)

    inputs = []
    inputs_max_len = 0
    for d in tqdm(pred["questions"]):
        question = d["question"][0]["string"]
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        # Replace SPARQL variables (tokens starting with "?") with None
        for i, t in enumerate(triplet):
            if t.startswith("?"):
                triplet[i] = None

        # Pair the question with the grounded entity, prefixed by its role marker
        if triplet[0] is not None:
            tokenized_seq = tokenizer(question, "[SUB] " + triplet[0], padding=True, truncation=True)
        elif triplet[2] is not None:
            tokenized_seq = tokenizer(question, "[OBJ] " + triplet[2], padding=True, truncation=True)
        else:
            continue  # neither subject nor object is grounded

        if inputs_max_len < len(tokenized_seq["input_ids"]):
            inputs_max_len = len(tokenized_seq["input_ids"])
        inputs.append(tokenized_seq["input_ids"])

    correct_rels_max_len = 0
    correct_rels = []
    for d in tqdm(gold["questions"]):
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        tokenized = tokenizer(triplet[1], padding=True, truncation=True)
        if correct_rels_max_len < len(tokenized["input_ids"]):
            correct_rels_max_len = len(tokenized["input_ids"])
        correct_rels.append(tokenized["input_ids"])

    # Pad every sequence with 0 ([PAD]) up to the longest in the batch
    inputs_padded = np.array([i + [0] * (inputs_max_len - len(i)) for i in inputs])
    correct_rels_padded = np.array([i + [0] * (correct_rels_max_len - len(i)) for i in correct_rels])

    print("Finished with batches")
    # BERT indexes embeddings with int64, so return LongTensors rather than IntTensors
    return torch.LongTensor(inputs_padded), torch.LongTensor(correct_rels_padded)
```
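%% Cell type:markdown id: tags:

Note that `make_batch()` returns the gold relations as padded token-id sequences, while the `nn.CrossEntropyLoss` used in the training cell below expects one class index per example, matching the 247-way output of `NgmOne`. A hedged sketch of the missing mapping, using a hypothetical `rel_strings` list of raw gold relation strings:

%% Cell type:code id: tags:

``` python
# Sketch only: map each distinct gold relation string to a class index
# so the targets line up with the model's 247 output classes.
def relations_to_indices(rel_strings):
    rel2idx = {r: i for i, r in enumerate(sorted(set(rel_strings)))}
    return torch.LongTensor([rel2idx[r] for r in rel_strings]), rel2idx
```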
%% Cell type:code id: tags:
``` python
# training_args = Seq2SeqTrainingArguments(
#     output_dir='./models/blackbox',
#     num_train_epochs=1,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     warmup_steps=10,
#     weight_decay=0.01,
#     logging_dir='./logs',
# )
```
%% Cell type:code id: tags:
``` python
model = NgmOne()

EPOCHS = 3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train, corr_rels = make_batch()

for epoch in tqdm(range(EPOCHS)):
    optimizer.zero_grad()

    # Forward pass
    output = model(train)
    loss = criterion(output, corr_rels)
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    # Backward pass
    loss.backward()
    optimizer.step()
```
%% Output
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_utils.py:399, in load_state_dict(checkpoint_file)
398 try:
--> 399 return torch.load(checkpoint_file, map_location="cpu")
400 except Exception as e:
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\serialization.py:713, in load(f, map_location, pickle_module, **pickle_load_args)
712 return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
--> 713 return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\serialization.py:930, in _legacy_load(f, map_location, pickle_module, **pickle_load_args)
929 unpickler.persistent_load = persistent_load
--> 930 result = unpickler.load()
932 deserialized_storage_keys = pickle_module.load(f, **pickle_load_args)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\serialization.py:871, in _legacy_load.<locals>.persistent_load(saved_id)
870 if root_key not in deserialized_objects:
--> 871 obj = cast(Storage, torch._UntypedStorage(nbytes))
872 obj._torch_load_uninitialized = True
RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 93763584 bytes.
During handling of the above exception, another exception occurred:
MemoryError Traceback (most recent call last)
Cell In [69], line 1
----> 1 model = NgmOne()
3 EPOCHS = 3
4 criterion = nn.CrossEntropyLoss()
Cell In [64], line 5, in NgmOne.__init__(self)
3 super(NgmOne, self).__init__()
4 self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
----> 5 self.bert = BertModel.from_pretrained("bert-base-uncased")
6 self.linear = nn.Linear(768, 247)
7 self.softmax = nn.Softmax(dim=1)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_utils.py:2184, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
2181 if from_pt:
2182 if not is_sharded and state_dict is None:
2183 # Time to load the checkpoint
-> 2184 state_dict = load_state_dict(resolved_archive_file)
2186 # set dtype to instantiate the model under:
2187 # 1. If torch_dtype is not None, we use that dtype
2188 # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first
2189 # weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype
2190 # we also may have config.torch_dtype available, but we won't rely on it till v5
2191 dtype_orig = None
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_utils.py:403, in load_state_dict(checkpoint_file)
401 try:
402 with open(checkpoint_file) as f:
--> 403 if f.read().startswith("version"):
404 raise OSError(
405 "You seem to have cloned a repository without having git-lfs installed. Please install "
406 "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
407 "you cloned."
408 )
409 else:
MemoryError:
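%% Cell type:markdown id: tags:

The `MemoryError` above comes from `torch.load` materialising the full float32 BERT checkpoint in RAM while the model skeleton is also allocated. A hedged workaround sketch (assuming the installed `transformers` version supports it) is to let `from_pretrained` stream the weights instead of building a second full state dict:

%% Cell type:code id: tags:

``` python
# Assumption: the installed transformers version supports low_cpu_mem_usage.
bert = BertModel.from_pretrained("bert-base-uncased", low_cpu_mem_usage=True)
```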