Began work on the neural graph search module

d0958432 · Max Björkander · 0426cd7d · d0958432
Commit d0958432 authored 2 years ago by Max Björkander
--- a/Neural graph module/ngm.ipynb
+++ b/Neural graph module/ngm.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from transformers import BertTokenizer, BertModel\n",
+    "from transformers.models.bert.modeling_bert import shift_tokens_right\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "class NgmOne(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(NgmOne, self).__init__()\n",
+    "        self.tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n",
+    "        self.bert = BertModel.from_pretrained(\"bert-base-uncased\")\n",
+    "        self.linear = nn.Linear(768, 1)\n",
+    "        self.softmax = nn.Softmax(dim=1)\n",
+    "    \n",
+    "    def forward(self, triplet, question):\n",
+    "        \"\"\"Triplet is a list of subject entity, relation, object entity, None if not present\"\"\"\n",
+    "        \n",
+    "        #seq = \"[CLS] \" + question + \" [SEP] \"\n",
+    "        if triplet[0] is not None:\n",
+    "            #seq += \"[SUB] [SEP] \" + triplet[0]\n",
+    "            tokenized_seq = self.tokenizer(question, \"[SUB]\", triplet[0])#, padding=True, truncation=True)\n",
+    "        elif triplet[2] is not None:\n",
+    "            #seq += \"[OBJ] [SEP] \" + triplet[2]\n",
+    "            tokenized_seq = self.tokenizer(question, \"[OBJ]\", triplet[2])#, padding=True, truncation=True)\n",
+    "        \n",
+    "        x = self.bert.forward(**tokenized_seq)\n",
+    "        x = self.linear(x)\n",
+    "        \n",
+    "        x = self.softmax(x)\n",
+    "        return x\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def encode(batch):\n",
+    "  return tokenizer(batch, padding=\"max_length\", max_length=256, return_tensors=\"pt\")\n",
+    "\n",
+    "\n",
+    "def convert_to_features(example_batch):\n",
+    "    input_encodings = encode(example_batch['text'])\n",
+    "    target_encodings = encode(example_batch['summary'])\n",
+    "\n",
+    "    labels = target_encodings['input_ids']\n",
+    "    decoder_input_ids = shift_tokens_right(\n",
+    "        labels, model.config.pad_token_id, model.config.decoder_start_token_id)\n",
+    "    labels[labels[:, :] == model.config.pad_token_id] = -100\n",
+    "\n",
+    "    encodings = {\n",
+    "        'input_ids': input_encodings['input_ids'],\n",
+    "        'attention_mask': input_encodings['attention_mask'],\n",
+    "        'decoder_input_ids': decoder_input_ids,\n",
+    "        'labels': labels,\n",
+    "    }\n",
+    "\n",
+    "    return encodings\n",
+    "\n",
+    "\n",
+    "def get_dataset(path):\n",
+    "  df = pd.read_csv(path, sep=\",\", on_bad_lines='skip')\n",
+    "  dataset = datasets.Dataset.from_pandas(df)\n",
+    "  dataset = dataset.map(convert_to_features, batched=True)\n",
+    "  columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask', ]\n",
+    "  dataset.set_format(type='torch', columns=columns)\n",
+    "  return dataset\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.11 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.9.11"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "64e7cd3b4b88defe39dd61a4584920400d6beb2615ab2244e340c2e20eecdfe9"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:code id: tags:
+
+``` python
+import datasets
+import torch
+import torch.nn as nn
+import pandas as pd
+import numpy as np
+from transformers import BertTokenizer, BertModel
+from transformers.models.bert.modeling_bert import shift_tokens_right
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Module-level BERT tokenizer (pretrained "bert-base-uncased" vocabulary);
+# encode() in a later cell reads this global.
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+```
+
+%% Cell type:code id: tags:
+
+``` python
+
+class NgmOne(nn.Module):
+    def __init__(self):
+        super(NgmOne, self).__init__()
+        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        self.bert = BertModel.from_pretrained("bert-base-uncased")
+        self.linear = nn.Linear(768, 1)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, triplet, question):
+        """Triplet is a list of subject entity, relation, object entity, None if not present"""
+
+        #seq = "[CLS] " + question + " [SEP] "
+        if triplet[0] is not None:
+            #seq += "[SUB] [SEP] " + triplet[0]
+            tokenized_seq = self.tokenizer(question, "[SUB]", triplet[0])#, padding=True, truncation=True)
+        elif triplet[2] is not None:
+            #seq += "[OBJ] [SEP] " + triplet[2]
+            tokenized_seq = self.tokenizer(question, "[OBJ]", triplet[2])#, padding=True, truncation=True)
+
+        x = self.bert.forward(**tokenized_seq)
+        x = self.linear(x)
+
+        x = self.softmax(x)
+        return x
+
+
+
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def encode(batch):
+  return tokenizer(batch, padding="max_length", max_length=256, return_tensors="pt")
+
+
+def convert_to_features(example_batch):
+    input_encodings = encode(example_batch['text'])
+    target_encodings = encode(example_batch['summary'])
+
+    labels = target_encodings['input_ids']
+    decoder_input_ids = shift_tokens_right(
+        labels, model.config.pad_token_id, model.config.decoder_start_token_id)
+    labels[labels[:, :] == model.config.pad_token_id] = -100
+
+    encodings = {
+        'input_ids': input_encodings['input_ids'],
+        'attention_mask': input_encodings['attention_mask'],
+        'decoder_input_ids': decoder_input_ids,
+        'labels': labels,
+    }
+
+    return encodings
+
+
+def get_dataset(path):
+  df = pd.read_csv(path, sep=",", on_bad_lines='skip')
+  dataset = datasets.Dataset.from_pandas(df)
+  dataset = dataset.map(convert_to_features, batched=True)
+  columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask', ]
+  dataset.set_format(type='torch', columns=columns)
+  return dataset
+```