diff --git a/Neural graph module/ngm.ipynb b/Neural graph module/ngm.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..bf75c0e5c224fa7785492429a288b518ec45bada
--- /dev/null
+++ b/Neural graph module/ngm.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from transformers import BertTokenizer, BertModel\n",
+    "# shift_tokens_right is defined in the BART module, not the BERT one\n",
+    "from transformers.models.bart.modeling_bart import shift_tokens_right\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class NgmOne(nn.Module):\n",
+    "    \"\"\"Score a question paired with a partial triplet using BERT.\"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super(NgmOne, self).__init__()\n",
+    "        self.tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n",
+    "        self.bert = BertModel.from_pretrained(\"bert-base-uncased\")\n",
+    "        self.linear = nn.Linear(768, 1)\n",
+    "        self.softmax = nn.Softmax(dim=1)\n",
+    "\n",
+    "    def forward(self, triplet, question):\n",
+    "        \"\"\"Triplet is a list [subject, relation, object]; entries may be None.\"\"\"\n",
+    "        if triplet[0] is not None:\n",
+    "            # question paired with the known subject entity\n",
+    "            tokenized_seq = self.tokenizer(question, \"[SUB] \" + triplet[0],\n",
+    "                                           padding=True, truncation=True,\n",
+    "                                           return_tensors=\"pt\")\n",
+    "        elif triplet[2] is not None:\n",
+    "            # question paired with the known object entity\n",
+    "            tokenized_seq = self.tokenizer(question, \"[OBJ] \" + triplet[2],\n",
+    "                                           padding=True, truncation=True,\n",
+    "                                           return_tensors=\"pt\")\n",
+    "        else:\n",
+    "            # previously fell through here with tokenized_seq unbound\n",
+    "            raise ValueError(\"triplet must contain a subject or an object\")\n",
+    "\n",
+    "        # BertModel returns a ModelOutput, not a tensor; take the pooled\n",
+    "        # [CLS] representation (batch, 768) before the linear scorer.\n",
+    "        x = self.bert(**tokenized_seq).pooler_output\n",
+    "        x = self.linear(x)\n",
+    "        x = self.softmax(x)\n",
+    "        return x\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def encode(batch):\n",
+    "    return tokenizer(batch, padding=\"max_length\", max_length=256, return_tensors=\"pt\")\n",
+    "\n",
+    "\n",
+    "def convert_to_features(example_batch):\n",
+    "    # NOTE(review): relies on a globally defined seq2seq `model` -- define it before mapping\n",
+    "    input_encodings = encode(example_batch['text'])\n",
+    "    target_encodings = encode(example_batch['summary'])\n",
+    "\n",
+    "    labels = target_encodings['input_ids']\n",
+    "    decoder_input_ids = shift_tokens_right(\n",
+    "        labels, model.config.pad_token_id, model.config.decoder_start_token_id)\n",
+    "    # mask padding only after building decoder inputs, so the loss ignores it\n",
+    "    labels[labels[:, :] == model.config.pad_token_id] = -100\n",
+    "\n",
+    "    encodings = {\n",
+    "        'input_ids': input_encodings['input_ids'],\n",
+    "        'attention_mask': input_encodings['attention_mask'],\n",
+    "        'decoder_input_ids': decoder_input_ids,\n",
+    "        'labels': labels,\n",
+    "    }\n",
+    "\n",
+    "    return encodings\n",
+    "\n",
+    "\n",
+    "def get_dataset(path):\n",
+    "    df = pd.read_csv(path, sep=\",\", on_bad_lines='skip')\n",
+    "    dataset = datasets.Dataset.from_pandas(df)\n",
+    "    dataset = dataset.map(convert_to_features, batched=True)\n",
+    "    columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask', ]\n",
+    "    dataset.set_format(type='torch', columns=columns)\n",
+    "    return dataset\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.11 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.9.11"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "64e7cd3b4b88defe39dd61a4584920400d6beb2615ab2244e340c2e20eecdfe9"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}