diff --git a/final_project.ipynb b/final_project.ipynb index 3598b50cc6615ad11b38531eadb5bcceb28e1cbf..21ca5541fe1673a3483a1a369e12d5a207bd1f62 100644 --- a/final_project.ipynb +++ b/final_project.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 40, "metadata": { "id": "1LGzWEOnVyAr" }, @@ -33,20 +33,21 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 41, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IjZFoilkuix1", - "outputId": "ac122e04-10de-4338-9ea3-0395bbdc39c1" + "outputId": "68aa4eb9-ca40-45e0-d95e-ee9886a6c5c9" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ - "[nltk_data] Downloading package vader_lexicon to /root/nltk_data...\n" + "[nltk_data] Downloading package vader_lexicon to /root/nltk_data...\n", + "[nltk_data] Package vader_lexicon is already up-to-date!\n" ] } ], @@ -79,13 +80,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 42, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WnjtR3s3uix4", - "outputId": "6bb795ab-d9b5-4398-faca-edd03ec2ff3f" + "outputId": "dbb3634e-6907-4480-97d2-15abcf0c3f23" }, "outputs": [ { @@ -93,27 +94,21 @@ "name": "stdout", "text": [ "(50000, 2)\n", - "(13274, 2)\n" + "(13274, 2)\n", + "51.94364923911405\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ - "<ipython-input-3-ba7acf40090b>:9: SettingWithCopyWarning: \n", + "<ipython-input-42-26ec676a7505>:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['sentiment'] = df['sentiment'].map(label_map)\n" ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "51.94364923911405\n" - ] } ], "source": [ @@ -121,6 +116,7 @@ "import pandas as pd\n", "\n", "df = pd.read_csv('imdb.csv')\n", + "df = df.sample(frac=1, random_state=12).reset_index(drop=True)\n", "print(df.shape)\n", "df = df[df['review'].str.split().apply(len) <= 128]\n", "print(df.shape)\n", @@ -129,6 +125,50 @@ "print((df[df['sentiment']==1].count()[0]/len(df))*100)" ] }, + { + "cell_type": "code", + "source": [ + "train = df[0:int(len(df)*0.70)]\n", + "print(len(train))\n", + "print((train[train['sentiment']==1].count()[0]))\n", + "print((train[train['sentiment']==0].count()[0]))\n", + "\n", + "valid = df[len(train):int(len(df)*0.85)]\n", + "print(len(valid))\n", + "print((valid[valid['sentiment']==1].count()[0]))\n", + "print((valid[valid['sentiment']==0].count()[0]))\n", + "\n", + "test = df[len(train)+len(valid):]\n", + "print(len(test))\n", + "print((test[test['sentiment']==1].count()[0]))\n", + "print((test[test['sentiment']==0].count()[0]))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HqnvpSwgm1or", + "outputId": "775b5b27-75de-411c-9501-ee85a5b5e533" + }, + "execution_count": 43, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "9291\n", + "4826\n", + "4465\n", + "1991\n", + "1035\n", + "956\n", + "1992\n", + "1034\n", + "958\n" + ] + } + ] + }, { "cell_type": "markdown", "metadata": { @@ -140,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 44, "metadata": { "id": "ufyC5VtJuix6" }, @@ -187,20 +227,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 45, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4xZ3x_MQvYuK", - "outputId": "cdbbfac1-ab2b-461c-d6c7-f1674978bea9" + "outputId": "6cd3b43c-4c7a-4b44-9ae2-f373a953cb04" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ - "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", @@ -228,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 46, "metadata": { "id": "9DCuYESsuix8" }, @@ -268,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 47, "metadata": { "id": "FvvIUO4iuix8" }, @@ -283,8 +323,9 @@ "from sklearn.metrics import *\n", "\n", "def train_bert(n_epochs=1, batch_size=32):\n", - " train = df[0:int(len(df)*0.75)]\n", - " test = df[int(len(df)*0.75):]\n", + " train = df[0:int(len(df)*0.70)]\n", + " valid = df[len(train):int(len(df)*0.85)]\n", + " test = df[len(train)+len(valid):]\n", " \n", " train = tensorize(train)\n", " test = tensorize(test)\n", @@ -364,13 +405,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 48, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mH9OCuh9uix9", - "outputId": "d755a07a-e0d4-4c17-c8b8-97f766be7523", + "outputId": "fad5eb9a-4405-415b-945d-c347a1f46a1c", "scrolled": true }, "outputs": [ @@ -378,13 +419,13 @@ "output_type": "stream", "name": "stderr", "text": [ - "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']\n", "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "Epoch 1: : 9984it [03:37, 45.84it/s, loss=1.05]\n", - "test: : 3328it [00:24, 137.33it/s, accuracy=87.4, f1_score=88.4, precision=83.9, recall=93.4]" + "Epoch 1: : 9312it [03:14, 47.85it/s, loss=0.471]\n", + "test: : 2016it [00:13, 146.56it/s, accuracy=88, f1_score=88.2, precision=89.6, recall=86.8]" ] }, { @@ -392,8 +433,8 @@ "name": "stdout", "text": [ "\n", - "[[1308 305]\n", - " [ 113 1593]], CONFUSION MATRIX\n" + "[[854 104]\n", + " [136 898]], CONFUSION MATRIX\n" ] }, { @@ -419,13 +460,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 49, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JgSc5kd-6XrK", - "outputId": "6d1e14eb-f3dc-4f3c-e3f9-1bd6dfa6484e" + "outputId": "75fd7f5c-e7f7-43b1-876d-21021bb91afc" }, "outputs": [ { @@ -433,25 +474,26 @@ "name": "stdout", "text": [ "--------Train-----------\n", - "0.7293822199899548, acc\n", - "0.6845141251294187, prec\n", - "0.8918866833686645, recall\n", - "0.7745606694560669, f1\n", - "[[2633 2133]\n", - " [ 561 4628]], conf matrix\n", + "0.7281239909589926, acc\n", + "0.6832377310388783, prec\n", + "0.888520513883133, recall\n", + "0.7724734282111332, f1\n", + "[[2477 1988]\n", + " [ 538 4288]], conf matrix\n", "--------Test-----------\n", - "0.724013257005122, acc\n", - "0.6758682101513802, prec\n", - "0.8898007033997656, recall\n", - "0.7682186234817814, f1\n", - "[[ 885 728]\n", - " [ 188 1518]], conf matrix\n" + "0.7419678714859438, acc\n", + "0.6914580265095729, prec\n", + "0.9081237911025145, recall\n", + "0.7851170568561873, f1\n", + "[[539 419]\n", + " [ 95 939]], conf matrix\n" ] } ], "source": [ - "train = df[0:int(len(df)*0.75)]\n", - "test = df[len(train):]\n", + "train = df[0:int(len(df)*0.70)]\n", + "valid = df[len(train):int(len(df)*0.85)]\n", + "test = df[len(train)+len(valid):]\n", "\n", "vader_tr = preVader(train)\n", "vad_pos = vader_tr[:, 2]\n", @@ -508,7 +550,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 50, "metadata": { "id": "bPwQouzSuix7" }, @@ -533,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 51, "metadata": { "id": "pCJ_D43Cuix9" }, @@ -546,8 +588,9 @@ "import numpy as np\n", "\n", "def train_mlp(bert, n_epochs=1, batch_size=32, learning_rate=1e-5):\n", - " train = df[0:int(len(df)*0.75)]\n", - " test = df[len(train):]\n", + " train = df[0:int(len(df)*0.70)]\n", + " valid = df[len(train):int(len(df)*0.85)]\n", + " test = df[len(train)+len(valid):]\n", "\n", " vader_tr = preVader(train).to(device)\n", " vad_pos = vader_tr[:, 2]\n", @@ -641,27 +684,27 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 52, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "922Umg2cuix9", - "outputId": "5d58b8a6-7cff-4aa9-d826-445f04d7a3e8" + "outputId": "a0720fdb-1aff-413e-f73f-7f3889985bb4" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ - "<ipython-input-14-36585a68d363>:14: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", + "<ipython-input-51-9ed055f35dd1>:15: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", " vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)))\n", - "<ipython-input-14-36585a68d363>:19: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", + "<ipython-input-51-9ed055f35dd1>:20: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", " vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)))\n", "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/lazy.py:180: UserWarning: Lazy modules are a new feature under heavy development so changes to the API or functionality can happen at any moment.\n", " warnings.warn('Lazy modules are a new feature under heavy development '\n", - "Epoch 1: : 9984it [03:23, 49.17it/s, loss=0.552] \n", - "test: : 3328it [00:24, 136.86it/s, accuracy=88.5, f1_score=89, precision=87.7, recall=90.3]" + "Epoch 1: : 9312it [03:00, 51.54it/s, loss=0.321]\n", + "test: : 2016it [00:13, 146.83it/s, accuracy=88.2, f1_score=88.7, precision=88.1, recall=89.2]" ] }, { @@ -669,8 +712,8 @@ "name": "stdout", "text": [ "\n", - "[[1398 215]\n", - " [ 166 1540]], CONFUSION MATRIX\n" + "[[834 124]\n", + " [112 922]], CONFUSION MATRIX\n" ] }, { @@ -693,12 +736,12 @@ "source": [ "## Method 2 - Weights\n", "\n", - "In method two, the respective outputs from BERT and VADER are assigned a weight and then added together. To try different weights, we simply implement a loop. Results are thus not optimal, but show an approximation of the relationship between the models." + "In method two, the respective outputs from BERT and VADER are assigned a weight and then added together. First we divide the previous testset into 50% validation and 50% test. To try different weights, we simply implement a loop. We choose the weight with the highest accuracy on the validation data. Results are then tested on the testset. Results are thus not optimal, but show an approximation of the relationship between the models." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 53, "metadata": { "id": "iaVMMo2fuUW3" }, @@ -710,18 +753,19 @@ "from sklearn.metrics import confusion_matrix\n", "import numpy as np\n", "\n", - "def train_weights(bert, w, n_epochs=1, batch_size=32):\n", - " train = df[0:int(len(df)*0.75)]\n", - " test = df[len(train):]\n", - " vader = preVader(test).to(device)\n", + "def valid_weights(bert, w, n_epochs=1, batch_size=32):\n", + "\n", + " train = df[0:int(len(df)*0.70)]\n", + " valid = df[len(train):int(len(df)*0.85)]\n", " \n", + " vader = preVader(valid).to(device) \n", " train = tensorize(train)\n", - " test = tensorize(test)\n", + " valid = tensorize(valid)\n", "\n", " y,p = [], [] # init list of actuals and predictions\n", - " with tqdm(total=len(list(test))) as pbar:\n", - " pbar.set_description(f'test')\n", - " for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): \n", + " with tqdm(total=len(list(valid))) as pbar:\n", + " pbar.set_description(f'validation')\n", + " for index, batch in enumerate(DataLoader(valid, batch_size, shuffle=False)): \n", " \n", " b_ids = batch[0].to(device)\n", " b_mask = batch[1].to(device)\n", @@ -755,208 +799,95 @@ " pbar.set_postfix(accuracy=float(acc)*100,\n", " precision=float(prec)*100,\n", " recall=float(rec)*100,\n", - " f1_score=float(f1)*100)\n", - " print(f\"\\n{cm}, CONFUSION MATRIX\")\n", - " return ['Accuracy',acc*100, 'Precision', prec*100,'recall', rec*100,'F1-score', f1, 'CM', cm]" + " f1_score=float(f1)*100,\n", + " weight_vader=w,\n", + " weight_bert=1-w)\n", + " return acc" ] }, { "cell_type": "code", - "execution_count": 20, + "source": [ + "import torch.nn.functional as F\n", + "import torch.optim as optim\n", + "from tqdm import tqdm\n", + "from sklearn.metrics import confusion_matrix\n", + "import numpy as np\n", + "\n", + "def test_weights(bert, w, n_epochs=1, batch_size=32):\n", + "\n", + " train = df[0:int(len(df)*0.70)]\n", + " valid = df[len(train):int(len(df)*0.85)]\n", + " test = df[len(train)+len(valid):]\n", + " \n", + " vader = preVader(test).to(device)\n", + " train = tensorize(train)\n", + " test = tensorize(test)\n", + "\n", + " y,p = [], [] # init list of actuals and predictions\n", + " with tqdm(total=len(list(test))) as pbar:\n", + " pbar.set_description(f'test')\n", + " for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): \n", + " \n", + " b_ids = batch[0].to(device)\n", + " b_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + " \n", + " outputs = bert(input_ids=b_ids, attention_mask=b_mask,\n", + " labels=b_labels)\n", + " \n", + " vad_pos = vader[batch_size*index:batch_size*index+len(b_ids), 2]\n", + " vad_neg = vader[batch_size*index:batch_size*index+len(b_ids), 0]\n", + "\n", + " pred_vad = F.softmax(torch.column_stack((vad_neg, vad_pos)), dim=0)\n", + " pred_bert = F.softmax(outputs.logits, dim=0)\n", + "\n", + " pred = torch.argmax(torch.add(w*pred_vad, (1-w)*pred_bert), dim=1)\n", + "\n", + " y.extend(b_labels.cpu().detach().numpy())\n", + " p.extend(pred.cpu().detach().numpy())\n", + "\n", + " # Update diagnostics\n", + " pbar.update(batch_size)\n", + "\n", + " # using sklearn to compute performance metrics\n", + " cm = confusion_matrix(y, p)\n", + " acc = accuracy_score(y, p)\n", + " prec = precision_score(y, p)\n", + " rec = recall_score(y, p)\n", + " f1 = f1_score(y, p)\n", + " \n", + " # update status bar with metrics\n", + " pbar.set_postfix(accuracy=float(acc)*100,\n", + " precision=float(prec)*100,\n", + " recall=float(rec)*100,\n", + " f1_score=float(f1)*100,\n", + " weight_vader=w,\n", + " weight_bert=1-w)\n", + " print(f\"\\n{cm}, CONFUSION MATRIX\")" + ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ZdeslAvbU7bh", - "outputId": "097ecc22-e45c-4380-af56-72151b87d4b4" + "id": "65fCqc0Yff4e" }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 135.77it/s, accuracy=87.9, f1_score=88, precision=89.5, recall=86.6]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1439 174]\n", - " [ 229 1477]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 133.85it/s, accuracy=87.9, f1_score=88.1, precision=89.5, recall=86.7]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1439 174]\n", - " [ 227 1479]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 137.30it/s, accuracy=88.1, f1_score=88.3, precision=89.7, recall=86.9]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1442 171]\n", - " [ 223 1483]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 135.35it/s, accuracy=88, f1_score=88.1, precision=89.6, recall=86.7]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1441 172]\n", - " [ 227 1479]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 135.73it/s, accuracy=88, f1_score=88.2, precision=89.7, recall=86.7]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1443 170]\n", - " [ 227 1479]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 135.75it/s, accuracy=88.1, f1_score=88.2, precision=89.8, recall=86.8]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1444 169]\n", - " [ 226 1480]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 135.45it/s, accuracy=88.3, f1_score=88.4, precision=89.9, recall=87]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1447 166]\n", - " [ 222 1484]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 136.01it/s, accuracy=88.2, f1_score=88.4, precision=89.8, recall=87]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1444 169]\n", - " [ 221 1485]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 135.97it/s, accuracy=88, f1_score=88.1, precision=89.7, recall=86.6]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1444 169]\n", - " [ 229 1477]], CONFUSION MATRIX\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "test: : 3328it [00:24, 135.17it/s, accuracy=87.5, f1_score=87.6, precision=88.9, recall=86.3]" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "[[1430 183]\n", - " [ 233 1473]], CONFUSION MATRIX\n", - "{('Vader', 0, 'bert', 100): ['Accuracy', 87.8577884905092, 'Precision', 89.46093276801939, 'recall', 86.5767878077374, 'F1-score', 0.8799523383973785, 'CM', array([[1439, 174],\n", - " [ 229, 1477]])], ('Vader', 10, 'bert', 90): ['Accuracy', 87.9180476047002, 'Precision', 89.47368421052632, 'recall', 86.69402110199297, 'F1-score', 0.8806192319142602, 'CM', array([[1439, 174],\n", - " [ 227, 1479]])], ('Vader', 20, 'bert', 80): ['Accuracy', 88.12895450436878, 'Precision', 89.66142684401451, 'recall', 86.92848769050411, 'F1-score', 0.8827380952380953, 'CM', array([[1442, 171],\n", - " [ 223, 1483]])], ('Vader', 30, 'bert', 70): ['Accuracy', 87.97830671889123, 'Precision', 89.58207147183526, 'recall', 86.69402110199297, 'F1-score', 0.8811438784629134, 'CM', array([[1441, 172],\n", - " [ 227, 1479]])], ('Vader', 40, 'bert', 60): ['Accuracy', 88.03856583308225, 'Precision', 89.69072164948454, 'recall', 86.69402110199297, 'F1-score', 0.8816691505216095, 'CM', array([[1443, 170],\n", - " [ 227, 1479]])], ('Vader', 50, 'bert', 50): ['Accuracy', 88.09882494727329, 'Precision', 89.7513644633111, 'recall', 86.75263774912075, 'F1-score', 0.8822652757078987, 'CM', array([[1444, 169],\n", - " [ 226, 1480]])], ('Vader', 60, 'bert', 39): ['Accuracy', 88.30973184694186, 'Precision', 89.93939393939394, 'recall', 86.98710433763188, 'F1-score', 0.8843861740166865, 'CM', array([[1447, 166],\n", - " [ 222, 1484]])], ('Vader', 70, 'bert', 29): ['Accuracy', 88.24947273275083, 'Precision', 89.78234582829504, 'recall', 87.04572098475967, 'F1-score', 0.8839285714285713, 'CM', array([[1444, 169],\n", - " [ 221, 1485]])], ('Vader', 80, 'bert', 19): ['Accuracy', 88.00843627598674, 'Precision', 89.73268529769138, 'recall', 86.5767878077374, 'F1-score', 0.8812649164677805, 'CM', array([[1444, 169],\n", - " [ 229, 1477]])], ('Vader', 90, 'bert', 9): ['Accuracy', 87.46610424826756, 'Precision', 88.94927536231883, 'recall', 86.34232121922626, 'F1-score', 0.8762641284949435, 'CM', array([[1430, 183],\n", - " [ 233, 1473]])]}\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\n" - ] - } - ], + "execution_count": 54, + "outputs": [] + }, + { + "cell_type": "markdown", "source": [ - "acc_list = {}\n", + "acc_max = 0\n", + "w_opt = 0\n", "for w in np.arange(0, 1, 0.1):\n", - " acc = train_weights(bert, w, n_epochs=1, batch_size=32)\n", - " acc_list['Vader', int(w*100), 'bert', int((1-w)*100)] = acc\n", - "print(acc_list)" - ] + " acc = valid_weights(bert, w, n_epochs=1, batch_size=32)\n", + " if acc > acc_max:\n", + " acc_max = acc\n", + " w_opt = w\n", + "\n", + "test_weights(bert, w_opt, n_epochs=1, batch_size=32)" + ], + "metadata": { + "id": "ZdeslAvbU7bh" + } } ], "metadata": { diff --git a/old implementations/final_project_old.ipynb b/old implementations/final_project_old.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3598b50cc6615ad11b38531eadb5bcceb28e1cbf --- /dev/null +++ b/old implementations/final_project_old.ipynb @@ -0,0 +1,987 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "psYARLpkfBjF" + }, + "source": [ + "# Project TDDE09\n", + "## Hugo Bjork || Jakob Berggren || Martin Forsberg" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FwiPZum4uix1" + }, + "source": [ + "For this project we will need to run it on the GPU to optimize speed." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "1LGzWEOnVyAr" + }, + "outputs": [], + "source": [ + "import torch\n", + "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IjZFoilkuix1", + "outputId": "ac122e04-10de-4338-9ea3-0395bbdc39c1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package vader_lexicon to /root/nltk_data...\n" + ] + } + ], + "source": [ + "#!pip install nltk\n", + "import nltk\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", + "\n", + "nltk.download('vader_lexicon')\n", + "sid = SentimentIntensityAnalyzer()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wXTrlu1-fIri" + }, + "source": [ + "## The data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GYP7pMf1uix4" + }, + "source": [ + "The data used fo this project is movie reviews from Imdb. The data set consists of 50 000 reviews labeled positive or negative. Our first course of action was to slim the data set down to only the revies with less than 128 words in them. This is done in order to be able to train the model within a resonable timeframe and avoid needing to chop up the reviews into chunks due to BERTs max length of 512.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WnjtR3s3uix4", + "outputId": "6bb795ab-d9b5-4398-faca-edd03ec2ff3f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(50000, 2)\n", + "(13274, 2)\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "<ipython-input-3-ba7acf40090b>:9: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df['sentiment'] = df['sentiment'].map(label_map)\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "51.94364923911405\n" + ] + } + ], + "source": [ + "#!pip install pandas\n", + "import pandas as pd\n", + "\n", + "df = pd.read_csv('imdb.csv')\n", + "print(df.shape)\n", + "df = df[df['review'].str.split().apply(len) <= 128]\n", + "print(df.shape)\n", + "label_map = {'positive': 1, 'negative': 0}\n", + "df['sentiment'] = df['sentiment'].map(label_map)\n", + "print((df[df['sentiment']==1].count()[0]/len(df))*100)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mjMyfvzLuix5" + }, + "source": [ + "The data is loaded and preproccessed into a smaller set of max review length of 128." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "ufyC5VtJuix6" + }, + "outputs": [], + "source": [ + "def preVader(data):\n", + " vadScore = [sid.polarity_scores(a) for a in data.review]\n", + " columns = list(vadScore[0].keys())\n", + " tens = torch.empty(len(vadScore), len(columns)).to(device)\n", + "\n", + " # All reviews for traning where columns is neg&neu&pos&comp\n", + " for i, score in enumerate(vadScore):\n", + " for j, key in enumerate(columns):\n", + " tens[i, j] = score[key]\n", + " return tens" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CyAo0Vv_uix6" + }, + "source": [ + "## Baseline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jC8OwgS8uix6" + }, + "source": [ + "Our first task is to create our baseline by fine tuning the BERT model to our IMDB data set." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4IuPZ7c1uix7" + }, + "source": [ + "We need two classes from the Transformers library:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4xZ3x_MQvYuK", + "outputId": "cdbbfac1-ab2b-461c-d6c7-f1674978bea9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], + "source": [ + "#!pip install transformers\n", + "from transformers import BertTokenizer, BertForSequenceClassification\n", + "\n", + "#Instantiating both classes with the pre-trained bert-base-uncased model.\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qbJDQlxmuix7" + }, + "source": [ + "In the tensorize function the data is preproccessed to fit the bert modle requirements by translating the reviews to token ids, masking the padding tokens and finaly a tensor with the labels correspinding to each reviews. These are returned by a TensorDataset so it can easily be split by a dataloader." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "9DCuYESsuix8" + }, + "outputs": [], + "source": [ + "from torch.utils.data import TensorDataset\n", + "\n", + "MAX_LENGTH = 130\n", + "\n", + "def tensorize(reviews):\n", + " input_ids = []\n", + " labels = []\n", + " attention_masks = []\n", + " for index, rev in reviews.iterrows():\n", + " encoded = tokenizer.encode_plus(\n", + " rev[0].split(), \n", + " add_special_tokens=True, \n", + " max_length=MAX_LENGTH,\n", + " padding='max_length',\n", + " return_attention_mask=True,\n", + " return_tensors='pt',\n", + " )\n", + " input_ids.append(encoded['input_ids'])\n", + " attention_masks.append(encoded['attention_mask'])\n", + " labels.append(rev[1])\n", + " return TensorDataset(torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4GlHNdHPuix8" + }, + "source": [ + "Below we train our baseline model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "FvvIUO4iuix8" + }, + "outputs": [], + "source": [ + "#!pip install scikit-learn\n", + "#!pip install tqdm\n", + "from torch.utils.data import DataLoader\n", + "import torch.nn.functional as F\n", + "import torch.optim as optim\n", + "from tqdm import tqdm\n", + "from sklearn.metrics import *\n", + "\n", + "def train_bert(n_epochs=1, batch_size=32):\n", + " train = df[0:int(len(df)*0.75)]\n", + " test = df[int(len(df)*0.75):]\n", + " \n", + " train = tensorize(train)\n", + " test = tensorize(test)\n", + " \n", + " model = BertForSequenceClassification.from_pretrained(\n", + " \"bert-base-uncased\", num_labels = 2, output_attentions = False, output_hidden_states = False).to(device)\n", + " \n", + " # Initialize the optimizer\n", + " optimizer = optim.Adam(model.parameters(), lr=1e-5)\n", + " \n", + " # set model in training mode\n", + " model.train()\n", + " for i in range(n_epochs):\n", + " with tqdm(total=len(list(train))) as pbar:\n", + " pbar.set_description(f'Epoch {i + 1}')\n", + " for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): \n", + "\n", + " # Reset the accumulated gradients\n", + " optimizer.zero_grad()\n", + "\n", + " b_ids = batch[0].to(device)\n", + " b_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + " \n", + " outputs = model(input_ids=b_ids, attention_mask=b_mask,\n", + " labels=b_labels)\n", + " \n", + " # Backward pass; propagates the loss and computes the gradients\n", + " loss = outputs.loss\n", + " loss.backward()\n", + "\n", + " # Update the parameters of the model\n", + " optimizer.step()\n", + " \n", + " # Update diagnostics\n", + " pbar.set_postfix(loss=loss.item())\n", + " pbar.update(batch_size)\n", + " \n", + " model.eval() # Sets the model to evaluation mode\n", + " with torch.no_grad(): # Blocks the accumulation of gradients\n", + " y,p = [], [] # init list of actuals and predictions\n", + " with tqdm(total=len(list(test))) as pbar:\n", + " pbar.set_description(f'test')\n", + " for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): \n", + " \n", + " b_ids = batch[0].to(device)\n", + " b_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + " \n", + " outputs = model(input_ids=b_ids, attention_mask=b_mask,\n", + " labels=b_labels)\n", + " \n", + " loss = outputs.loss \n", + " pred = torch.argmax(outputs[1], dim=1)\n", + " \n", + " y.extend(b_labels.cpu().detach().numpy())\n", + " p.extend(pred.cpu().detach().numpy())\n", + " \n", + " # Update diagnostics\n", + " pbar.update(batch_size)\n", + "\n", + " # using sklearn to compute performance metrics\n", + " cm = confusion_matrix(y, p)\n", + " acc = accuracy_score(y, p)\n", + " prec = precision_score(y, p)\n", + " rec = recall_score(y, p)\n", + " f1 = f1_score(y, p)\n", + " \n", + " # update status bar with metrics\n", + " pbar.set_postfix(accuracy=float(acc)*100,\n", + " precision=float(prec)*100,\n", + " recall=float(rec)*100,\n", + " f1_score=float(f1)*100)\n", + " print(f\"\\n{cm}, CONFUSION MATRIX\")\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mH9OCuh9uix9", + "outputId": "d755a07a-e0d4-4c17-c8b8-97f766be7523", + "scrolled": true + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "Epoch 1: : 9984it [03:37, 45.84it/s, loss=1.05]\n", + "test: : 3328it [00:24, 137.33it/s, accuracy=87.4, f1_score=88.4, precision=83.9, recall=93.4]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1308 305]\n", + " [ 113 1593]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ], + "source": [ + "bert=train_bert(n_epochs=1, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vECn7GSs6XrK" + }, + "source": [ + "Next we will look at the perfomance of only using VADER. This will not be used as baseline, but its important information when interpreting later results." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JgSc5kd-6XrK", + "outputId": "6d1e14eb-f3dc-4f3c-e3f9-1bd6dfa6484e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------Train-----------\n", + "0.7293822199899548, acc\n", + "0.6845141251294187, prec\n", + "0.8918866833686645, recall\n", + "0.7745606694560669, f1\n", + "[[2633 2133]\n", + " [ 561 4628]], conf matrix\n", + "--------Test-----------\n", + "0.724013257005122, acc\n", + "0.6758682101513802, prec\n", + "0.8898007033997656, recall\n", + "0.7682186234817814, f1\n", + "[[ 885 728]\n", + " [ 188 1518]], conf matrix\n" + ] + } + ], + "source": [ + "train = df[0:int(len(df)*0.75)]\n", + "test = df[len(train):]\n", + "\n", + "vader_tr = preVader(train)\n", + "vad_pos = vader_tr[:, 2]\n", + "vad_neg = vader_tr[:, 0]\n", + "vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)), dim=1)\n", + "\n", + "vader_te = preVader(test)\n", + "vad_pos = vader_te[:, 2]\n", + "vad_neg = vader_te[:, 0]\n", + "vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)), dim=1)\n", + "\n", + "train_labels = torch.tensor(list(train.sentiment))\n", + "test_labels = torch.tensor(list(test.sentiment))\n", + "\n", + "pred_train = torch.argmax(vader_tr, dim=1).cpu()\n", + "pred_test = torch.argmax(vader_te, dim=1).cpu()\n", + "\n", + "# using sklearn to compute performance metrics\n", + "cm = confusion_matrix(train_labels, pred_train)\n", + "acc = accuracy_score(train_labels, pred_train)\n", + "prec = precision_score(train_labels, pred_train)\n", + "rec = recall_score(train_labels, pred_train)\n", + "f1 = f1_score(train_labels, pred_train)\n", + "print(\"--------Train-----------\")\n", + "print(f\"{acc}, acc\")\n", + "print(f\"{prec}, prec\")\n", + "print(f\"{rec}, recall\")\n", + "print(f\"{f1}, f1\")\n", + "print(f\"{cm}, conf matrix\")\n", + "\n", + "cm = confusion_matrix(test_labels, pred_test)\n", + "acc = accuracy_score(test_labels, pred_test)\n", + "prec = precision_score(test_labels, pred_test)\n", + "rec = recall_score(test_labels, pred_test)\n", + "f1 = f1_score(test_labels, pred_test)\n", + "print(\"--------Test-----------\")\n", + "print(f\"{acc}, acc\")\n", + "print(f\"{prec}, prec\")\n", + "print(f\"{rec}, recall\")\n", + "print(f\"{f1}, f1\")\n", + "print(f\"{cm}, conf matrix\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KGv5EFsTVyAx" + }, + "source": [ + "## Method 1 - MLP architecture\n", + "\n", + "Method 1 combines outcomes from the fine tuned BERT model with VAD-score sentiment values from the VADER model by implementing a Multi Layer Perceptron. First, we build the architecture for the multi layer perceptron, which will be used to align our data in the same vector space. Later, the last linear layer in this model will be trained." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "bPwQouzSuix7" + }, + "outputs": [], + "source": [ + "import torch.nn as nn\n", + "\n", + "class MultilayerPerceptron(nn.Module):\n", + " def __init__(self, hidden_dim, output_dim):\n", + " super().__init__()\n", + " self.hidden_dim = hidden_dim\n", + " self.seq = nn.Sequential(\n", + " nn.LazyLinear(hidden_dim),\n", + " torch.nn.ReLU(),\n", + " nn.Linear(hidden_dim, output_dim),\n", + " )\n", + " def forward(self, bert, vader):\n", + " x = torch.cat((bert, vader), dim=1)\n", + " x = self.seq(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "pCJ_D43Cuix9" + }, + "outputs": [], + "source": [ + "import torch.nn.functional as F\n", + "import torch.optim as optim\n", + "from tqdm import tqdm\n", + "from sklearn.metrics import confusion_matrix\n", + "import numpy as np\n", + "\n", + "def train_mlp(bert, n_epochs=1, batch_size=32, learning_rate=1e-5):\n", + " train = df[0:int(len(df)*0.75)]\n", + " test = df[len(train):]\n", + "\n", + " vader_tr = preVader(train).to(device)\n", + " vad_pos = vader_tr[:, 2]\n", + " vad_neg = vader_tr[:, 0]\n", + " vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)))\n", + "\n", + " vader_te = preVader(test).to(device)\n", + " vad_pos = vader_te[:, 2]\n", + " vad_neg = vader_te[:, 0]\n", + " vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)))\n", + "\n", + " train = tensorize(train)\n", + " test = tensorize(test)\n", + " \n", + " # init multi layer perceptron\n", + " mlp = MultilayerPerceptron(250, 2).to(device)\n", + " \n", + " # Initialize the optimizer. Here we use Adam rather than plain SGD\n", + " optimizer = optim.Adam(mlp.parameters(), lr=learning_rate)\n", + " mlp.train()\n", + " \n", + " for i in range(n_epochs):\n", + " with tqdm(total=len(list(train))) as pbar:\n", + " pbar.set_description(f'Epoch {i + 1}')\n", + " for index, batch in enumerate(DataLoader(train, batch_size, shuffle=False)): \n", + "\n", + " # Reset the accumulated gradients\n", + " optimizer.zero_grad()\n", + "\n", + " b_ids = batch[0].to(device)\n", + " b_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + " \n", + " outputs = bert(input_ids=b_ids, attention_mask=b_mask,\n", + " labels=b_labels)\n", + "\n", + " # combine features of bert and vader using the mlp\n", + " mlp_out = mlp.forward(outputs.logits, vader_tr[index*batch_size:index*batch_size+outputs.logits.size(0),:3]) \n", + " \n", + " # Backward pass; propagates the loss and computes the gradients\n", + " loss = F.cross_entropy(mlp_out, b_labels)\n", + " loss.backward()\n", + "\n", + " # Update the parameters of the model\n", + " optimizer.step()\n", + "\n", + " # Update diagnostics\n", + " pbar.set_postfix(loss=loss.item())\n", + " pbar.update(batch_size)\n", + " \n", + " model.eval() # Sets the model to evaluation mode\n", + " with torch.no_grad(): # Blocks the accumulation of gradients\n", + " y,p = [], [] # init list of actuals and predictions\n", + " with tqdm(total=len(list(test))) as pbar:\n", + " pbar.set_description(f'test')\n", + " for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): \n", + " \n", + " b_ids = batch[0].to(device)\n", + " b_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + " \n", + " outputs = bert(input_ids=b_ids, attention_mask=b_mask,\n", + " labels=b_labels)\n", + " \n", + " mlp_out = mlp.forward(outputs.logits, vader_te[index*batch_size:index*batch_size+outputs.logits.size(0),:3]) \n", + " \n", + " loss = F.cross_entropy(mlp_out, b_labels)\n", + " pred = torch.argmax(mlp_out, dim=1)\n", + " \n", + " y.extend(b_labels.cpu().detach().numpy())\n", + " p.extend(pred.cpu().detach().numpy())\n", + "\n", + " # Update diagnostics\n", + " pbar.update(batch_size)\n", + "\n", + " # using sklearn to compute performance metrics\n", + " cm = confusion_matrix(y, p)\n", + " acc = accuracy_score(y, p)\n", + " prec = precision_score(y, p)\n", + " rec = recall_score(y, p)\n", + " f1 = f1_score(y, p)\n", + " \n", + " # update status bar with metrics\n", + " pbar.set_postfix(accuracy=float(acc)*100,\n", + " precision=float(prec)*100,\n", + " recall=float(rec)*100,\n", + " f1_score=float(f1)*100)\n", + " print(f\"\\n{cm}, CONFUSION MATRIX\")\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "922Umg2cuix9", + "outputId": "5d58b8a6-7cff-4aa9-d826-445f04d7a3e8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "<ipython-input-14-36585a68d363>:14: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", + " vader_tr = F.softmax(torch.column_stack((vad_neg, vad_pos)))\n", + "<ipython-input-14-36585a68d363>:19: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", + " vader_te = F.softmax(torch.column_stack((vad_neg, vad_pos)))\n", + "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/lazy.py:180: UserWarning: Lazy modules are a new feature under heavy development so changes to the API or functionality can happen at any moment.\n", + " warnings.warn('Lazy modules are a new feature under heavy development '\n", + "Epoch 1: : 9984it [03:23, 49.17it/s, loss=0.552] \n", + "test: : 3328it [00:24, 136.86it/s, accuracy=88.5, f1_score=89, precision=87.7, recall=90.3]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1398 215]\n", + " [ 166 1540]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ], + "source": [ + "mlp_model = train_mlp(bert, n_epochs=1, batch_size=32, learning_rate=1e-4)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7N-5opAuslZZ" + }, + "source": [ + "## Method 2 - Weights\n", + "\n", + "In method two, the respective outputs from BERT and VADER are assigned a weight and then added together. To try different weights, we simply implement a loop. Results are thus not optimal, but show an approximation of the relationship between the models." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "iaVMMo2fuUW3" + }, + "outputs": [], + "source": [ + "import torch.nn.functional as F\n", + "import torch.optim as optim\n", + "from tqdm import tqdm\n", + "from sklearn.metrics import confusion_matrix\n", + "import numpy as np\n", + "\n", + "def train_weights(bert, w, n_epochs=1, batch_size=32):\n", + " train = df[0:int(len(df)*0.75)]\n", + " test = df[len(train):]\n", + " vader = preVader(test).to(device)\n", + " \n", + " train = tensorize(train)\n", + " test = tensorize(test)\n", + "\n", + " y,p = [], [] # init list of actuals and predictions\n", + " with tqdm(total=len(list(test))) as pbar:\n", + " pbar.set_description(f'test')\n", + " for index, batch in enumerate(DataLoader(test, batch_size, shuffle=False)): \n", + " \n", + " b_ids = batch[0].to(device)\n", + " b_mask = batch[1].to(device)\n", + " b_labels = batch[2].to(device)\n", + " \n", + " outputs = bert(input_ids=b_ids, attention_mask=b_mask,\n", + " labels=b_labels)\n", + " \n", + " vad_pos = vader[batch_size*index:batch_size*index+len(b_ids), 2]\n", + " vad_neg = vader[batch_size*index:batch_size*index+len(b_ids), 0]\n", + "\n", + " pred_vad = F.softmax(torch.column_stack((vad_neg, vad_pos)), dim=0)\n", + " pred_bert = F.softmax(outputs.logits, dim=0)\n", + "\n", + " pred = torch.argmax(torch.add(w*pred_vad, (1-w)*pred_bert), dim=1)\n", + "\n", + " y.extend(b_labels.cpu().detach().numpy())\n", + " p.extend(pred.cpu().detach().numpy())\n", + "\n", + " # Update diagnostics\n", + " pbar.update(batch_size)\n", + "\n", + " # using sklearn to compute performance metrics\n", + " cm = confusion_matrix(y, p)\n", + " acc = accuracy_score(y, p)\n", + " prec = precision_score(y, p)\n", + " rec = recall_score(y, p)\n", + " f1 = f1_score(y, p)\n", + " \n", + " # update status bar with metrics\n", + " pbar.set_postfix(accuracy=float(acc)*100,\n", + " precision=float(prec)*100,\n", + " recall=float(rec)*100,\n", + " f1_score=float(f1)*100)\n", + " print(f\"\\n{cm}, CONFUSION MATRIX\")\n", + " return ['Accuracy',acc*100, 'Precision', prec*100,'recall', rec*100,'F1-score', f1, 'CM', cm]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZdeslAvbU7bh", + "outputId": "097ecc22-e45c-4380-af56-72151b87d4b4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 135.77it/s, accuracy=87.9, f1_score=88, precision=89.5, recall=86.6]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1439 174]\n", + " [ 229 1477]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 133.85it/s, accuracy=87.9, f1_score=88.1, precision=89.5, recall=86.7]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1439 174]\n", + " [ 227 1479]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 137.30it/s, accuracy=88.1, f1_score=88.3, precision=89.7, recall=86.9]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1442 171]\n", + " [ 223 1483]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 135.35it/s, accuracy=88, f1_score=88.1, precision=89.6, recall=86.7]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1441 172]\n", + " [ 227 1479]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 135.73it/s, accuracy=88, f1_score=88.2, precision=89.7, recall=86.7]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1443 170]\n", + " [ 227 1479]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 135.75it/s, accuracy=88.1, f1_score=88.2, precision=89.8, recall=86.8]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1444 169]\n", + " [ 226 1480]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 135.45it/s, accuracy=88.3, f1_score=88.4, precision=89.9, recall=87]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1447 166]\n", + " [ 222 1484]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 136.01it/s, accuracy=88.2, f1_score=88.4, precision=89.8, recall=87]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1444 169]\n", + " [ 221 1485]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 135.97it/s, accuracy=88, f1_score=88.1, precision=89.7, recall=86.6]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1444 169]\n", + " [ 229 1477]], CONFUSION MATRIX\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "test: : 3328it [00:24, 135.17it/s, accuracy=87.5, f1_score=87.6, precision=88.9, recall=86.3]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "[[1430 183]\n", + " [ 233 1473]], CONFUSION MATRIX\n", + "{('Vader', 0, 'bert', 100): ['Accuracy', 87.8577884905092, 'Precision', 89.46093276801939, 'recall', 86.5767878077374, 'F1-score', 0.8799523383973785, 'CM', array([[1439, 174],\n", + " [ 229, 1477]])], ('Vader', 10, 'bert', 90): ['Accuracy', 87.9180476047002, 'Precision', 89.47368421052632, 'recall', 86.69402110199297, 'F1-score', 0.8806192319142602, 'CM', array([[1439, 174],\n", + " [ 227, 1479]])], ('Vader', 20, 'bert', 80): ['Accuracy', 88.12895450436878, 'Precision', 89.66142684401451, 'recall', 86.92848769050411, 'F1-score', 0.8827380952380953, 'CM', array([[1442, 171],\n", + " [ 223, 1483]])], ('Vader', 30, 'bert', 70): ['Accuracy', 87.97830671889123, 'Precision', 89.58207147183526, 'recall', 86.69402110199297, 'F1-score', 0.8811438784629134, 'CM', array([[1441, 172],\n", + " [ 227, 1479]])], ('Vader', 40, 'bert', 60): ['Accuracy', 88.03856583308225, 'Precision', 89.69072164948454, 'recall', 86.69402110199297, 'F1-score', 0.8816691505216095, 'CM', array([[1443, 170],\n", + " [ 227, 1479]])], ('Vader', 50, 'bert', 50): ['Accuracy', 88.09882494727329, 'Precision', 89.7513644633111, 'recall', 86.75263774912075, 'F1-score', 0.8822652757078987, 'CM', array([[1444, 169],\n", + " [ 226, 1480]])], ('Vader', 60, 'bert', 39): ['Accuracy', 88.30973184694186, 'Precision', 89.93939393939394, 'recall', 86.98710433763188, 'F1-score', 0.8843861740166865, 'CM', array([[1447, 166],\n", + " [ 222, 1484]])], ('Vader', 70, 'bert', 29): ['Accuracy', 88.24947273275083, 'Precision', 89.78234582829504, 'recall', 87.04572098475967, 'F1-score', 0.8839285714285713, 'CM', array([[1444, 169],\n", + " [ 221, 1485]])], ('Vader', 80, 'bert', 19): ['Accuracy', 88.00843627598674, 'Precision', 89.73268529769138, 'recall', 86.5767878077374, 'F1-score', 0.8812649164677805, 'CM', array([[1444, 169],\n", + " [ 229, 1477]])], ('Vader', 90, 'bert', 9): ['Accuracy', 87.46610424826756, 'Precision', 88.94927536231883, 'recall', 86.34232121922626, 'F1-score', 0.8762641284949435, 'CM', array([[1430, 183],\n", + " [ 233, 1473]])]}\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ], + "source": [ + "acc_list = {}\n", + "for w in np.arange(0, 1, 0.1):\n", + " acc = train_weights(bert, w, n_epochs=1, batch_size=32)\n", + " acc_list['Vader', int(w*100), 'bert', int((1-w)*100)] = acc\n", + "print(acc_list)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file