diff --git a/l3/TM-Lab3.ipynb b/l3/TM-Lab3.ipynb index ce9a0ae66b8e5a78767290517ffe5ab1c5d5dcba..6bf502bbaf6c675c49462691e825c3c2612df510 100644 --- a/l3/TM-Lab3.ipynb +++ b/l3/TM-Lab3.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 303, "metadata": { "deletable": false, "editable": false, @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 304, "metadata": { "deletable": false, "editable": false, @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 305, "metadata": {}, "outputs": [ { @@ -220,7 +220,7 @@ "4 Brussels " ] }, - "execution_count": 5, + "execution_count": 305, "metadata": {}, "output_type": "execute_result" } @@ -262,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 306, "metadata": { "deletable": false, "nbgrader": { @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 307, "metadata": { "deletable": false, "editable": false, @@ -350,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 308, "metadata": { "deletable": false, "editable": false, @@ -419,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 309, "metadata": { "deletable": false, "editable": false, @@ -464,7 +464,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 310, "metadata": { "deletable": false, "nbgrader": { @@ -498,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 311, "metadata": { "deletable": false, "nbgrader": { @@ -552,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 312, "metadata": {}, "outputs": [ { @@ -575,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 313, "metadata": { "deletable": false, "editable": false, @@ -630,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 314, "metadata": { "deletable": false, "editable": false, @@ -692,7 +692,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 315, "metadata": { "tags": [ "solution" @@ -16062,7 +16062,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 323, "metadata": { "deletable": false, "nbgrader": { @@ -16092,8 +16092,22 @@ " triples consisting of the sentence id, start position, and end\n", " position of each span.\n", " \"\"\"\n", - " # YOUR CODE HERE\n", - " raise NotImplementedError()" + " blacklist = ['DATE', 'CARDINAL', 'TIME', 'QUANTITY', 'PERCENT', 'ORDINAL']\n", + "\n", + " ner = nlp.get_pipe(\"ner\")\n", + "\n", + " for row in df.itertuples():\n", + " sentence = row[2]\n", + " doc = nlp(sentence)\n", + "\n", + " for ent in doc.ents:\n", + " if ent.label_ in blacklist:\n", + " continue\n", + "\n", + " if ent.text.startswith(\"the \") or ent.text.startswith(\"The \"):\n", + " ent.start += 1\n", + " \n", + " yield row[1], ent.start, ent.end" ] }, { @@ -16109,21 +16123,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 324, "metadata": { "tags": [ "solution" ] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Precision: 0.875, Recall: 0.799, F1: 0.835\n" + ] + } + ], "source": [ "scores_improved = evaluation_scores(spans_dev_gold, set(pred_spans_improved(df_dev)))\n", - "print_evaluation_scores(scores_improved)" + "print_evaluation_scores(scores_improved)\n", + "\n", + "# Previous score: Precision: 0.551, Recall: 0.775, F1: 0.644" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 325, "metadata": { "deletable": false, "editable": false, @@ -16139,7 +16163,20 @@ "task": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div class=\"alert alert-success\"><strong>Checks have passed!</strong></div>" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "assert scores_improved[-1] > .8, \"F1-score should be above 0.8\"\n", "success()" @@ -16156,7 +16193,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 326, "metadata": { "deletable": false, "nbgrader": { @@ -16182,8 +16219,19 @@ " A *new* data frame with the same layout as `df`, but containing\n", " the predicted start and end positions for each token span.\n", " \"\"\"\n", - " # YOUR CODE HERE\n", - " raise NotImplementedError()" + "\n", + " sentence_dict = dict(zip(df['sentence_id'], df['sentence']))\n", + "\n", + " rows = []\n", + "\n", + " for index, (sentence_id, beg, end) in enumerate(pred_spans_improved(df)):\n", + " sentence = sentence_dict.get(sentence_id)\n", + " rows.append({'sentence_id': sentence_id, 'sentence': sentence, 'beg': beg, 'end': end})\n", + "\n", + " new_data_frame = pd.DataFrame(rows, columns=['sentence_id', 'sentence', 'beg', 'end'])\n", + "\n", + " return new_data_frame\n", + "\n" ] }, { @@ -16197,7 +16245,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 327, "metadata": { "deletable": false, "editable": false, @@ -16216,7 +16264,179 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sentence_id</th>\n", + " <th>sentence</th>\n", + " <th>beg</th>\n", + " <th>end</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0946-001</td>\n", + " <td>LONDON 1996-08-30</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0946-002</td>\n", + " <td>West Indian all-rounder Phil Simmons took four...</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0946-002</td>\n", + " <td>West Indian all-rounder Phil Simmons took four...</td>\n", + " <td>3</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0946-002</td>\n", + " <td>West Indian all-rounder Phil Simmons took four...</td>\n", + " <td>12</td>\n", + " <td>13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0946-002</td>\n", + " <td>West Indian all-rounder Phil Simmons took four...</td>\n", + " <td>14</td>\n", + " <td>15</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sentence_id sentence beg end\n", + "0 0946-001 LONDON 1996-08-30 0 1\n", + "1 0946-002 West Indian all-rounder Phil Simmons took four... 0 2\n", + "2 0946-002 West Indian all-rounder Phil Simmons took four... 3 5\n", + "3 0946-002 West Indian all-rounder Phil Simmons took four... 12 13\n", + "4 0946-002 West Indian all-rounder Phil Simmons took four... 14 15" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sentence_id</th>\n", + " <th>sentence</th>\n", + " <th>beg</th>\n", + " <th>end</th>\n", + " <th>label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0946-000</td>\n", + " <td>CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTE...</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>Leicestershire_County_Cricket_Club</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0946-001</td>\n", + " <td>LONDON 1996-08-30</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>London</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0946-002</td>\n", + " <td>West Indian all-rounder Phil Simmons took four...</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>West_Indies_cricket_team</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0946-002</td>\n", + " <td>West Indian all-rounder Phil Simmons took four...</td>\n", + " <td>3</td>\n", + " <td>5</td>\n", + " <td>Phil_Simmons</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0946-002</td>\n", + " <td>West Indian all-rounder Phil Simmons took four...</td>\n", + " <td>12</td>\n", + " <td>13</td>\n", + " <td>Leicestershire_County_Cricket_Club</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sentence_id sentence beg end \\\n", + "0 0946-000 CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTE... 2 3 \n", + "1 0946-001 LONDON 1996-08-30 0 1 \n", + "2 0946-002 West Indian all-rounder Phil Simmons took four... 0 2 \n", + "3 0946-002 West Indian all-rounder Phil Simmons took four... 3 5 \n", + "4 0946-002 West Indian all-rounder Phil Simmons took four... 12 13 \n", + "\n", + " label \n", + "0 Leicestershire_County_Cricket_Club \n", + "1 London \n", + "2 West_Indies_cricket_team \n", + "3 Phil_Simmons \n", + "4 Leicestershire_County_Cricket_Club " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "df_dev_pred = df_with_pred_spans(df_dev)\n", "display(df_dev_pred.head())" @@ -16247,7 +16467,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 328, "metadata": { "deletable": false, "nbgrader": { @@ -16274,8 +16494,8 @@ " quadruples consisting of the sentence id, start position, end\n", " position and entity label of each span.\n", " \"\"\"\n", - " # YOUR CODE HERE\n", - " raise NotImplementedError()" + " for row in df.itertuples():\n", + " yield row[1], row[3], row[4], row[5]" ] }, { @@ -16289,7 +16509,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 329, "metadata": { "deletable": false, "editable": false, @@ -16308,7 +16528,20 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div class=\"alert alert-success\"><strong>Checks have passed!</strong></div>" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "dev_gold_mentions = set(gold_mentions(df_dev))\n", "assert ('1094-020', 0, 1, 'Seattle_Mariners') in dev_gold_mentions, \"An expected tuple is not included in the results\"\n", @@ -16341,7 +16574,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 330, "metadata": { "deletable": false, "nbgrader": { @@ -16372,8 +16605,12 @@ " quadruples consisting of the sentence id, start position, end\n", " position and the predicted entity label of each span.\n", " \"\"\"\n", - " # YOUR CODE HERE\n", - " raise NotImplementedError()" + " for row in df.itertuples():\n", + " words = row[2].split(' ')\n", + " matching_wiki_name = '_'.join(words[int(row[3]):int(row[4])])\n", + " #sprint(matching_wiki_name)\n", + " yield row[1], row[3], row[4], matching_wiki_name\n", + " " ] }, { @@ -16387,7 +16624,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 331, "metadata": { "deletable": false, "editable": false, @@ -16406,7 +16643,27 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Precision: 0.301, Recall: 0.274, F1: 0.287\n" + ] + }, + { + "data": { + "text/html": [ + "<div class=\"alert alert-success\"><strong>Checks have passed!</strong></div>" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Compute and print the evaluation scores\n", "scores = evaluation_scores(dev_gold_mentions, set(baseline(df_dev_pred)))\n",