diff --git a/l3/TM-Lab3.ipynb b/l3/TM-Lab3.ipynb index 10e046b53ec2cc3d981c35c8d84f40f17d7d74b5..fc9c5933865efdc68a5d9e8d5809dff46af7d50d 100644 --- a/l3/TM-Lab3.ipynb +++ b/l3/TM-Lab3.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 303, + "execution_count": 332, "metadata": { "deletable": false, "editable": false, @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 304, + "execution_count": 333, "metadata": { "deletable": false, "editable": false, @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 305, + "execution_count": 334, "metadata": {}, "outputs": [ { @@ -220,7 +220,7 @@ "4 Brussels " ] }, - "execution_count": 305, + "execution_count": 334, "metadata": {}, "output_type": "execute_result" } @@ -262,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 306, + "execution_count": 335, "metadata": { "deletable": false, "nbgrader": { @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 307, + "execution_count": 336, "metadata": { "deletable": false, "editable": false, @@ -350,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 308, + "execution_count": 337, "metadata": { "deletable": false, "editable": false, @@ -419,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 309, + "execution_count": 338, "metadata": { "deletable": false, "editable": false, @@ -464,7 +464,7 @@ }, { "cell_type": "code", - "execution_count": 310, + "execution_count": 339, "metadata": { "deletable": false, "nbgrader": { @@ -498,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 311, + "execution_count": 340, "metadata": { "deletable": false, "nbgrader": { @@ -552,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 312, + "execution_count": 341, "metadata": {}, "outputs": [ { @@ -575,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 313, + "execution_count": 342, "metadata": { "deletable": false, "editable": false, @@ -630,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 314, + "execution_count": 343, "metadata": { "deletable": false, "editable": false, @@ -692,7 +692,7 @@ }, { "cell_type": "code", - "execution_count": 315, + "execution_count": 344, "metadata": { "tags": [ "solution" @@ -16062,7 +16062,7 @@ }, { "cell_type": "code", - "execution_count": 323, + "execution_count": 345, "metadata": { "deletable": false, "nbgrader": { @@ -16123,7 +16123,7 @@ }, { "cell_type": "code", - "execution_count": 324, + "execution_count": 346, "metadata": { "tags": [ "solution" @@ -16147,7 +16147,7 @@ }, { "cell_type": "code", - "execution_count": 325, + "execution_count": 347, "metadata": { "deletable": false, "editable": false, @@ -16193,7 +16193,7 @@ }, { "cell_type": "code", - "execution_count": 326, + "execution_count": 348, "metadata": { "deletable": false, "nbgrader": { @@ -16226,9 +16226,9 @@ "\n", " for index, (sentence_id, beg, end) in enumerate(pred_spans_improved(df)):\n", " sentence = sentence_dict.get(sentence_id)\n", - " rows.append({'sentence_id': sentence_id, 'sentence': sentence, 'beg': beg, 'end': end})\n", + " rows.append({'sentence_id': sentence_id, 'sentence': sentence, 'beg': beg, 'end': end, 'label': '--NME--'})\n", "\n", - " new_data_frame = pd.DataFrame(rows, columns=['sentence_id', 'sentence', 'beg', 'end'])\n", + " new_data_frame = pd.DataFrame(rows, columns=['sentence_id', 'sentence', 'beg', 'end', 'label'])\n", "\n", " return new_data_frame\n", "\n" @@ -16245,7 +16245,7 @@ }, { "cell_type": "code", - "execution_count": 327, + "execution_count": 349, "metadata": { "deletable": false, "editable": false, @@ -16343,98 +16343,6 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>sentence_id</th>\n", - " <th>sentence</th>\n", - " <th>beg</th>\n", - " <th>end</th>\n", - " <th>label</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>0946-000</td>\n", - " <td>CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTE...</td>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " <td>Leicestershire_County_Cricket_Club</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>0946-001</td>\n", - " <td>LONDON 1996-08-30</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>London</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>0946-002</td>\n", - " <td>West Indian all-rounder Phil Simmons took four...</td>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " <td>West_Indies_cricket_team</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>0946-002</td>\n", - " <td>West Indian all-rounder Phil Simmons took four...</td>\n", - " <td>3</td>\n", - " <td>5</td>\n", - " <td>Phil_Simmons</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>0946-002</td>\n", - " <td>West Indian all-rounder Phil Simmons took four...</td>\n", - " <td>12</td>\n", - " <td>13</td>\n", - " <td>Leicestershire_County_Cricket_Club</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " sentence_id sentence beg end \\\n", - "0 0946-000 CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTE... 2 3 \n", - "1 0946-001 LONDON 1996-08-30 0 1 \n", - "2 0946-002 West Indian all-rounder Phil Simmons took four... 0 2 \n", - "3 0946-002 West Indian all-rounder Phil Simmons took four... 3 5 \n", - "4 0946-002 West Indian all-rounder Phil Simmons took four... 12 13 \n", - "\n", - " label \n", - "0 Leicestershire_County_Cricket_Club \n", - "1 London \n", - "2 West_Indies_cricket_team \n", - "3 Phil_Simmons \n", - "4 Leicestershire_County_Cricket_Club " - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -16467,7 +16375,7 @@ }, { "cell_type": "code", - "execution_count": 328, + "execution_count": 350, "metadata": { "deletable": false, "nbgrader": { @@ -16509,7 +16417,7 @@ }, { "cell_type": "code", - "execution_count": 329, + "execution_count": 351, "metadata": { "deletable": false, "editable": false, @@ -16574,7 +16482,7 @@ }, { "cell_type": "code", - "execution_count": 330, + "execution_count": 352, "metadata": { "deletable": false, "nbgrader": { @@ -16623,7 +16531,7 @@ }, { "cell_type": "code", - "execution_count": 331, + "execution_count": 353, "metadata": { "deletable": false, "editable": false, @@ -16692,7 +16600,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 354, "metadata": { "deletable": false, "editable": false, @@ -16722,9 +16630,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 355, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>mention</th>\n", + " <th>entity</th>\n", + " <th>prob</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>17436</th>\n", + " <td>Sweden</td>\n", + " <td>Sweden</td>\n", + " <td>0.985768</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17437</th>\n", + " <td>Sweden</td>\n", + " <td>Sweden_national_football_team</td>\n", + " <td>0.014173</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17438</th>\n", + " <td>Sweden</td>\n", + " <td>Sweden_men's_national_ice_hockey_team</td>\n", + " <td>0.000059</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " mention entity prob\n", + "17436 Sweden Sweden 0.985768\n", + "17437 Sweden Sweden_national_football_team 0.014173\n", + "17438 Sweden Sweden_men's_national_ice_hockey_team 0.000059" + ] + }, + "execution_count": 355, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_kb.loc[df_kb.mention == 'Sweden']" ] @@ -16745,7 +16714,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 356, "metadata": { "deletable": false, "nbgrader": { @@ -16791,7 +16760,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 357, "metadata": { "deletable": false, "editable": false, @@ -16810,7 +16779,20 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "ename": "NotImplementedError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[357], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m scores \u001b[38;5;241m=\u001b[39m evaluation_scores(dev_gold_mentions, \u001b[38;5;28mset\u001b[39m(\u001b[43mmost_probable_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_dev_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf_kb\u001b[49m\u001b[43m)\u001b[49m))\n\u001b[1;32m 2\u001b[0m print_evaluation_scores(scores)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m scores[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m.64\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPrecision should be above 64\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "Cell \u001b[0;32mIn[356], line 14\u001b[0m, in \u001b[0;36mmost_probable_method\u001b[0;34m(df, df_kb)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"An entity linker that resolves each mention to the most probably entity in a knowledge base.\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \n\u001b[1;32m 4\u001b[0m \u001b[38;5;124;03mArguments:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124;03m position and the predicted entity label of each span.\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# YOUR CODE HERE\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m()\n", + "\u001b[0;31mNotImplementedError\u001b[0m: " + ] + } + ], "source": [ "scores = evaluation_scores(dev_gold_mentions, set(most_probable_method(df_dev_pred, df_kb)))\n", "print_evaluation_scores(scores)\n",