➡️ Before you start, make sure that you are familiar with the **[study guide](https://liu-nlp.ai/text-mining/logistics/)**, in particular the rules around **cheating and plagiarism** (found in the course memo).

➡️ If you use code from external sources (e.g. StackOverflow, ChatGPT, ...) as part of your solutions, don't forget to add a reference to these source(s) (for example as a comment above your code).

➡️ Make sure you fill in all cells that say **`YOUR CODE HERE`** or **YOUR ANSWER HERE**. You normally shouldn't need to modify any of the other cells.

</div>

%% Cell type:markdown id: tags:

# L3: Information Extraction

%% Cell type:markdown id: tags:

Information extraction (IE) is the task of identifying named entities and semantic relations between these entities in text data. In this lab we will focus on two sub-tasks in IE, **named entity recognition** (identifying mentions of entities) and **entity linking** (matching these mentions to entities in a knowledge base).
%% Cell type:code id: tags:

``` python
# Define some helper functions that are used in this notebook
from IPython.display import display, HTML

def success():
    display(HTML('<div class="alert alert-success"><strong>Checks have passed!</strong></div>'))
```
%% Cell type:markdown id: tags:

## Dataset

%% Cell type:markdown id: tags:

The main data set for this lab is a collection of news wire articles in which mentions of **named entities** have been annotated with **page names** from the [English Wikipedia](https://en.wikipedia.org/wiki/). The next code cell loads the training and the development parts of the data into Pandas data frames.

%% Cell type:code id: tags:
``` python
import bz2
import csv

import pandas as pd
import numpy as np

with bz2.open('ner-train.tsv.bz2', mode='rt', encoding='utf-8') as source:
    # The exact loading code is assumed here; the column layout used in the
    # rest of the lab is sentence_id, sentence, start, end, label. The
    # development data (df_dev) is loaded in the same way from its own file.
    df_train = pd.read_csv(source, sep='\t', quoting=csv.QUOTE_NONE,
                           names=['sentence_id', 'sentence', 'start', 'end', 'label'])
```

%% Cell type:markdown id: tags:

## Problem 1: Evaluation measures

%% Cell type:markdown id: tags:

Throughout this lab, we will evaluate predictions by comparing a set of predicted items (`pred`) against a set of gold-standard items (`gold`). Your first task is to implement a function `evaluation_scores()` that takes these two sets and returns precision, recall, and F1 as a triple `(precision, recall, F1)`.

Note that for implementing this function, it doesn’t matter what exactly `gold` and `pred` will contain, except that they will be Python `set` objects.
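One possible implementation is sketched below; it computes the three measures directly from the set intersection (a sketch, assuming the standard definitions of precision, recall, and F1):

%% Cell type:code id: tags:

``` python
def evaluation_scores(gold, pred):
    """Compute precision, recall, and F1 for a set of predictions.

    Arguments:
        gold: The set of gold-standard items.
        pred: The set of predicted items.

    Returns:
        A triple (precision, recall, F1) of evaluation scores.
    """
    tp = len(gold & pred)    # items that are both predicted and in the gold standard
    precision = tp / len(pred) if pred else 0.0
    recall = tp / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1
```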
%% Cell type:markdown id: tags:

Let's also define a convenience function that prints the scores nicely:
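A minimal sketch of such a function, together with a small pair of example sets (the exact contents of `example_gold` and `example_pred` are an assumption; any two sets with three gold items, two predicted items, and one item in common give the scores checked below):

%% Cell type:code id: tags:

``` python
def print_evaluation_scores(scores):
    """Print a (precision, recall, F1) triple on a single line."""
    precision, recall, f1 = scores
    print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}')

# Small example sets for testing (assumed for illustration)
example_gold = {('001', 0, 2), ('001', 4, 5), ('002', 1, 3)}
example_pred = {('001', 0, 2), ('002', 2, 3)}
```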
%% Cell type:code id: tags:

``` python
result = evaluation_scores(example_gold, example_pred)
print_evaluation_scores(result)

# Check if the scores appear correct
assert np.isclose(result, (.5, 1. / 3, .4)).all(), "Should be close to the expected values"
success()
```

%% Output

Precision: 0.500, Recall: 0.333, F1: 0.400
%% Cell type:markdown id: tags:

## Problem 2: Named entity recognition

%% Cell type:markdown id: tags:

One of the first tasks that an information extraction system has to solve is to locate and classify (mentions of) named entities, such as persons and organizations, a task usually known as **named entity recognition (NER)**. For this lab, we will consider a slightly simplified version of NER, by only looking at the _spans_ of tokens containing an entity mention, without the actual entity label.
The English language models in spaCy feature a full-fledged [named entity recognizer](https://spacy.io/usage/linguistic-features#named-entities) that identifies a variety of entities and can be updated with new entity types by the user. We therefore start by loading spaCy. _However,_ the data that we will be using has already been tokenized (following the conventions of the [Penn Treebank](ftp://ftp.cis.upenn.edu/pub/treebank/public_html/tokenization.html)), so we need to prevent spaCy from re-tokenizing it. We do this by overriding spaCy’s tokenizer with a basic tokenizer that simply splits on whitespace:
%% Cell type:code id: tags:

``` python
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load('en_core_web_md')    # Let’s use the "medium" (md) model this time
nlp.tokenizer = Tokenizer(nlp.vocab)  # ...but override the tokenizer
```
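%% Cell type:markdown id: tags:

To see the effect of this override, we can run a pre-tokenized sentence through the pipeline and check that the tokens are exactly the whitespace-separated strings (the sentence below is just an illustrative example, not taken from the dataset):

%% Cell type:code id: tags:

``` python
# With the whitespace tokenizer, the input is split on spaces only, so the
# pre-tokenized text is preserved as-is
doc = nlp('Pierre Vinken , 61 years old , will join the board .')
print([token.text for token in doc])
```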
%% Cell type:markdown id: tags:

Your task in this problem is to **evaluate the performance of spaCy’s NER component** when predicting entity spans in the **development data**.

This can be done in the following three steps:

1. Write a function `gold_spans()` that takes a DataFrame and returns a set of triples of the form `(sentence_id, start_position, end_position)`, one for each entity mention _in the dataset_.
2. Write a function `pred_spans()` that takes a DataFrame, runs spaCy’s NER on each sentence, and returns a set of triples (in the same form as above), one for each entity mention _predicted by spaCy_.
3. Evaluate the results using your function from Problem 1.

We ask you to implement `gold_spans()` and `pred_spans()` as _generator functions_ that “yield” a single triple at a time, and provide stubs of such functions below that you can use as a starting point. (If you're not familiar with the `yield` keyword in Python, check out [this brief explanation](https://www.nbshare.io/notebook/851988260/Python-Yield/).)
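As a minimal illustration of this pattern (with made-up values), a generator function yields one item at a time, and the caller can collect all yielded items into a set:

%% Cell type:code id: tags:

``` python
# A toy generator (hypothetical values, just to illustrate the pattern)
def example_generator():
    yield ('0001-001', 0, 2)
    yield ('0001-001', 5, 6)

print(set(example_generator()))
```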
%% Cell type:code id: tags:

``` python
def gold_spans(df):
    """Yield the gold-standard mention spans in a data frame.

    Arguments:
        df: A data frame.

    Yields:
        The gold-standard mention spans in the specified data frame as
        triples consisting of the sentence id, start position, and end
        position of each span.
    """
    # Hint: The Pandas method .itertuples() is useful for iterating over rows in a DataFrame
    for row in df.itertuples():
        yield row[1], row[3], row[4]
```
%% Cell type:code id: tags:solution

``` python
def pred_spans(df):
    """Run spaCy's NER and yield the predicted mention spans.

    Arguments:
        df: A data frame.

    Yields:
        The predicted mention spans in the specified data frame as
        triples consisting of the sentence id, start position, and end
        position of each span.
    """
    for row in df.itertuples():
        sentence = row[2]
        doc = nlp(sentence)
        for ent in doc.ents:
            yield row[1], ent.start, ent.end
```
%% Cell type:markdown id: tags:

#### 🤞 Putting it all together

The following cell shows how you can put it all together and produce the evaluation report, provided you have implemented the functions as generator functions. You should get a precision above 50%, with a recall above 70%, and an F1-score above 60%.

%% Cell type:code id: tags:

``` python
# Collect the gold-standard and predicted spans on the development data and
# evaluate them (the setup lines here are an assumed reconstruction)
spans_gold = set(gold_spans(df_dev))
spans_pred = set(pred_spans(df_dev))
scores = evaluation_scores(spans_gold, spans_pred)
print_evaluation_scores(scores)

assert scores[0] > .50, "Precision should be above 50%."
assert scores[1] > .70, "Recall should be above 70%."
success()
```

%% Output
%% Cell type:markdown id: tags:

## Problem 3: Error analysis

%% Cell type:markdown id: tags:

As you can see in Problem 2, the span accuracy of the named entity recognizer is far from perfect. In particular, only slightly more than half of the predicted spans are correct according to the gold standard. Your next task is to analyse this result in more detail.

Below is a function that uses spaCy’s span visualizer to visualize sentences containing _at least one mistake_ (i.e., either a false positive, a false negative, or both):
%% Cell type:code id: tags:

``` python
from collections import defaultdict

from spacy import displacy
from spacy.tokens import Span

def error_report(df, spans_gold, spans_pred):
    """Yield sentences whose predicted entity spans differ from the gold standard.

    Arguments:
        df: A data frame.
        spans_gold: The set of gold-standard entity spans from the data frame.
        spans_pred: The set of predicted entity spans from the data frame.

    Yields:
        One spaCy Doc per sentence containing at least one mistake, with the
        gold-standard and predicted spans stored in the "sc" span group
        (labelled "GOLD" and "PRED", respectively) for rendering with displacy.
    """
    gold_by_sid = defaultdict(set)
    for (sentence_id, span_s, span_e) in spans_gold:
        gold_by_sid[sentence_id].add((span_s, span_e))
    pred_by_sid = defaultdict(set)
    for (sentence_id, span_s, span_e) in spans_pred:
        pred_by_sid[sentence_id].add((span_s, span_e))
    for row in df.drop_duplicates('sentence_id').itertuples():
        if gold_by_sid[row.sentence_id] == pred_by_sid[row.sentence_id]:
            continue
        doc = nlp(row.sentence)
        doc.spans["sc"] = [
            Span(doc, span_s, span_e, "GOLD") for (span_s, span_e) in gold_by_sid[row.sentence_id]
        ] + [
            Span(doc, span_s, span_e, "PRED") for (span_s, span_e) in pred_by_sid[row.sentence_id]
        ]
        yield doc
```
%% Cell type:markdown id: tags:

Let’s inspect a small sample of the training data in this way. The following cell renders sentences containing mistakes that the automated prediction makes on the _first 500 rows_ of the training data (you may have to click on “Show more outputs” at the bottom to see all of them):

%% Cell type:code id: tags:solution

``` python
df_inspect = df_train[:500]
spans_inspect_pred = set(pred_spans(df_inspect))
for doc in error_report(df_inspect, set(gold_spans(df_inspect)), spans_inspect_pred):
    # Render the gold and predicted spans with spaCy's span visualizer
    # (the body of this loop is an assumed reconstruction)
    displacy.render(doc, style="span")
```

%% Cell type:markdown id: tags:

Can you see any patterns in the mistakes from the sample above? **Write a short text** that summarizes your observations!

%% Cell type:markdown id: tags:

We can see four types of mistakes that our model consistently makes. It labels points in time such as "tomorrow" or "Monday", and it labels numbers such as "16.4 percent" or "two". It also includes the article "the" in front of names such as "the European ...". Finally, it misses non-English words.
%% Cell type:markdown id: tags:

### Task 3.2

Based on your insights from the error analysis, you should be able to improve the automated prediction that you implemented in Problem 2. While the best way to do this would be to [update spaCy’s NER model](https://spacy.io/usage/linguistic-features#updating) using domain-specific training data, for this lab it suffices to **write code to post-process the output** produced by spaCy. To filter out specific labels it is useful to know the named entity label scheme, which can be found in the [model's documentation](https://spacy.io/models/en#en_core_web_sm).
%% Cell type:code id: tags:solution

``` python
def pred_spans_improved(df):
    """Run and evaluate spaCy's NER, with post-processing to improve the results.

    Arguments:
        df: A data frame.

    Yields:
        The predicted mention spans in the specified data frame as
        triples consisting of the sentence id, start position, and end
        position of each span.
    """
    # One possible post-processing strategy, based on the error analysis above
    # (the exact rules below are an assumed reconstruction): skip entity types
    # such as dates and numbers, and strip a leading "the" from predicted spans.
    skip_labels = {'DATE', 'TIME', 'CARDINAL', 'ORDINAL', 'QUANTITY', 'PERCENT', 'MONEY'}
    for row in df.itertuples():
        doc = nlp(row[2])
        for ent in doc.ents:
            if ent.label_ in skip_labels:
                continue
            start, end = ent.start, ent.end
            if doc[start].lower_ == 'the' and end - start > 1:
                start += 1
            yield row[1], start, end
```

%% Cell type:code id: tags:

``` python
# Evaluate the improved predictions on the development data
# (these setup lines are an assumed reconstruction)
spans_pred_improved = set(pred_spans_improved(df_dev))
scores_improved = evaluation_scores(spans_gold, spans_pred_improved)
print_evaluation_scores(scores_improved)

assert scores_improved[-1] > .8, "F1-score should be above 0.8"
success()
```
%% Output
%% Cell type:markdown id: tags:

### Task 3.3

Before moving on, we ask you to **store the outputs of the improved named entity recognizer in a new data frame**. This new frame should have the same layout as the original data frame for the _development data_ that you loaded above, but should contain the *predicted* start and end positions for each token span, rather than the gold positions. As the `label` of each span, you can use the special value `--NME--` for now.

%% Cell type:code id: tags:

``` python
def df_with_pred_spans(df):
    """Make a new DataFrame with *predicted* NER spans.

    Arguments:
        df: A data frame.

    Returns:
        A *new* data frame with the same layout as `df`, but containing
        the predicted start and end positions for each token span.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
```
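%% Cell type:markdown id: tags:

For reference, one possible way to build such a frame is sketched below (a sketch, not the only solution; it assumes the column layout `sentence_id`, `sentence`, `start`, `end`, `label` and reuses `pred_spans_improved()` from Task 3.2):

%% Cell type:code id: tags:

``` python
def df_with_pred_spans_sketch(df):
    # Look up each sentence once by its id, then emit one row per predicted span
    sentences = df.drop_duplicates('sentence_id').set_index('sentence_id')['sentence']
    rows = [(sid, sentences[sid], start, end, '--NME--')
            for sid, start, end in pred_spans_improved(df)]
    return pd.DataFrame(rows, columns=['sentence_id', 'sentence', 'start', 'end', 'label'])
```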
%% Cell type:markdown id: tags:

Run the following cell to run your function and display the first few lines of the new data frame:

%% Cell type:code id: tags:solution

``` python
df_dev_pred = df_with_pred_spans(df_dev)
display(df_dev_pred.head())
```

%% Output

%% Cell type:markdown id: tags:
## Problem 4: Entity linking

%% Cell type:markdown id: tags:

Now that we have a method for predicting mention spans, we turn to the task of **entity linking**, which amounts to predicting the knowledge base entity that is referenced by a given mention. In our case, for each span, we want to predict the Wikipedia page that this mention references.

%% Cell type:markdown id: tags:

### Task 4.1

Start by **extending the generator function** that you implemented in Problem 2 to **labelled spans**.
%% Cell type:code id: tags:

``` python
def gold_mentions(df):
    """Yield the gold-standard mentions in a data frame.

    Args:
        df: A data frame.

    Yields:
        The gold-standard mention spans in the specified data frame as
        quadruples consisting of the sentence id, start position, end
        position and entity label of each span.
    """
    for row in df.itertuples():
        yield row[1], row[3], row[4], row[5]
```
%% Cell type:markdown id: tags:

#### 🤞 Test your code

To test your code, you can run the following cell, which checks if one of the expected tuples is included in the results:

%% Cell type:code id: tags:solution

``` python
dev_gold_mentions = set(gold_mentions(df_dev))

assert ('1094-020', 0, 1, 'Seattle_Mariners') in dev_gold_mentions, "An expected tuple is not included in the results"
success()
```

%% Output
%% Cell type:markdown id: tags:

### Task 4.2

A naive baseline for entity linking on our data set is to link each mention span to the Wikipedia page name that we get when we join the tokens in the span by underscores, as is standard in Wikipedia page names. Suppose, for example, that a span contains the two tokens

    Jimi Hendrix

The baseline Wikipedia page name for this span would be

    Jimi_Hendrix

**Implement this naive baseline and evaluate its performance!**

%% Cell type:markdown id: tags:

**_Important:_** Here and in the remainder of this lab, you should base your experiments on the _predicted spans_ that you computed in Problem 3.
%% Cell type:code id: tags:solution

``` python
def baseline(df):
    """A naive baseline for entity linking that "predicts" Wikipedia
    page names from the tokens in the mention span.

    Arguments:
        df: A data frame.

    Yields:
        The predicted mention spans in the specified data frame as
        quadruples consisting of the sentence id, start position, end
        position and the predicted entity label of each span.
    """
    # Join the tokens in each span with underscores to form a page name
    # (this body is an assumed reconstruction of the baseline described above)
    for row in df.itertuples():
        tokens = row[2].split()[row[3]:row[4]]
        yield row[1], row[3], row[4], '_'.join(tokens)
```

%% Cell type:markdown id: tags:

Again, we can turn to the evaluation measures that we implemented in Problem 1. The expected precision should be around 29%, with an F1-score around 28%.

%% Cell type:code id: tags:

``` python
# Evaluate the baseline on the predicted spans from Problem 3
# (these setup lines are an assumed reconstruction)
mentions_gold = set(gold_mentions(df_dev))
mentions_pred = set(baseline(df_dev_pred))
scores = evaluation_scores(mentions_gold, mentions_pred)
print_evaluation_scores(scores)

assert scores[0] > .28, "Precision should be above 28%"
assert scores[-1] > .27, "F1-score should be above 27%"
success()
```
%% Output
Precision: 0.301, Recall: 0.274, F1: 0.287
%% Cell type:markdown id: tags:

## Problem 5: Extending the training data using the knowledge base

%% Cell type:markdown id: tags:

State-of-the-art approaches to entity linking exploit information in knowledge bases. In our case, where Wikipedia is the knowledge base, one particularly useful type of information is the set of links to other Wikipedia pages. In particular, we can interpret the anchor texts (the highlighted texts that you click on) as mentions of the entities (pages) that they link to. This allows us to harvest long lists of mention–entity pairings.

The following cell loads a data frame summarizing anchor texts and page references harvested from the first paragraphs of the English Wikipedia. The data frame also contains all entity mentions in the training data (but not the development or the test data).

%% Cell type:markdown id: tags:

To understand what information is available in this data, the following cell shows the entry for the anchor text `Sweden`.
%% Cell type:code id: tags:

``` python
df_kb.loc[df_kb.mention == 'Sweden']
```
%% Cell type:markdown id: tags:

As you can see, each row of the data frame contains a pair $(m, e)$ of a mention $m$ and an entity $e$, as well as the conditional probability $P(e|m)$ for mention $m$ referring to entity $e$. These probabilities were estimated based on the frequencies of mention–entity pairs in the knowledge base. The example shows that the anchor text ‘Sweden’ is most often used to refer to the entity [Sweden](http://en.wikipedia.org/wiki/Sweden), but in a few cases also to refer to Sweden’s national football and ice hockey teams. Note that references are sorted in decreasing order of probability, so that the most probable pairing comes first.
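Because of this ordering, the most probable entity for a given mention can be read off the first matching row. A minimal sketch (the name of the column holding the page name is an assumption here):

%% Cell type:code id: tags:

``` python
# Most probable entity for the mention 'Sweden' (assuming the page name is
# stored in a column called `entity`)
df_kb.loc[df_kb.mention == 'Sweden'].iloc[0].entity
```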
%% Cell type:markdown id: tags:

**Implement an entity linking method** that resolves each mention to the most probable entity in the data frame. If the mention is not included in the data frame, you can predict the generic label `--NME--`.
%% Cell type:code id: tags:solution

``` python
def most_probable_method(df, df_kb):
    """An entity linker that resolves each mention to the most probable entity in a knowledge base.

    Arguments:
        df: A data frame containing the mention spans.
        df_kb: A data frame containing the knowledge base.

    Yields:
        The predicted mention spans in the specified data frame as
        quadruples consisting of the sentence id, start position, end
        position and the predicted entity label of each span.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
```
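%% Cell type:markdown id: tags:

One possible implementation is sketched below. It is only a sketch: it assumes that the rows of `df_kb` are sorted by decreasing probability (as noted above), that the page name is stored in a column called `entity`, and that mentions in the knowledge base are written with spaces between tokens:

%% Cell type:code id: tags:

``` python
def most_probable_method_sketch(df, df_kb):
    # Since the rows for each mention are sorted by decreasing probability,
    # the first row per mention is the most probable entity
    best = df_kb.drop_duplicates('mention').set_index('mention')
    for row in df.itertuples():
        mention = ' '.join(row[2].split()[row[3]:row[4]])
        if mention in best.index:
            yield row[1], row[3], row[4], best.loc[mention].entity
        else:
            yield row[1], row[3], row[4], '--NME--'
```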
%% Cell type:markdown id: tags:

### 🤞 Test your code

We run the same evaluation as before. The expected precision should now be around 65%, with an F1-score of around 59%.

%% Cell type:code id: tags:

``` python
# Evaluate the most-probable-entity linker on the predicted spans
# (these setup lines are an assumed reconstruction)
mentions_pred = set(most_probable_method(df_dev_pred, df_kb))
scores = evaluation_scores(mentions_gold, mentions_pred)
print_evaluation_scores(scores)

assert scores[0] > .64, "Precision should be above 64%"
assert scores[-1] > .58, "F1-score should be above 58%"
success()
```
%% Cell type:markdown id: tags:

## Problem 6: Context-sensitive disambiguation

%% Cell type:markdown id: tags:

Consider the entity mention ‘Lincoln’. The most probable entity for this mention turns out to be [Lincoln, Nebraska](http://en.wikipedia.org/Lincoln,_Nebraska); but in pages about American history, we would be better off predicting [Abraham Lincoln](http://en.wikipedia.org/Abraham_Lincoln). This suggests that we should try to disambiguate between different entity references based on the textual context on the page from which the mention was taken. Your task in this last problem is to implement this idea.

Set up a dictionary that contains, for each mention $m$ that can refer to more than one entity $e$, a separate Naive Bayes classifier that is trained to predict the correct entity $e$, given the textual context of the mention. As the prior probabilities of the classifier, choose the probabilities $P(e|m)$ that you used in Problem 5. To let you estimate the context-specific probabilities, we have compiled a data set with mention contexts:

%% Cell type:markdown id: tags:

This data frame contains, for each ambiguous mention $m$ and each knowledge base entity $e$ to which this mention can refer, up to 100 randomly selected contexts in which $m$ is used to refer to $e$. For this data, a **context** is defined as the 5 tokens to the left and the 5 tokens to the right of the mention. Here are a few examples:
%% Cell type:code id: tags:

``` python
df_contexts.head()
```

%% Cell type:markdown id: tags:

Note that, in each context, the position of the mention is indicated by the `@` symbol.

From this data frame, it is easy to select the data that you need to train the classifiers – the contexts and corresponding entities for all mentions. To illustrate this, the following cell shows how to select all contexts that belong to the mention ‘Lincoln’:
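(A sketch of this selection, assuming `df_contexts` has a `mention` column analogous to `df_kb`:)

%% Cell type:code id: tags:

``` python
# Select all contexts that belong to the mention 'Lincoln'
df_contexts[df_contexts.mention == 'Lincoln']
```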
%% Cell type:markdown id: tags:

Implement the context-sensitive disambiguation method and evaluate its performance. Do this in two parts, first implementing a function that builds the classifiers _(refer to the text above for a detailed description)_, then implementing a prediction function that uses these classifiers to perform the entity prediction.

Here are some more **hints** that may help you along the way:

1. The prior probabilities for a Naive Bayes classifier can be specified using the `class_prior` option. You will have to provide the probabilities in the same order as the alphabetically sorted class (entity) names.
2. Not all mentions in the knowledge base are ambiguous, and therefore not all mentions have context data. If a mention has only one possible entity, pick that one. If a mention has no entity at all, predict the `--NME--` label.
%% Cell type:code id: tags:solution

``` python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

def build_entity_classifiers(df_kb, df_contexts):
    """Build Naive Bayes classifiers for entity prediction.

    Arguments:
        df_kb: A data frame with the knowledge base.
        df_contexts: A data frame with contexts for each mention.

    Returns:
        A dictionary where the keys are mentions and the values are Naive Bayes
        classifiers trained to predict the correct entity, given the textual
        context of the mention (as described in detail above).
    """
    # Assumed sketch of one possible implementation; the column names `entity`,
    # `probability` (df_kb) and `entity`, `context` (df_contexts) are assumptions.
    classifiers = {}
    for mention, group in df_contexts.groupby('mention'):
        # Prior probabilities P(e|m), ordered by the alphabetically sorted
        # entity names (cf. hint 1 above); a small value avoids zero priors
        entities = sorted(group.entity.unique())
        priors = (df_kb[df_kb.mention == mention].set_index('entity')['probability']
                  .reindex(entities).fillna(1e-6).to_numpy())
        clf = make_pipeline(CountVectorizer(), MultinomialNB(class_prior=priors))
        clf.fit(group.context, group.entity)
        classifiers[mention] = clf
    return classifiers
```

%% Cell type:markdown id: tags:

Finally, the cell below evaluates the results as before. You should expect to see a small (around 1 unit) increase in each of precision, recall, and F1.

%% Cell type:markdown id: tags:

<strong>After you have solved the lab,</strong> write a <em>brief</em> reflection (max. one A4 page) on the question(s) below. Remember:
<ul>
<li>You are encouraged to discuss this part with your lab partner, but you should each write up your reflection <strong>individually</strong>.</li>
<li><strong>Do not put your answers in the notebook</strong>; upload them in the separate submission opportunity for the reflections on Lisam.</li>
</ul>
</div>
%% Cell type:markdown id: tags:

1. In Problem 3, you performed an error analysis and implemented some post-processing to improve the model’s evaluation scores. How could you improve the model’s performance further, and what kind of resources (such as data, compute, etc.) would you need for that? Discuss this based on two or three concrete examples from the error analysis.
2. How does the “context” data from Problem 6 help to disambiguate between different entities? Can you think of other types of “context” that you could use for disambiguation? Illustrate this with a specific example.

➡️ Before you submit, **make sure the notebook can be run from start to finish** without errors. For this, _restart the kernel_ and _run all cells_ from top to bottom. In Jupyter Notebook version 7 or higher, you can do this via "Run$\rightarrow$Restart Kernel and Run All Cells..." in the menu (or the "⏩" button in the toolbar).