diff --git a/l2/TM-Lab2.ipynb b/l2/TM-Lab2.ipynb index 309b149a4670a7baa178c18043e497b34ecb0a71..34f2502510d6c383ceb12334b361584b0a77f2e6 100644 --- a/l2/TM-Lab2.ipynb +++ b/l2/TM-Lab2.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 311, "metadata": { "editable": true, "slideshow": { @@ -47,7 +47,7 @@ }, "outputs": [], "source": [ - "%matplotlib inline" + "#%matplotlib inline" ] }, { @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 312, "metadata": { "deletable": false, "editable": false, @@ -111,9 +111,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 313, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>words</th>\n", + " <th>party</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>H5-002-004</td>\n", + " <td>eders majestäter eders kungliga högheter herr ...</td>\n", + " <td>S</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>H5-003-001</td>\n", + " <td>aktuell debatt om situationen för ensamkommand...</td>\n", + " <td>V</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>H5-003-002</td>\n", + " <td>herr talman och ledamöter jag vill börja med a...</td>\n", + " <td>S</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>H5-003-003</td>\n", + " <td>herr talman åhörare den här debatten handlar a...</td>\n", + " <td>M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>H5-003-004</td>\n", 
+ " <td>herr talman ansvar och rättssäkerhet är två or...</td>\n", + " <td>SD</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id words party\n", + "0 H5-002-004 eders majestäter eders kungliga högheter herr ... S\n", + "1 H5-003-001 aktuell debatt om situationen för ensamkommand... V\n", + "2 H5-003-002 herr talman och ledamöter jag vill börja med a... S\n", + "3 H5-003-003 herr talman åhörare den här debatten handlar a... M\n", + "4 H5-003-004 herr talman ansvar och rättssäkerhet är två or... SD" + ] + }, + "execution_count": 313, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "speeches_201718.head()" ] @@ -127,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 314, "metadata": { "deletable": false, "editable": false, @@ -156,9 +231,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 315, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['C', 'KD', 'L', 'M', 'MP', 'S', 'SD', 'V']\n" + ] + } + ], "source": [ "parties = sorted(training_data['party'].unique())\n", "print(parties)" @@ -191,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 316, "metadata": {}, "outputs": [], "source": [ @@ -202,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 317, "metadata": { "deletable": false, "nbgrader": { @@ -220,17 +303,37 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "\"\"\"Produce a plot for the 2017/2018 speeches.\"\"\"\n", "\n", - "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "def plot_speeches(speech_data):\n", + " pa = np.array(speech_data[\"party\"])\n", + " 
unique, counts = np.unique(pa, return_counts=True)\n", + " argsorter = counts.argsort()[::-1]\n", + " sorted_uniques = unique[argsorter]\n", + " sorted_counts = counts[argsorter]\n", + " plt.bar(sorted_uniques, sorted_counts)\n", + " plt.show()\n", + "plot_speeches(training_data)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 318, "metadata": { "deletable": false, "nbgrader": { @@ -248,12 +351,22 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "\"\"\"Produce a plot for the 2018/2019 speeches.\"\"\"\n", "\n", - "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "plot_speeches(test_data)" ] }, { @@ -286,7 +399,7 @@ } }, "source": [ - "YOUR ANSWER HERE" + "While S and M dominated both years, there were some changes. For instance, MP made the third-most speeches in the first year, but then dropped to sixth place. Most parties made fewer (or far fewer) speeches in the second year, the exceptions being SD and V. This isn't very strange, since 2018 was the election year, so they probably campaigned a lot before it. It's also worth noting that SD and V gained a lot more votes in that election, while MP lost votes, and that is reflected in the number of speeches seen in the plots. With regard to SD and V, edge parties tend to do more campaigning, so that could be a factor."
] }, { @@ -313,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 319, "metadata": { "deletable": false, "nbgrader": { @@ -331,10 +444,479 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Macro precision score against test data: 0.5736152919943005\n", + "Macro precision score against training data: 0.8911125377417104\n", + "Classification report against test data:\n", + " precision recall f1-score support\n", + "\n", + " C 0.63 0.04 0.07 671\n", + " KD 0.70 0.02 0.03 821\n", + " L 0.92 0.02 0.04 560\n", + " M 0.36 0.68 0.47 1644\n", + " MP 0.36 0.25 0.29 809\n", + " S 0.46 0.84 0.59 2773\n", + " SD 0.57 0.12 0.20 1060\n", + " V 0.59 0.15 0.24 950\n", + "\n", + " accuracy 0.43 9288\n", + " macro avg 0.57 0.26 0.24 9288\n", + "weighted avg 0.52 0.43 0.34 9288\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "<style>#sk-container-id-7 {\n", + " /* Definition of color scheme common for light and dark mode */\n", + " --sklearn-color-text: black;\n", + " --sklearn-color-line: gray;\n", + " /* Definition of color scheme for unfitted estimators */\n", + " --sklearn-color-unfitted-level-0: #fff5e6;\n", + " --sklearn-color-unfitted-level-1: #f6e4d2;\n", + " --sklearn-color-unfitted-level-2: #ffe0b3;\n", + " --sklearn-color-unfitted-level-3: chocolate;\n", + " /* Definition of color scheme for fitted estimators */\n", + " --sklearn-color-fitted-level-0: #f0f8ff;\n", + " --sklearn-color-fitted-level-1: #d4ebff;\n", + " --sklearn-color-fitted-level-2: #b3dbfd;\n", + " --sklearn-color-fitted-level-3: cornflowerblue;\n", + "\n", + " /* Specific color for light theme */\n", + " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n", + " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n", + " --sklearn-color-border-box: 
var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n", + " --sklearn-color-icon: #696969;\n", + "\n", + " @media (prefers-color-scheme: dark) {\n", + " /* Redefinition of color scheme for dark theme */\n", + " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n", + " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n", + " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n", + " --sklearn-color-icon: #878787;\n", + " }\n", + "}\n", + "\n", + "#sk-container-id-7 {\n", + " color: var(--sklearn-color-text);\n", + "}\n", + "\n", + "#sk-container-id-7 pre {\n", + " padding: 0;\n", + "}\n", + "\n", + "#sk-container-id-7 input.sk-hidden--visually {\n", + " border: 0;\n", + " clip: rect(1px 1px 1px 1px);\n", + " clip: rect(1px, 1px, 1px, 1px);\n", + " height: 1px;\n", + " margin: -1px;\n", + " overflow: hidden;\n", + " padding: 0;\n", + " position: absolute;\n", + " width: 1px;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-dashed-wrapped {\n", + " border: 1px dashed var(--sklearn-color-line);\n", + " margin: 0 0.4em 0.5em 0.4em;\n", + " box-sizing: border-box;\n", + " padding-bottom: 0.4em;\n", + " background-color: var(--sklearn-color-background);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-container {\n", + " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n", + " but bootstrap.min.css set `[hidden] { display: none !important; }`\n", + " so we also need the `!important` here to be able to override the\n", + " default hidden behavior on the sphinx rendered scikit-learn.org.\n", + " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n", + " display: inline-block !important;\n", + " position: relative;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-text-repr-fallback {\n", + " 
display: none;\n", + "}\n", + "\n", + "div.sk-parallel-item,\n", + "div.sk-serial,\n", + "div.sk-item {\n", + " /* draw centered vertical line to link estimators */\n", + " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n", + " background-size: 2px 100%;\n", + " background-repeat: no-repeat;\n", + " background-position: center center;\n", + "}\n", + "\n", + "/* Parallel-specific style estimator block */\n", + "\n", + "#sk-container-id-7 div.sk-parallel-item::after {\n", + " content: \"\";\n", + " width: 100%;\n", + " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n", + " flex-grow: 1;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-parallel {\n", + " display: flex;\n", + " align-items: stretch;\n", + " justify-content: center;\n", + " background-color: var(--sklearn-color-background);\n", + " position: relative;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-parallel-item {\n", + " display: flex;\n", + " flex-direction: column;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-parallel-item:first-child::after {\n", + " align-self: flex-end;\n", + " width: 50%;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-parallel-item:last-child::after {\n", + " align-self: flex-start;\n", + " width: 50%;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-parallel-item:only-child::after {\n", + " width: 0;\n", + "}\n", + "\n", + "/* Serial-specific style estimator block */\n", + "\n", + "#sk-container-id-7 div.sk-serial {\n", + " display: flex;\n", + " flex-direction: column;\n", + " align-items: center;\n", + " background-color: var(--sklearn-color-background);\n", + " padding-right: 1em;\n", + " padding-left: 1em;\n", + "}\n", + "\n", + "\n", + "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n", + "clickable and can be expanded/collapsed.\n", + "- Pipeline and ColumnTransformer use this feature and define the default 
style\n", + "- Estimators will overwrite some part of the style using the `sk-estimator` class\n", + "*/\n", + "\n", + "/* Pipeline and ColumnTransformer style (default) */\n", + "\n", + "#sk-container-id-7 div.sk-toggleable {\n", + " /* Default theme specific background. It is overwritten whether we have a\n", + " specific estimator or a Pipeline/ColumnTransformer */\n", + " background-color: var(--sklearn-color-background);\n", + "}\n", + "\n", + "/* Toggleable label */\n", + "#sk-container-id-7 label.sk-toggleable__label {\n", + " cursor: pointer;\n", + " display: block;\n", + " width: 100%;\n", + " margin-bottom: 0;\n", + " padding: 0.5em;\n", + " box-sizing: border-box;\n", + " text-align: center;\n", + "}\n", + "\n", + "#sk-container-id-7 label.sk-toggleable__label-arrow:before {\n", + " /* Arrow on the left of the label */\n", + " content: \"▸\";\n", + " float: left;\n", + " margin-right: 0.25em;\n", + " color: var(--sklearn-color-icon);\n", + "}\n", + "\n", + "#sk-container-id-7 label.sk-toggleable__label-arrow:hover:before {\n", + " color: var(--sklearn-color-text);\n", + "}\n", + "\n", + "/* Toggleable content - dropdown */\n", + "\n", + "#sk-container-id-7 div.sk-toggleable__content {\n", + " max-height: 0;\n", + " max-width: 0;\n", + " overflow: hidden;\n", + " text-align: left;\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-unfitted-level-0);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-toggleable__content.fitted {\n", + " /* fitted */\n", + " background-color: var(--sklearn-color-fitted-level-0);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-toggleable__content pre {\n", + " margin: 0.2em;\n", + " border-radius: 0.25em;\n", + " color: var(--sklearn-color-text);\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-unfitted-level-0);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-toggleable__content.fitted pre {\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-fitted-level-0);\n", + 
"}\n", + "\n", + "#sk-container-id-7 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n", + " /* Expand drop-down */\n", + " max-height: 200px;\n", + " max-width: 100%;\n", + " overflow: auto;\n", + "}\n", + "\n", + "#sk-container-id-7 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n", + " content: \"▾\";\n", + "}\n", + "\n", + "/* Pipeline/ColumnTransformer-specific style */\n", + "\n", + "#sk-container-id-7 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", + " color: var(--sklearn-color-text);\n", + " background-color: var(--sklearn-color-unfitted-level-2);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", + " background-color: var(--sklearn-color-fitted-level-2);\n", + "}\n", + "\n", + "/* Estimator-specific style */\n", + "\n", + "/* Colorize estimator box */\n", + "#sk-container-id-7 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-unfitted-level-2);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", + " /* fitted */\n", + " background-color: var(--sklearn-color-fitted-level-2);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-label label.sk-toggleable__label,\n", + "#sk-container-id-7 div.sk-label label {\n", + " /* The background is the default theme color */\n", + " color: var(--sklearn-color-text-on-default-background);\n", + "}\n", + "\n", + "/* On hover, darken the color of the background */\n", + "#sk-container-id-7 div.sk-label:hover label.sk-toggleable__label {\n", + " color: var(--sklearn-color-text);\n", + " background-color: var(--sklearn-color-unfitted-level-2);\n", + "}\n", + "\n", + "/* Label box, darken color on hover, fitted */\n", + "#sk-container-id-7 div.sk-label.fitted:hover 
label.sk-toggleable__label.fitted {\n", + " color: var(--sklearn-color-text);\n", + " background-color: var(--sklearn-color-fitted-level-2);\n", + "}\n", + "\n", + "/* Estimator label */\n", + "\n", + "#sk-container-id-7 div.sk-label label {\n", + " font-family: monospace;\n", + " font-weight: bold;\n", + " display: inline-block;\n", + " line-height: 1.2em;\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-label-container {\n", + " text-align: center;\n", + "}\n", + "\n", + "/* Estimator-specific */\n", + "#sk-container-id-7 div.sk-estimator {\n", + " font-family: monospace;\n", + " border: 1px dotted var(--sklearn-color-border-box);\n", + " border-radius: 0.25em;\n", + " box-sizing: border-box;\n", + " margin-bottom: 0.5em;\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-unfitted-level-0);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-estimator.fitted {\n", + " /* fitted */\n", + " background-color: var(--sklearn-color-fitted-level-0);\n", + "}\n", + "\n", + "/* on hover */\n", + "#sk-container-id-7 div.sk-estimator:hover {\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-unfitted-level-2);\n", + "}\n", + "\n", + "#sk-container-id-7 div.sk-estimator.fitted:hover {\n", + " /* fitted */\n", + " background-color: var(--sklearn-color-fitted-level-2);\n", + "}\n", + "\n", + "/* Specification for estimator info (e.g. 
\"i\" and \"?\") */\n", + "\n", + "/* Common style for \"i\" and \"?\" */\n", + "\n", + ".sk-estimator-doc-link,\n", + "a:link.sk-estimator-doc-link,\n", + "a:visited.sk-estimator-doc-link {\n", + " float: right;\n", + " font-size: smaller;\n", + " line-height: 1em;\n", + " font-family: monospace;\n", + " background-color: var(--sklearn-color-background);\n", + " border-radius: 1em;\n", + " height: 1em;\n", + " width: 1em;\n", + " text-decoration: none !important;\n", + " margin-left: 1ex;\n", + " /* unfitted */\n", + " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n", + " color: var(--sklearn-color-unfitted-level-1);\n", + "}\n", + "\n", + ".sk-estimator-doc-link.fitted,\n", + "a:link.sk-estimator-doc-link.fitted,\n", + "a:visited.sk-estimator-doc-link.fitted {\n", + " /* fitted */\n", + " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n", + " color: var(--sklearn-color-fitted-level-1);\n", + "}\n", + "\n", + "/* On hover */\n", + "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n", + ".sk-estimator-doc-link:hover,\n", + "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n", + ".sk-estimator-doc-link:hover {\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-unfitted-level-3);\n", + " color: var(--sklearn-color-background);\n", + " text-decoration: none;\n", + "}\n", + "\n", + "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n", + ".sk-estimator-doc-link.fitted:hover,\n", + "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n", + ".sk-estimator-doc-link.fitted:hover {\n", + " /* fitted */\n", + " background-color: var(--sklearn-color-fitted-level-3);\n", + " color: var(--sklearn-color-background);\n", + " text-decoration: none;\n", + "}\n", + "\n", + "/* Span, style for the box shown on hovering the info icon */\n", + ".sk-estimator-doc-link span {\n", + " display: none;\n", + " z-index: 9999;\n", + " position: relative;\n", + " font-weight: normal;\n", + " right: .2ex;\n", + " 
padding: .5ex;\n", + " margin: .5ex;\n", + " width: min-content;\n", + " min-width: 20ex;\n", + " max-width: 50ex;\n", + " color: var(--sklearn-color-text);\n", + " box-shadow: 2pt 2pt 4pt #999;\n", + " /* unfitted */\n", + " background: var(--sklearn-color-unfitted-level-0);\n", + " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n", + "}\n", + "\n", + ".sk-estimator-doc-link.fitted span {\n", + " /* fitted */\n", + " background: var(--sklearn-color-fitted-level-0);\n", + " border: var(--sklearn-color-fitted-level-3);\n", + "}\n", + "\n", + ".sk-estimator-doc-link:hover span {\n", + " display: block;\n", + "}\n", + "\n", + "/* \"?\"-specific style due to the `<a>` HTML tag */\n", + "\n", + "#sk-container-id-7 a.estimator_doc_link {\n", + " float: right;\n", + " font-size: 1rem;\n", + " line-height: 1em;\n", + " font-family: monospace;\n", + " background-color: var(--sklearn-color-background);\n", + " border-radius: 1rem;\n", + " height: 1rem;\n", + " width: 1rem;\n", + " text-decoration: none;\n", + " /* unfitted */\n", + " color: var(--sklearn-color-unfitted-level-1);\n", + " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n", + "}\n", + "\n", + "#sk-container-id-7 a.estimator_doc_link.fitted {\n", + " /* fitted */\n", + " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n", + " color: var(--sklearn-color-fitted-level-1);\n", + "}\n", + "\n", + "/* On hover */\n", + "#sk-container-id-7 a.estimator_doc_link:hover {\n", + " /* unfitted */\n", + " background-color: var(--sklearn-color-unfitted-level-3);\n", + " color: var(--sklearn-color-background);\n", + " text-decoration: none;\n", + "}\n", + "\n", + "#sk-container-id-7 a.estimator_doc_link.fitted:hover {\n", + " /* fitted */\n", + " background-color: var(--sklearn-color-fitted-level-3);\n", + "}\n", + "</style><div id=\"sk-container-id-7\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[('vectorizer', CountVectorizer()),\n", + " ('classifier', 
MultinomialNB())])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-21\" type=\"checkbox\" ><label for=\"sk-estimator-id-21\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[('vectorizer', CountVectorizer()),\n", + " ('classifier', MultinomialNB())])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-22\" type=\"checkbox\" ><label for=\"sk-estimator-id-22\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> CountVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\">?<span>Documentation for CountVectorizer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>CountVectorizer()</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-23\" type=\"checkbox\" ><label for=\"sk-estimator-id-23\" class=\"sk-toggleable__label fitted 
sk-toggleable__label-arrow fitted\"> MultinomialNB<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.naive_bayes.MultinomialNB.html\">?<span>Documentation for MultinomialNB</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>MultinomialNB()</pre></div> </div></div></div></div></div></div>" + ], + "text/plain": [ + "Pipeline(steps=[('vectorizer', CountVectorizer()),\n", + " ('classifier', MultinomialNB())])" + ] + }, + "execution_count": 319, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.metrics import classification_report, precision_score\n", + "train_X = np.array(training_data[\"words\"])\n", + "train_Y = np.array(training_data[\"party\"])\n", + "\n", + "test_X = np.array(test_data[\"words\"])\n", + "test_Y = np.array(test_data[\"party\"])\n", + "def run_naive_bayes_classifier(train_X, train_Y, test_X, test_Y):\n", + " pipeline = Pipeline(steps=[\n", + " ('vectorizer', CountVectorizer()),\n", + " ('classifier', MultinomialNB())\n", + " ])\n", + " model = pipeline.fit(train_X, train_Y)\n", + " predicted = model.predict(test_X)\n", + " predicted2 = model.predict(train_X)\n", + "\n", + " score1 = (precision_score(test_Y, predicted, average='macro'))\n", + " score2 = (precision_score(train_Y, predicted2, average='macro'))\n", + " print(f'Macro precision score against test data: {score1}')\n", + " print(f'Macro precision score against training data: {score2}')\n", + " print(\"Classification report against test data:\")\n", + " print(classification_report(test_Y, predicted))\n", + " return model\n", + "run_naive_bayes_classifier(train_X, train_Y, test_X, test_Y)" ] 
}, { @@ -364,7 +946,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 320, "metadata": { "deletable": false, "nbgrader": { @@ -382,10 +964,49 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " C 0.00 0.00 0.00 671\n", + " KD 0.00 0.00 0.00 821\n", + " L 0.00 0.00 0.00 560\n", + " M 0.00 0.00 0.00 1644\n", + " MP 0.00 0.00 0.00 809\n", + " S 0.30 1.00 0.46 2773\n", + " SD 0.00 0.00 0.00 1060\n", + " V 0.00 0.00 0.00 950\n", + "\n", + " accuracy 0.30 9288\n", + " macro avg 0.04 0.12 0.06 9288\n", + "weighted avg 0.09 0.30 0.14 9288\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jackkolm/Documents/TextMining/text-mining/.venv/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/Users/jackkolm/Documents/TextMining/text-mining/.venv/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/Users/jackkolm/Documents/TextMining/text-mining/.venv/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" + ] + } + ], "source": [ - "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "from sklearn.dummy import DummyClassifier\n", + "\n", + "dc_mf = DummyClassifier(strategy='most_frequent')\n", + "model = dc_mf.fit(train_X, train_Y)\n", + "#print(model.score(test_X, test_Y))\n", + "predicted = model.predict(test_X)\n", + "print(classification_report(test_Y, predicted))\n" ] }, { @@ -399,7 +1020,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 321, "metadata": { "deletable": false, "nbgrader": { @@ -417,10 +1038,35 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " C 0.09 0.08 0.08 671\n", + " KD 0.10 0.07 0.08 821\n", + " L 0.06 0.06 0.06 560\n", + " M 0.18 0.20 0.19 1644\n", + " MP 0.09 0.11 0.10 809\n", + " S 0.30 0.35 0.32 2773\n", + " SD 0.13 0.08 0.10 1060\n", + " V 0.10 0.07 0.08 950\n", + "\n", + " accuracy 0.18 9288\n", + " macro avg 0.13 0.13 0.13 9288\n", + "weighted avg 0.17 0.18 0.18 9288\n", + "\n" + ] + } + ], "source": [ - "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "dc_stratified = DummyClassifier(strategy='stratified')\n", + "model = dc_stratified.fit(train_X, train_Y)\n", + "#print(model.score(test_X, test_Y))\n", + "predicted = model.predict(test_X)\n", + "print(classification_report(test_Y, predicted))\n" ] }, { @@ -443,7 +1089,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 322, "metadata": { "deletable": false, "nbgrader": { @@ -461,12 +1107,70 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Macro 
precision score against test data: 0.40694919223605797\n", + "Macro precision score against training data: 0.9349225198554201\n", + "Classification report against test data:\n", + " precision recall f1-score support\n", + "\n", + " C 0.27 0.45 0.34 671\n", + " KD 0.30 0.37 0.34 821\n", + " L 0.27 0.45 0.34 560\n", + " M 0.37 0.52 0.43 1644\n", + " MP 0.33 0.36 0.34 809\n", + " S 0.85 0.26 0.40 2773\n", + " SD 0.45 0.41 0.43 1060\n", + " V 0.40 0.53 0.46 950\n", + "\n", + " accuracy 0.40 9288\n", + " macro avg 0.41 0.42 0.39 9288\n", + "weighted avg 0.50 0.40 0.40 9288\n", + "\n" + ] + } + ], "source": [ "\"\"\"Implement undersampling with the classifier from Problem 2 and report its performance on the test data.\"\"\"\n", + "import random\n", "\n", - "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "\n", + "parties, _count = np.unique(training_data[\"party\"], return_counts=True)\n", + "min_count = min(_count.tolist())\n", + "\n", + "count_dict = {}\n", + "for party in parties:\n", + " count_dict[party] = 0\n", + "\n", + "indexes = [i for i in range(len(training_data[\"party\"]))]\n", + "random.shuffle(indexes)\n", + "\n", + "to_drop = []\n", + "\n", + "for i in indexes:\n", + " party = training_data[\"party\"][i]\n", + " count_dict[party] += 1\n", + " \n", + " if count_dict[party] > min_count:\n", + " to_drop.append(i)\n", + "\n", + "new_train_data = training_data.copy().drop(to_drop)\n", + "\n", + "plot_speeches(new_train_data)\n", + "current_model = run_naive_bayes_classifier(new_train_data[\"words\"], new_train_data[\"party\"], test_X, test_Y)\n" ] }, { @@ -487,19 +1191,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 323, "metadata": { "tags": [ "solution" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 2 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sklearn.metrics import ConfusionMatrixDisplay\n", "\n", 
"with sns.axes_style(\"white\"): # Seaborn’s default style doesn’t play well with ConfusionMatrixDisplay, so we change it temporarily\n", " ConfusionMatrixDisplay.from_estimator(\n", - " model, # The model that you want to plot the confusion matrix for\n", + " current_model, # The model that you want to plot the confusion matrix for\n", " test_data['words'], # The input data for the model\n", " test_data['party'], # The correct (gold-standard) labels for the input data\n", " normalize='true',\n", @@ -507,6 +1222,13 @@ " )" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -518,7 +1240,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 324, "metadata": { "deletable": false, "nbgrader": { @@ -533,10 +1255,38 @@ "task": false } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C is most often confused with M\n", + "KD is most often confused with M\n", + "L is most often confused with M\n", + "M is most often confused with C\n", + "MP is most often confused with M\n", + "S is most often confused with M\n", + "SD is most often confused with M\n", + "V is most often confused with M\n" + ] + } + ], "source": [ "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "from sklearn.metrics import confusion_matrix\n", + "\n", + "predictions = current_model.predict(test_X)\n", + "cm = confusion_matrix(test_Y, predictions, normalize='true')\n", + "for i in range(len(cm)):\n", + " cm[i][i] = 0\n", + "for index, val in enumerate(cm):\n", + " val = val.tolist()\n", + " biggest = max(val)\n", + " biggest_index = val.index(biggest)\n", + " party = parties[index]\n", + " biggest_party = parties[biggest_index]\n", + " print(f'{party} is most often confused with {biggest_party}')\n", + "#for predicted_class in predictions:" ] }, { @@ -566,12 +1316,49 @@ }, { "cell_type": "code", - "execution_count": 
null, + "execution_count": 325, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'memory': None,\n", + " 'steps': [('vectorizer', CountVectorizer()), ('classifier', MultinomialNB())],\n", + " 'verbose': False,\n", + " 'vectorizer': CountVectorizer(),\n", + " 'classifier': MultinomialNB(),\n", + " 'vectorizer__analyzer': 'word',\n", + " 'vectorizer__binary': False,\n", + " 'vectorizer__decode_error': 'strict',\n", + " 'vectorizer__dtype': numpy.int64,\n", + " 'vectorizer__encoding': 'utf-8',\n", + " 'vectorizer__input': 'content',\n", + " 'vectorizer__lowercase': True,\n", + " 'vectorizer__max_df': 1.0,\n", + " 'vectorizer__max_features': None,\n", + " 'vectorizer__min_df': 1,\n", + " 'vectorizer__ngram_range': (1, 1),\n", + " 'vectorizer__preprocessor': None,\n", + " 'vectorizer__stop_words': None,\n", + " 'vectorizer__strip_accents': None,\n", + " 'vectorizer__token_pattern': '(?u)\\\\b\\\\w\\\\w+\\\\b',\n", + " 'vectorizer__tokenizer': None,\n", + " 'vectorizer__vocabulary': None,\n", + " 'classifier__alpha': 1.0,\n", + " 'classifier__class_prior': None,\n", + " 'classifier__fit_prior': True,\n", + " 'classifier__force_alpha': True}" + ] + }, + "execution_count": 325, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Replace \"nb_pipe\" with the variable that has your model pipeline, if necessary\n", - "nb_pipe.get_params()" + " \n", + "current_model.get_params()" ] }, { @@ -607,10 +1394,57 @@ "solution" ] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jackkolm/Documents/TextMining/text-mining/.venv/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. 
This can be caused by a too short worker timeout or by a memory leak.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "# YOUR CODE HERE\n", - "raise NotImplementedError()" + "from sklearn.feature_extraction.text import TfidfVectorizer \n", + "from sklearn.linear_model import LogisticRegression \n", + "from sklearn.preprocessing import StandardScaler \n", + "from sklearn.decomposition import TruncatedSVD \n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "\n", + "\n", + "\n", + "new_model_params = current_model.get_params()\n", + "mid_pipeline = Pipeline(steps=[\n", + " ('vectorizer', CountVectorizer()),\n", + " ('classifier', MultinomialNB())\n", + " ])\n", + "me_params = {\n", + " 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],\n", + " 'vectorizer__max_df': [0.2, 0.5, 0.75, 1.0]\n", + " }\n", + "\n", + "gs = GridSearchCV(estimator=mid_pipeline, param_grid=me_params, cv=5, n_jobs=-1, verbose=1)\n", + "gs.fit(np.array(new_train_data['words']), np.array(new_train_data['party']))\n", + "print(f'Best params: {gs.best_params_}\\nBest estimator: {gs.best_estimator_}\\nBest index: {gs.best_index_}\\nBest score: {gs.best_score_}')\n", + "#super_pipeline = Pipeline(steps=[ ('tfidf', TfidfVectorizer()), \n", + "# ('svd', TruncatedSVD(n_components=100)), \n", + "# ('scaler', StandardScaler(with_mean=False)), \n", + "# ('logreg', LogisticRegression(max_iter=1000))])\n", + "#new_model = super_pipeline.fit(new_train_data['words'], new_train_data['party'])\n", + "#super_pipeline.set_params([new_model_params])\n", + "#p = new_model.predict(test_X)\n", + "#t = precision_score(test_Y, p, average='macro')\n", + "#print(t)\n", + "# Macro precision score against test data: 0.4101062636660445\n", + "\n" ] }, { @@ -667,7 +1501,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -681,7 +1515,7 @@ "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.11.0" } }, "nbformat": 4,