diff --git a/project-tdde16/.ipynb b/project-tdde16/.ipynb deleted file mode 100644 index 1069a8a41a71ec343256e2272a462cb6063cc450..0000000000000000000000000000000000000000 --- a/project-tdde16/.ipynb +++ /dev/null @@ -1,491 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import numpy as np \n", - "import pandas as pd \n", - "import re\n", - "import string\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import plotly.graph_objects as go\n", - "\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "from sklearn.dummy import DummyClassifier\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import classification_report\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"tripadvisor_hotel_reviews.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Rating\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df['Rating'].value_counts().plot(kind='bar', xlabel='Ratings', ylabel = 'Counts of Ratings');" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocess data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{}, - "outputs": [], - "source": [ - "def pre_processing(review):\n", - " \n", - " #Make review lowercase\n", - " review = review.lower()\n", - " \n", - " #Remove text in square brackets \n", - " review = re.sub('\\[.*?\\]', '', review)\n", - " \n", - " #Remove punctuation and remove words containing numbers.\"\n", - " review = re.sub('[%s]' % re.escape(string.punctuation), '', review)\n", - " \n", - " #Remove numbers in review\n", - " review = re.sub('\\w*\\d\\w*', '', review)\n", - "\n", - " \n", - " return review" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Review\"] = df[\"Review\"].apply(pre_processing) #Apply preprocessing to every review" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Split into test and training data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X = df[\"Review\"]\n", - "Y = df[\"Rating\"]\n", - "\n", - "train, test = train_test_split(df, random_state = 0, test_size = 0.2) #Used for undersampling\n", - "x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 0, test_size = 0.2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Undersampling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tot_ratings = train['Rating'].value_counts()\n", - "print(\"The ratings and their amounts\", tot_ratings)\n", - "\n", - "#Find the minorty class\n", - "samples = min(tot_ratings) \n", - "\n", - "training_data = train.sample(frac=1)\n", - "\n", - "#Scale classes to be equal to the number of samples of the minority class\n", - "rate_1 = training_data.loc[training_data['Rating'] == 1][:samples]\n", - "rate_2 = training_data.loc[training_data['Rating'] == 2][:samples]\n", - "rate_3 = training_data.loc[training_data['Rating'] == 3][:samples]\n", - "rate_4 = 
training_data.loc[training_data['Rating'] == 4][:samples]\n", - "rate_5 = training_data.loc[training_data['Rating'] == 5][:samples]\n", - "\n", - "#Set new training data to the balanced class samples\n", - "new_training_data = pd.concat([rate_1, rate_2, rate_3, rate_4, rate_5])\n", - "\n", - "# Shuffle dataframe rows and make it to the new df which is now undersampled\n", - "new_df = new_training_data.sample(frac=1, random_state=samples)\n", - "\n", - "new_tot_ratings = new_df['Rating'].value_counts()\n", - "print(\"Undersampling results:\\n\",new_tot_ratings)\n", - "\n", - "#Make new balanced data traning data\n", - "x_train_undersampled = new_df['Review']\n", - "y_train_undersampled = new_df['Rating']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dummy Baseline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dummy_clf = DummyClassifier(strategy=\"stratified\")\n", - "\n", - "\n", - "dummy_clf.fit(x_train, y_train) #Train classifier\n", - "\n", - "\n", - "y_pred = dummy_clf.predict(x_test)\n", - "y_true = y_test\n", - "\n", - "print(\"Mean Accuracy:\", dummy_clf.score(y_true , y_pred))\n", - "\n", - "dummy_clf.fit(x_train_undersampled, y_train_undersampled) #Train classifier\n", - "\n", - "y_pred = dummy_clf.predict(x_test)\n", - "y_true = y_test\n", - "\n", - "print(\"Mean Accuracy For Undersampled Data:\", dummy_clf.score(y_true , y_pred))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multinomial Naive Bayes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Apply Tfidf Vectorizer to convert sentence to tokens\n", - "vectorizer = TfidfVectorizer(min_df = 0.01,stop_words = 'english')\n", - "vectorizer.fit(X)\n", - "\n", - "\n", - "X_train_tfidf = vectorizer.transform(x_train)\n", - "X_test_tfidf = vectorizer.transform(x_test) \n", - "\n", - "\n", - 
"nb=MultinomialNB()\n", - "nb_model = nb.fit(X_train_tfidf, y_train) # Train classifier\n", - "\n", - "predicted_result=nb_model.predict(X_test_tfidf)\n", - "print(\"Imbalanced results\", classification_report(y_test,predicted_result))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multinomial Naive Bayes (balanced dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Apply Tfidf Vectorizer to convert sentence to tokens\n", - "vectorizer = TfidfVectorizer(min_df = 0.01,stop_words = 'english')\n", - "vectorizer.fit(new_df['Review'])\n", - "\n", - "\n", - "X_train_tfidf = vectorizer.transform(x_train_undersampled)\n", - "X_test_tfidf = vectorizer.transform(x_test) \n", - "\n", - "nb=MultinomialNB()\n", - "nb_model = nb.fit(X_train_tfidf, y_train_undersampled)# Train classifier\n", - "\n", - "predicted_result=nb_model.predict(X_test_tfidf)\n", - "print(\"Balanced results\", classification_report(y_test,predicted_result))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Bert" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install tensorflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install ktrain" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Sentiment visualization\n", - "pos = [4, 5]\n", - "neg = [1, 2]\n", - "neu = [3]\n", - "\n", - "def sentiment(rating):\n", - " if rating in pos:\n", - " return 2\n", - " elif rating in neg:\n", - " return 0\n", - " else:\n", - " return 1 \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import ktrain\n", - "from ktrain import text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "df['Sentiment'] = df['Rating'].apply(sentiment) #Apply sentiment labels to reviews\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Sentiment distrubution for df\n", - "fig = go.Figure([go.Bar(x=df.Sentiment.value_counts().index, y=df.Sentiment.value_counts().tolist())])\n", - "fig.update_layout(\n", - " title=\"Values in each Sentiment\",\n", - " xaxis_title=\"Sentiment\",\n", - " yaxis_title=\"Values\")\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load the df in and preprocess it according to the BERT model.\n", - "(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(df, \n", - " 'Review',\n", - " label_columns=['Sentiment'],\n", - " preprocess_mode='bert')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load model, use text classifier bert.\n", - "model = text.text_classifier(name='bert', train_data=(x_train,y_train), preproc=preproc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Create an instance of a learner that will learn the model\n", - "learner = ktrain.get_learner(model=model,\n", - " train_data=(x_train, y_train),\n", - " val_data=(x_test, y_test),\n", - " batch_size=6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Set learningrate and cycle learning policy\n", - "learner.fit_onecycle(lr=2e-5,\n", - " epochs=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learner.validate(val_data=(x_test,y_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Repeat process for undersampled data - BERT" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "new_df['Sentiment'] = new_df['Rating'].apply(sentiment) #Apply sentiment labels to reviews\n", - "new_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Sentiment distrubution for new_df\n", - "fig = go.Figure([go.Bar(x=new_df.Sentiment.value_counts().index, y=new_df.Sentiment.value_counts().tolist())])\n", - "fig.update_layout(\n", - " title=\"Values in each Sentiment\",\n", - " xaxis_title=\"Sentiment\",\n", - " yaxis_title=\"Values\")\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load the df in and preprocess it according to the BERT model.\n", - "(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(new_df, 'Review',label_columns=['Sentiment'], preprocess_mode='bert')\n", - "\n", - "#Load model, use text classifier bert.\n", - "model = text.text_classifier(name='bert', train_data=(x_train,y_train), preproc=preproc)\n", - "\n", - "#Create an instance of a learner that will learn the model\n", - "learner = ktrain.get_learner(model=model,train_data=(x_train, y_train),val_data=(x_test, y_test),batch_size=6)\n", - "\n", - "#Set learningrate and cycle learning policy\n", - "learner.fit_onecycle(lr=2e-5,\n", - " epochs=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learner.validate(val_data=(x_test,y_test))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git 
a/project-tdde16/.ipynb_checkpoints/.ipynb-checkpoint b/project-tdde16/.ipynb_checkpoints/.ipynb-checkpoint deleted file mode 100644 index 1069a8a41a71ec343256e2272a462cb6063cc450..0000000000000000000000000000000000000000 --- a/project-tdde16/.ipynb_checkpoints/.ipynb-checkpoint +++ /dev/null @@ -1,491 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import numpy as np \n", - "import pandas as pd \n", - "import re\n", - "import string\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import plotly.graph_objects as go\n", - "\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "from sklearn.dummy import DummyClassifier\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import classification_report\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"tripadvisor_hotel_reviews.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Rating\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df['Rating'].value_counts().plot(kind='bar', xlabel='Ratings', ylabel = 'Counts of Ratings');" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocess data" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def pre_processing(review):\n", - " \n", - " #Make review lowercase\n", - " review = review.lower()\n", - " \n", - " #Remove text in square brackets \n", - " review = re.sub('\\[.*?\\]', '', review)\n", - " \n", - " #Remove punctuation and remove words containing numbers.\"\n", - " review = re.sub('[%s]' % re.escape(string.punctuation), '', review)\n", - " \n", - " #Remove numbers in review\n", - " review = re.sub('\\w*\\d\\w*', '', review)\n", - "\n", - " \n", - " return review" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Review\"] = df[\"Review\"].apply(pre_processing) #Apply preprocessing to every review" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Split into test and training data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X = df[\"Review\"]\n", - "Y = df[\"Rating\"]\n", - "\n", - "train, test = train_test_split(df, random_state = 0, test_size = 0.2) #Used for undersampling\n", - "x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 0, test_size = 0.2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Undersampling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tot_ratings = train['Rating'].value_counts()\n", - "print(\"The ratings and their amounts\", tot_ratings)\n", - "\n", - "#Find the minorty class\n", - "samples = min(tot_ratings) \n", - "\n", - "training_data = train.sample(frac=1)\n", - "\n", - "#Scale classes to be equal to the number of samples of the minority class\n", - "rate_1 = training_data.loc[training_data['Rating'] == 1][:samples]\n", - "rate_2 = training_data.loc[training_data['Rating'] == 2][:samples]\n", - "rate_3 = 
training_data.loc[training_data['Rating'] == 3][:samples]\n", - "rate_4 = training_data.loc[training_data['Rating'] == 4][:samples]\n", - "rate_5 = training_data.loc[training_data['Rating'] == 5][:samples]\n", - "\n", - "#Set new training data to the balanced class samples\n", - "new_training_data = pd.concat([rate_1, rate_2, rate_3, rate_4, rate_5])\n", - "\n", - "# Shuffle dataframe rows and make it to the new df which is now undersampled\n", - "new_df = new_training_data.sample(frac=1, random_state=samples)\n", - "\n", - "new_tot_ratings = new_df['Rating'].value_counts()\n", - "print(\"Undersampling results:\\n\",new_tot_ratings)\n", - "\n", - "#Make new balanced data traning data\n", - "x_train_undersampled = new_df['Review']\n", - "y_train_undersampled = new_df['Rating']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dummy Baseline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dummy_clf = DummyClassifier(strategy=\"stratified\")\n", - "\n", - "\n", - "dummy_clf.fit(x_train, y_train) #Train classifier\n", - "\n", - "\n", - "y_pred = dummy_clf.predict(x_test)\n", - "y_true = y_test\n", - "\n", - "print(\"Mean Accuracy:\", dummy_clf.score(y_true , y_pred))\n", - "\n", - "dummy_clf.fit(x_train_undersampled, y_train_undersampled) #Train classifier\n", - "\n", - "y_pred = dummy_clf.predict(x_test)\n", - "y_true = y_test\n", - "\n", - "print(\"Mean Accuracy For Undersampled Data:\", dummy_clf.score(y_true , y_pred))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multinomial Naive Bayes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Apply Tfidf Vectorizer to convert sentence to tokens\n", - "vectorizer = TfidfVectorizer(min_df = 0.01,stop_words = 'english')\n", - "vectorizer.fit(X)\n", - "\n", - "\n", - "X_train_tfidf = vectorizer.transform(x_train)\n", - "X_test_tfidf 
= vectorizer.transform(x_test) \n", - "\n", - "\n", - "nb=MultinomialNB()\n", - "nb_model = nb.fit(X_train_tfidf, y_train) # Train classifier\n", - "\n", - "predicted_result=nb_model.predict(X_test_tfidf)\n", - "print(\"Imbalanced results\", classification_report(y_test,predicted_result))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multinomial Naive Bayes (balanced dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Apply Tfidf Vectorizer to convert sentence to tokens\n", - "vectorizer = TfidfVectorizer(min_df = 0.01,stop_words = 'english')\n", - "vectorizer.fit(new_df['Review'])\n", - "\n", - "\n", - "X_train_tfidf = vectorizer.transform(x_train_undersampled)\n", - "X_test_tfidf = vectorizer.transform(x_test) \n", - "\n", - "nb=MultinomialNB()\n", - "nb_model = nb.fit(X_train_tfidf, y_train_undersampled)# Train classifier\n", - "\n", - "predicted_result=nb_model.predict(X_test_tfidf)\n", - "print(\"Balanced results\", classification_report(y_test,predicted_result))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Bert" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install tensorflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install ktrain" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Sentiment visualization\n", - "pos = [4, 5]\n", - "neg = [1, 2]\n", - "neu = [3]\n", - "\n", - "def sentiment(rating):\n", - " if rating in pos:\n", - " return 2\n", - " elif rating in neg:\n", - " return 0\n", - " else:\n", - " return 1 \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import ktrain\n", - "from ktrain import text" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df['Sentiment'] = df['Rating'].apply(sentiment) #Apply sentiment labels to reviews\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Sentiment distrubution for df\n", - "fig = go.Figure([go.Bar(x=df.Sentiment.value_counts().index, y=df.Sentiment.value_counts().tolist())])\n", - "fig.update_layout(\n", - " title=\"Values in each Sentiment\",\n", - " xaxis_title=\"Sentiment\",\n", - " yaxis_title=\"Values\")\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load the df in and preprocess it according to the BERT model.\n", - "(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(df, \n", - " 'Review',\n", - " label_columns=['Sentiment'],\n", - " preprocess_mode='bert')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load model, use text classifier bert.\n", - "model = text.text_classifier(name='bert', train_data=(x_train,y_train), preproc=preproc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Create an instance of a learner that will learn the model\n", - "learner = ktrain.get_learner(model=model,\n", - " train_data=(x_train, y_train),\n", - " val_data=(x_test, y_test),\n", - " batch_size=6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Set learningrate and cycle learning policy\n", - "learner.fit_onecycle(lr=2e-5,\n", - " epochs=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learner.validate(val_data=(x_test,y_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Repeat process for 
undersampled data - BERT" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "new_df['Sentiment'] = new_df['Rating'].apply(sentiment) #Apply sentiment labels to reviews\n", - "new_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Sentiment distrubution for new_df\n", - "fig = go.Figure([go.Bar(x=new_df.Sentiment.value_counts().index, y=new_df.Sentiment.value_counts().tolist())])\n", - "fig.update_layout(\n", - " title=\"Values in each Sentiment\",\n", - " xaxis_title=\"Sentiment\",\n", - " yaxis_title=\"Values\")\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load the df in and preprocess it according to the BERT model.\n", - "(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(new_df, 'Review',label_columns=['Sentiment'], preprocess_mode='bert')\n", - "\n", - "#Load model, use text classifier bert.\n", - "model = text.text_classifier(name='bert', train_data=(x_train,y_train), preproc=preproc)\n", - "\n", - "#Create an instance of a learner that will learn the model\n", - "learner = ktrain.get_learner(model=model,train_data=(x_train, y_train),val_data=(x_test, y_test),batch_size=6)\n", - "\n", - "#Set learningrate and cycle learning policy\n", - "learner.fit_onecycle(lr=2e-5,\n", - " epochs=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learner.validate(val_data=(x_test,y_test))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - 
}, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project-tdde16/.ipynb_checkpoints/project_tdde16-checkpoint.ipynb b/project-tdde16/.ipynb_checkpoints/project_tdde16-checkpoint.ipynb deleted file mode 100644 index b63b66f3675c586559c8392c84ab0283b69d05d6..0000000000000000000000000000000000000000 --- a/project-tdde16/.ipynb_checkpoints/project_tdde16-checkpoint.ipynb +++ /dev/null @@ -1,773 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import numpy as np \n", - "import pandas as pd \n", - "import re\n", - "import string\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import plotly.graph_objects as go\n", - "\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "from sklearn.dummy import DummyClassifier\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import classification_report\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"tripadvisor_hotel_reviews.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", 
- " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Review</th>\n", - " <th>Rating</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>nice hotel expensive parking got good deal sta...</td>\n", - " <td>4</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>ok nothing special charge diamond member hilto...</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>nice rooms not 4* experience hotel monaco seat...</td>\n", - " <td>3</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>unique, great stay, wonderful time hotel monac...</td>\n", - " <td>5</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>great stay great stay, went seahawk game aweso...</td>\n", - " <td>5</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Review Rating\n", - "0 nice hotel expensive parking got good deal sta... 4\n", - "1 ok nothing special charge diamond member hilto... 2\n", - "2 nice rooms not 4* experience hotel monaco seat... 3\n", - "3 unique, great stay, wonderful time hotel monac... 5\n", - "4 great stay great stay, went seahawk game aweso... 
5" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5 9054\n", - "4 6039\n", - "3 2184\n", - "2 1793\n", - "1 1421\n", - "Name: Rating, dtype: int64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"Rating\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEDCAYAAADEAyg+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAVO0lEQVR4nO3dfdBedX3n8ffHRB4EgkECiwkYdDNaZC1IQHwYtxZHstBp0BVlp5bUss1UacXasgPbbt3OLA5bXLfFLWyzRQitlUEWlqcFxFTYWRcCAYM8RCAFxEiEiLuARZGE7/5xnTSXd+7c5wRyPST3+zVzzTnnd87vur73JfEz5/c751ypKiRJmsqrRl2AJGn8GRaSpFaGhSSplWEhSWplWEiSWhkWkqRWM0ddwKDsv//+NX/+/FGXIUk7lbvuuuuHVTVnYvsuGxbz589n1apVoy5DknYqSb47WbvDUJKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWu2yN+W9UvPPun7UJQDw2LknjroESfLMQpLUzrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSq4GGRZLfS3J/kvuSfCXJHkn2S3Jzkoeb5ey+489OsjbJg0mO72s/Ksm9zb7zk2SQdUuSft7AwiLJXOBTwMKqOhyYAZwCnAWsqKoFwIpmmySHNfvfCiwCLkgyo3m7C4GlwILmtWhQdUuStjboYaiZwJ5JZgKvAZ4AFgPLm/3LgZOa9cXAZVX1QlU9CqwFjklyEDCrqm6rqgIu7esjSRqCgYVFVX0f+DzwOLAeeKaqvgYcWFXrm2PWAwc0XeYC3+t7i3VN29xmfWL7VpIsTbIqyaoNGzbsyD9Hkqa1QQ5DzaZ3tnAo8HpgryQfm6rLJG01RfvWjVXLqmphVS2cM2fO9pYsSdqGQQ5DvR94tKo2VNWLwJXAu4Anm6ElmuVTzfHrgIP7+s+jN2y1rlmf2C5JGpJBhsXjwLFJXtNcvXQcsAa4BljSHLMEuLpZvwY4JcnuSQ6lN5F9RzNU9VySY5v3ObWvjyRpCGYO6o2ramWSK4C7gY3At4BlwN7A5UlOoxcoJzfH35/kcuCB5vjTq2pT83afAC4B9gRuaF6SpCEZWFgAVNVngc9OaH6B3lnGZMefA5wzSfsq4PAdXqAkqRPv4JYktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivD
QpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtWsMiyRlJZqXnoiR3J/nAMIqTJI2HLmcWv1lVzwIfAOYAHwfOHWhVkqSx0iUs0ixPAC6uqnv62iRJ00CXsLgrydfohcVNSfYBXhpsWZKkcTKzwzGnAUcAj1TV80leR28oSpI0TXQJiyOa5RuTfxx9eibJzKraOJiyJEnjpEtYXAC8Hfg2vbmKw5v11yX57ar62gDrkySNgS5zFo8BR1bVwqo6CjgSuA94P/CnA6xNkjQmuoTFW6rq/s0bVfUAvfB4ZHBlSZLGSZdhqAeTXAhc1mx/FHgoye7AiwOrTJI0NrqcWfwGsBb4NPB7wCNN24vA+6bqmOS1Sa5I8p0ka5K8M8l+SW5O8nCznN13/NlJ1iZ5MMnxfe1HJbm32Xd++mbaJUmD1xoWVfWTqvpPVfXBqjqpqj5fVc9X1UtV9eOW7n8O3FhVbwF+EVgDnAWsqKoFwIpmmySHAacAbwUWARckmdG8z4XAUmBB81q03X+pJOll6/JsqHc3ZwAPJXlk86tDv1nAe4GLAKrqZ1X1/4DFwPLmsOXASc36YuCyqnqhqh6ldzZzTJKDgFlVdVtVFXBpXx9J0hB0mbO4iN7w013Apu147zcCG4CLk/xi0/8M4MCqWg9QVeuTHNAcPxe4va//uqbtxWZ9YruGZP5Z14+6BAAeO/fEUZcgTVtd5iyeqaobquqpqnp686tDv5n07s+4sKqOBP6BZshpGyabh6gp2rd+g2RpklVJVm3YsKFDiZKkLrqExTeSnNdMTr9986tDv3XAuqpa2WxfQS88nmyGlmiWT/Udf3Bf/3nAE037vEnat1JVy5r7QRbOmTOnQ4mSpC66DEO9o1ku7Gsr4Jen6lRVP0jyvSRvrqoHgeOAB5rXEnqPOV8CXN10uQb42yRfAF5PbyL7jqralOS5JMcCK4FTgS92+uskSTtEa1hU1ZSXx7b4XeDLSXajd8ntx+mdzVye5DTgceDk5nPuT3I5vTDZCJxeVZvnSD4BXALsCdzQvCRJQ7LNsEjysar6mySfmWx/VX2h7c2rajU/f0ay2XHbOP4c4JxJ2lfReyaVJGkEpjqz2KtZ7jPJvkknmCVJu6ZthkVV/WWz+vWq+mb/viTvHmhVkqSx0uVqqMkmk51glqRpZKo5i3cC7wLmTJi3mAXMmLyXJGlXNNWcxW7A3s0x/fMWzwIfHmRRkqTxMtWcxa3ArUkuqarvDrEmSdKY6XJT3vNJzqP3NNg9NjdW1ZQ35UmSdh1dJri/DHwHOBT4E3o/s3rnAGuSJI2ZLmHxuqq6CHixqm6tqt8Ejh1wXZKkMdJlGGrzT6euT3IivYf4zZvieEnSLqZLWPyHJPsCv0/v/opZ9H7fQpI0TXR5kOB1zeozNL+5nWSvbfeQJO1qppyzSDI3ycLmqbEkOSDJ54CHh1KdJGksbDMsknwaWE1v6On2JEuANfQeE37UcMqTJI2DqYahlgJvrqofJTkEWAu8t6pun6KPJGkXNNUw1E+r6kcAVfU48JBBIUnT01RnFvOSnN+3fUD/dlV9anBlSZLGyVRhceaE7bsGWYgkaXxN9SDB5cMsRJI0vro87kOSNM0ZFpKkVlPdZ/Efm+XJwytHkjSOpjqzOCHJq4Gzh1WMJGk8TXU11I3AD4G9kjwLBKjNy6qaNYT6JEljYJtnFlV1ZlXtC1xfVbOqap/+5RBrlCSNWJenzi5OciBwdNO0sqo2DLYsSdI4ab0aqpngvgM4GfgIcEeSDw+6MEnS+Ojy40d/BBxdVU8BJJkDfB24YpCFSZLGR5f7LF61OSgaT3fsJ0naRXQ5s7gxyU3AV5rtjwL/c3AlSZLGTZcJ7jOTfAh4D73LZpdV1VUDr0ySNDa6nFlQVVcCVw64FknSmHLuQZLUyrCQJLXarrBIMjvJ2wZVjCRpPHW5Ke+WJLOS7AfcA1yc5AuD
L02SNC66nFnsW1XPAh8CLq6qo4D3D7YsSdI46RIWM5McRO9RH9dt7wckmZHkW0mua7b3S3Jzkoeb5ey+Y89OsjbJg0mO72s/Ksm9zb7zk2R765AkvXxdwuJPgJuAtVV1Z5I3Ag9vx2ecAazp2z4LWFFVC4AVzTZJDgNOAd4KLAIuSDKj6XMhsBRY0LwWbcfnS5JeoS5hsb6q3lZVnwSoqkeATnMWSeYBJwJ/1de8GFjerC8HTuprv6yqXqiqR4G1wDHNWc2sqrqtqgq4tK+PJGkIuoTFFzu2TebPgH8DvNTXdmBVrQdolgc07XOB7/Udt65pm9usT2yXJA3JNu/gTvJO4F3AnCSf6ds1C5gxea+f6/8rwFNVdVeSX+pQy2TzEDVF+2SfuZTecBWHHHJIh4+UJHUx1eM+dgP2bo7Zp6/9WaDL71m8G/jVJCcAewCzkvwN8GSSg6pqfTPEtPmJtuuAg/v6zwOeaNrnTdK+lapaBiwDWLhw4aSBIknaftsMi6q6Fbg1ySVV9d3tfeOqOhs4G6A5s/iDqvpYkvOAJcC5zfLqpss1wN8293C8nt5E9h1VtSnJc0mOBVYCp9J9GEyStAN0eZDg7kmWAfP7j6+qX36Zn3kucHmS04DH6f0CH1V1f5LLgQeAjcDpVbWp6fMJ4BJgT+CG5iVJGpIuYfFV4L/Su6JpU8uxk6qqW4BbmvWngeO2cdw5wDmTtK8CDn85ny1JeuW6hMXGqrpw4JVIksZWl0tnr03yySQHNXdf79c8J0qSNE10ObNY0izP7Gsr4I07vhxJ0jjq8rOqhw6jEEnS+GoNiySnTtZeVZfu+HIkSeOoyzDU0X3re9C7kulues9okiRNA12GoX63fzvJvsBfD6wiSdLYeTm/wf08vburJUnTRJc5i2vZ8uC+GcAvAJcPsihJ0njpMmfx+b71jcB3q2rdtg6WJO16WoehmgcKfofek2dnAz8bdFGSpPHSGhZJPgLcQe+Bfx8BVibp8ohySdIuossw1B8CR1fVUwBJ5gBfB64YZGGSpPHR5WqoV20OisbTHftJknYRXc4sbkxyE/CVZvuj+HsSkjStdLkp78wkHwLeQ+/3sJdV1VUDr0ySNDa2GRZJ/ilwYFV9s6quBK5s2t+b5E1V9ffDKlKSNFpTzT38GfDcJO3PN/skSdPEVGExv6q+PbGx+YnT+QOrSJI0dqYKiz2m2Lfnji5EkjS+pgqLO5P81sTGJKcBdw2uJEnSuJnqaqhPA1cl+TW2hMNCYDfgg4MuTJI0PrYZFlX1JPCuJO8DDm+ar6+qvxtKZZKksdHlPotvAN8YQi2SpDHlYzskSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1an2QoKQt5p91/ahLAOCxc08cdQmaZgZ2ZpHk4CTfSLImyf1Jzmja90tyc5KHm+Xsvj5nJ1mb5MEkx/e1H5Xk3mbf+UkyqLolSVsb5DDURuD3q+oXgGOB05McBpwFrKiqBcCKZptm3ynAW4FFwAVJZjTvdSGwFFjQvBYNsG5J0gQDC4uqWl9VdzfrzwFrgLnAYmB5c9hy4KRmfTFwWVW9UFWPAmuBY5IcBMyqqtuqqoBL+/pIkoZgKBPcSeYDRwIrgQOraj30AgU4oDlsLvC9vm7rmra5zfrEdknSkAw8LJLsDfx34NNV9exUh07SVlO0T/ZZS5OsSrJqw4YN21+sJGlSAw2LJK+mFxRfrqorm+Ynm6ElmuVTTfs64OC+7vOAJ5r2eZO0b6WqllXVwqpaOGfOnB33h0jSNDfIq6ECXASsqaov9O26BljSrC8Bru5rPyXJ7kkOpTeRfUczVPVckmOb9zy1r48kaQgGeZ/Fu4FfB+5Nsrpp+7fAucDlSU4DHgdOBqiq+5NcDjxA70qq06tqU9PvE8AlwJ7ADc1LkjQkAwuLqvrfTD7fAHDcNvqcA5wzSfsq4PAdV52k
V8obFKcXH/chSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKmVv5QnSa/QdLhB0TMLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa12mrBIsijJg0nWJjlr1PVI0nSyU4RFkhnAXwD/AjgM+FdJDhttVZI0fewUYQEcA6ytqkeq6mfAZcDiEdckSdNGqmrUNbRK8mFgUVX962b714F3VNXvTDhuKbC02Xwz8OBQC93a/sAPR1zDuPC72MLvYgu/iy3G5bt4Q1XNmdg4cxSVvAyZpG2rlKuqZcCywZfTTZJVVbVw1HWMA7+LLfwutvC72GLcv4udZRhqHXBw3/Y84IkR1SJJ087OEhZ3AguSHJpkN+AU4JoR1yRJ08ZOMQxVVRuT/A5wEzAD+FJV3T/isroYmyGxMeB3sYXfxRZ+F1uM9XexU0xwS5JGa2cZhpIkjZBhIUlqZVhIkloZFgOS5D1JPpPkA6OuZRwkuXTUNYxKkmOSHN2sH9b8d3HCqOsahSRvSXJckr0ntC8aVU3qxgnuHSTJHVV1TLP+W8DpwFXAB4Brq+rcUdY3TEkmXtYc4H3A3wFU1a8OvagRSfJZes80mwncDLwDuAV4P3BTVZ0zuuqGK8mn6P27WAMcAZxRVVc3++6uqrePsr5xkeTjVXXxqOuYyLDYQZJ8q6qObNbvBE6oqg1J9gJur6p/NtoKhyfJ3cADwF/Ru9M+wFfo3R9DVd06uuqGK8m99P6PcXfgB8C8qno2yZ7Ayqp620gLHKLmu3hnVf04yXzgCuCvq+rP+//9THdJHq+qQ0Zdx0Q7xX0WO4lXJZlNb2gvVbUBoKr+IcnG0ZY2dAuBM4A/BM6sqtVJfjKdQqLPxqraBDyf5O+r6lmAqvpJkpdGXNuwzaiqHwNU1WNJfgm4IskbmPyRPrusJN/e1i7gwGHW0pVhsePsC9xF73/sSvJPquoHzdjstPqHUFUvAf85yVeb5ZNM3//WfpbkNVX1PHDU5sYk+wLTLSx+kOSIqloN0Jxh/ArwJWDanHk3DgSOB/7vhPYA/2f45bSbrv+Ad7iqmr+NXS8BHxxiKWOjqtYBJyc5EXh21PWMyHur6gX4xxDd7NXAktGUNDKnAj93ll1VG4FTk/zlaEoameuAvTcHZ78ktwy/nHbOWUiSWnnprCSplWEhSWplWEgdJdmUZHWS+5Jcm+S1Lce/Nskn+7Zfn+SKwVcq7XjOWUgdJflxVe3drC8HHprqprrmXoLrqurw4VQoDY5nFtLLcxswFyDJ3klWJLk7yb1JFjfHnAu8qTkbOS/J/CT3NX1+I8mVSW5M8nCSP938xklOS/JQkluS/Lck/6VpP7k5q7knyf8a8t+rac5LZ6XtlGQGcBxwUdP0U+CDzZ3Z+wO3N488OQs4vKqOaPrNn/BWRwBHAi8ADyb5IrAJ+HfA24Hn6D0i5Z7m+D8Gjq+q77cNgUk7mmcWUnd7JlkNPA3sR+9ZT9C7kepzzV25X6d3xtHlLtwVVfVMVf2U3uNR3gAcA9xaVT+qqheBr/Yd/03gkubZYzN2yF8kdWRYSN39pDlLeAOwG72H4gH8GjAHOKrZ/ySwR4f3e6FvfRO9M/1t3u1fVb8N/BFwMLA6yeu2+y+QXibDQtpOVfUM8CngD5K8mt6jXp6qqheTvI9emEBvGGmf7Xz7O4B/nmR2kpnAv9y8I8mbqmplVf0x8EN6oSENhXMW0stQVd9Kcg+9J+l+Gbg2ySpgNfCd5pink3yzmdS+AfiLDu/7/SSfA1YCT9Abnnqm2X1ekgX0zj5WsGUuQxo4L52VxkySvZuH7M2k95soX6qqq0Zdl6Y3h6Gk8fPvm4n0+4BH
gf8x4nokzywkSe08s5AktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrf4/4Sb3l7Rgu70AAAAASUVORK5CYII=\n", - "text/plain": [ - "<Figure size 432x288 with 1 Axes>" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "df['Rating'].value_counts().plot(kind='bar', xlabel='Ratings', ylabel = 'Counts of Ratings');" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocess data" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def pre_processing(review):\n", - " \n", - " #Make review lowercase\n", - " review = review.lower()\n", - " \n", - " #Remove text in square brackets \n", - " review = re.sub('\\[.*?\\]', '', review)\n", - " \n", - " #Remove punctuation and remove words containing numbers.\"\n", - " review = re.sub('[%s]' % re.escape(string.punctuation), '', review)\n", - " \n", - " #Remove numbers in review\n", - " review = re.sub('\\w*\\d\\w*', '', review)\n", - "\n", - " \n", - " return review" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Review\"] = df[\"Review\"].apply(pre_processing) #Apply preprocessing to every review" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Split into test and training data" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "X = df[\"Review\"]\n", - "Y = df[\"Rating\"]\n", - "\n", - "train, test = train_test_split(df, random_state = 0, test_size = 0.2) #Used for undersampling\n", - "x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 0, test_size = 0.2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Undersampling" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The ratings 
and their amounts 5 7269\n", - "4 4836\n", - "3 1713\n", - "2 1438\n", - "1 1136\n", - "Name: Rating, dtype: int64\n", - "Undersampling results:\n", - " 1 1136\n", - "4 1136\n", - "5 1136\n", - "2 1136\n", - "3 1136\n", - "Name: Rating, dtype: int64\n" - ] - } - ], - "source": [ - "tot_ratings = train['Rating'].value_counts()\n", - "print(\"The ratings and their amounts\", tot_ratings)\n", - "\n", - "#Find the minorty class\n", - "samples = min(tot_ratings) \n", - "\n", - "training_data = train.sample(frac=1)\n", - "\n", - "#Scale classes to be equal to the number of samples of the minority class\n", - "rate_1 = training_data.loc[training_data['Rating'] == 1][:samples]\n", - "rate_2 = training_data.loc[training_data['Rating'] == 2][:samples]\n", - "rate_3 = training_data.loc[training_data['Rating'] == 3][:samples]\n", - "rate_4 = training_data.loc[training_data['Rating'] == 4][:samples]\n", - "rate_5 = training_data.loc[training_data['Rating'] == 5][:samples]\n", - "\n", - "#Set new training data to the balanced class samples\n", - "new_training_data = pd.concat([rate_1, rate_2, rate_3, rate_4, rate_5])\n", - "\n", - "# Shuffle dataframe rows and make it to the new df which is now undersampled\n", - "new_df = new_training_data.sample(frac=1, random_state=samples)\n", - "\n", - "new_tot_ratings = new_df['Rating'].value_counts()\n", - "print(\"Undersampling results:\\n\",new_tot_ratings)\n", - "\n", - "#Make new balanced data traning data\n", - "x_train_undersampled = new_df['Review']\n", - "y_train_undersampled = new_df['Rating']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dummy Baseline" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mean Accuracy: 0.3030007318858258\n", - "Mean Accuracy For Undersampled Data: 0.194193705781898\n" - ] - } - ], - "source": [ - "dummy_clf = DummyClassifier(strategy=\"stratified\")\n", - 
"\n", - "\n", - "dummy_clf.fit(x_train, y_train) #Train classifier\n", - "\n", - "\n", - "y_pred = dummy_clf.predict(x_test)\n", - "y_true = y_test\n", - "\n", - "print(\"Mean Accuracy:\", dummy_clf.score(y_true , y_pred))\n", - "\n", - "dummy_clf.fit(x_train_undersampled, y_train_undersampled) #Train classifier\n", - "\n", - "y_pred = dummy_clf.predict(x_test)\n", - "y_true = y_test\n", - "\n", - "print(\"Mean Accuracy For Undersampled Data:\", dummy_clf.score(y_true , y_pred))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multinomial Naive Bayes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 5 folds for each of 4 candidates, totalling 20 fits\n", - "[CV] alpha=0.01 ......................................................\n", - "[CV] ....................................... alpha=0.01, total= 0.0s\n", - "[CV] alpha=0.01 ......................................................\n", - "[CV] ....................................... alpha=0.01, total= 0.0s\n", - "[CV] alpha=0.01 ......................................................\n", - "[CV] ....................................... alpha=0.01, total= 0.0s\n", - "[CV] alpha=0.01 ......................................................\n", - "[CV] ....................................... alpha=0.01, total= 0.0s\n", - "[CV] alpha=0.01 ......................................................\n", - "[CV] ....................................... alpha=0.01, total= 0.0s\n", - "[CV] alpha=0.1 .......................................................\n", - "[CV] ........................................ alpha=0.1, total= 0.0s\n", - "[CV] alpha=0.1 .......................................................\n", - "[CV] ........................................ 
alpha=0.1, total= 0.0s\n", - "[CV] alpha=0.1 .......................................................\n", - "[CV] ........................................ alpha=0.1, total= 0.0s\n", - "[CV] alpha=0.1 .......................................................\n", - "[CV] ........................................ alpha=0.1, total= 0.0s\n", - "[CV] alpha=0.1 .......................................................\n", - "[CV] ........................................ alpha=0.1, total= 0.0s\n", - "[CV] alpha=1.0 .......................................................\n", - "[CV] ........................................ alpha=1.0, total= 0.0s\n", - "[CV] alpha=1.0 .......................................................\n", - "[CV] ........................................ alpha=1.0, total= 0.0s\n", - "[CV] alpha=1.0 .......................................................\n", - "[CV] ........................................ alpha=1.0, total= 0.0s\n", - "[CV] alpha=1.0 .......................................................\n", - "[CV] ........................................ alpha=1.0, total= 0.0s\n", - "[CV] alpha=1.0 .......................................................\n", - "[CV] ........................................ alpha=1.0, total= 0.0s\n", - "[CV] alpha=10 ........................................................\n", - "[CV] ......................................... alpha=10, total= 0.0s\n", - "[CV] alpha=10 ........................................................\n", - "[CV] ......................................... alpha=10, total= 0.0s\n", - "[CV] alpha=10 ........................................................\n", - "[CV] ......................................... alpha=10, total= 0.0s\n", - "[CV] alpha=10 ........................................................\n", - "[CV] ......................................... 
alpha=10, total= 0.0s\n", - "[CV] alpha=10 ........................................................\n", - "[CV] ......................................... alpha=10, total= 0.0s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'excellent location excellent value hotel just walk chinatown centre sydney quiet residential area door small park area good quiet location turn left hotel hill footbridge power msuem easy hotel slightly dated not say dirty rooms ok needed clean working no complaints best performing aircon hotel room work bonus nice balcony staff helpful sightseeing nice little cheap night hotel benefit excellent location similar hotel centre london wold prob cost night does not facilities larger mercure novotel rarely use does room service laundry '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. 
Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques 
culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually 
closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. 
Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques 
culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually 
closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'excellent location excellent value hotel just walk chinatown centre sydney quiet residential area door small park area good quiet location turn left hotel hill footbridge power msuem easy hotel slightly dated not say dirty rooms ok needed clean working no complaints best performing aircon hotel room work bonus nice balcony staff helpful sightseeing nice little cheap night hotel benefit excellent location similar hotel centre london wold prob cost night does not facilities larger mercure novotel rarely use does room service laundry '\n", - "\n", - " warnings.warn(\"Estimator fit failed. 
The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain 
restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant 
hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. 
Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques 
culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually 
closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'excellent location excellent value hotel just walk chinatown centre sydney quiet residential area door small park area good quiet location turn left hotel hill footbridge power msuem easy hotel slightly dated not say dirty rooms ok needed clean working no complaints best performing aircon hotel room work bonus nice balcony staff helpful sightseeing nice little cheap night hotel benefit excellent location similar hotel centre london wold prob cost night does not facilities larger mercure novotel rarely use does room service laundry '\n", - "\n", - " warnings.warn(\"Estimator fit failed. 
The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain 
restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant 
hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. 
Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques 
culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually 
closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'excellent location excellent value hotel just walk chinatown centre sydney quiet residential area door small park area good quiet location turn left hotel hill footbridge power msuem easy hotel slightly dated not say dirty rooms ok needed clean working no complaints best performing aircon hotel room work bonus nice balcony staff helpful sightseeing nice little cheap night hotel benefit excellent location similar hotel centre london wold prob cost night does not facilities larger mercure novotel rarely use does room service laundry '\n", - "\n", - " warnings.warn(\"Estimator fit failed. 
The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain 
restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant 
hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. 
Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques 
culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. The score on this train-test\"\n", - "/home/racho401/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:532: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n", - "ValueError: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually 
closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '\n", - "\n", - " warnings.warn(\"Estimator fit failed. 
The score on this train-test\"\n", - "[Parallel(n_jobs=1)]: Done 20 out of 20 | elapsed: 0.0s finished\n" - ] - }, - { - "ename": "ValueError", - "evalue": "could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel 
restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-39-7796fc6625a3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mgrid_search\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_grid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mgrid_search\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Best Params\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrid_search\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_params_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, 
**fit_params)\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mrefit_start_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 738\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0my\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 739\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 740\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 741\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.8/site-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0mself\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 608\u001b[0m \"\"\"\n\u001b[0;32m--> 609\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 610\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 611\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_features_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_features\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.8/site-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36m_check_X_y\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 474\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_check_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_update_class_log_prior\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclass_prior\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, 
force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 745\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"y cannot be None\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 746\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 747\u001b[0;31m X = check_array(X, accept_sparse=accept_sparse,\n\u001b[0m\u001b[1;32m 748\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 749\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcasting\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"unsafe\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 531\u001b[0;31m 
\u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 532\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 533\u001b[0m raise ValueError(\"Complex data not supported\\n\"\n", - "\u001b[0;32m~/.local/lib/python3.8/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype)\u001b[0m\n\u001b[1;32m 870\u001b[0m dtype='datetime64[ns]')\n\u001b[1;32m 871\u001b[0m \"\"\"\n\u001b[0;32m--> 872\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 873\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 874\u001b[0m \u001b[0;31m# ----------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'good place stay check rainforest biobay viequesculebra rate place diamond hotel certain aspects rooms make place diamond aspects place exceed diamond hotel room stayed main building main building rooms odd numbers overlook town area rooms numbers views ocean room queen beds huge say quite nice stay place actually bed wall room inroom safe small refrigerator table suitable eating working thing missing room microwave furniture 
nice modern beds mushy soft bedspreads little worn tv fairly small bathroom little dated shower stall instead tub overall room acceptable little better expected thing not turn ac leave room hours ac powerful room quite cold came no heat room open windows let heat hotel hotel small business center wireless access unfortunately wireless not reached room porch lobby access fortunately tableschairs porch work hotel staff friendly sure ask copy pasaç goç did not eat seafood restaurant hotel mesones restaurant mesones kind like seal approval restaurants serving authentic puerto rico cuisine mexican restaurant hotel fairly inexpensive nothing write home hotel pools hotel typical pool hill cocoçé expect resort need check cocoçé guest house visit cocoçé pool towels guest house use pool area warned confusing signs pool hours içé\\x96 not sure pool actually closed signs pools said pool open till said pm note door hotel room said nice gym free weights site place play minigolf tennis basketball noted hotel easy miss oneway loop docks went far sign says hotel posters noted \\x8e blue sign easy miss case miss turn sign direction partially obscured vegetation road hotel fairly wide road guess feet wide eatshop american fast food family style chain restaurants route route including mcdonalds wendyçé ponderosa sizzler walmart grocery stores walgreens area golden bagel restaurants hotel little commercial activity near hotel recommend renting car area nearby activities drive docks ferry vieques culebra gated lots charge parking fee ferry docks north ferry adult oneway purchase tickets street ferry dock not want ferry islands storm coming high winds water choppy not pick outside seats ferry unless want smell diesel exhaust hour trip nearest beach hotel seven seas beach quite easy bioluminescent bay fajardo near seven seas beach '" - ] - } - ], - "source": [ - "param_grid = {'alpha':[0.01, 0.1, 1.0, 10]}\n", - "\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", - "\n", 
- "nb=MultinomialNB()\n", - "\n", - "# Apply Tfidf Vectorizer to convert sentence to tokens\n", - "vectorizer = TfidfVectorizer(min_df = 0.01,stop_words = 'english')\n", - "vectorizer.fit(X)\n", - "\n", - "X_train_tfidf = vectorizer.transform(x_train)\n", - "X_test_tfidf = vectorizer.transform(x_test) \n", - "\n", - "grid_search = GridSearchCV(nb, param_grid, verbose=2)\n", - "grid_search.fit(X_train_tfidf, y_train)\n", - "print(\"Best Params\", grid_search.best_params_)\n", - "\n", - "nb_tuned=MultinomialNB(alpha = 0.01)\n", - "nb_model = nb_tuned.fit(X_train_tfidf, y_train) # Train classifier\n", - "\n", - "predicted_result=nb_model.predict(X_test_tfidf)\n", - "print(\"Imbalanced results\", classification_report(y_test,predicted_result))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multinomial Naive Bayes (balanced dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Apply Tfidf Vectorizer to convert sentence to tokens\n", - "vectorizer = TfidfVectorizer(min_df = 0.01,stop_words = 'english')\n", - "vectorizer.fit(new_df['Review'])\n", - "\n", - "\n", - "X_train_tfidf = vectorizer.transform(x_train_undersampled)\n", - "X_test_tfidf = vectorizer.transform(x_test) \n", - "\n", - "nb=MultinomialNB()\n", - "nb_model = nb.fit(X_train_tfidf, y_train_undersampled)# Train classifier\n", - "\n", - "predicted_result=nb_model.predict(X_test_tfidf)\n", - "print(\"Balanced results\", classification_report(y_test,predicted_result))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BERT" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install tensorflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install ktrain" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "#Sentiment visualization\n", - "pos = [4, 5]\n", - "neg = [1, 2]\n", - "neu = [3]\n", - "\n", - "def sentiment(rating):\n", - " if rating in pos:\n", - " return 2\n", - " elif rating in neg:\n", - " return 0\n", - " else:\n", - " return 1 \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import ktrain\n", - "from ktrain import text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df['Sentiment'] = df['Rating'].apply(sentiment) #Apply sentiment labels to reviews\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Sentiment distribution for df\n", - "fig = go.Figure([go.Bar(x=df.Sentiment.value_counts().index, y=df.Sentiment.value_counts().tolist())])\n", - "fig.update_layout(\n", - " title=\"Values in each Sentiment\",\n", - " xaxis_title=\"Sentiment\",\n", - " yaxis_title=\"Values\")\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load the df in and preprocess it according to the BERT model.\n", - "(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(df, \n", - " 'Review',\n", - " label_columns=['Sentiment'],\n", - " preprocess_mode='bert')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load model, use text classifier bert.\n", - "model = text.text_classifier(name='bert', train_data=(x_train,y_train), preproc=preproc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Create an instance of a learner that will learn the model\n", - "learner = ktrain.get_learner(model=model,\n", - " train_data=(x_train, y_train),\n", - " val_data=(x_test, y_test),\n", - " batch_size=6)" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Set learning rate and cycle learning policy\n", - "learner.fit_onecycle(lr=2e-5,\n", - " epochs=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learner.validate(val_data=(x_test,y_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Repeat process for undersampled data - BERT" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "new_df['Sentiment'] = new_df['Rating'].apply(sentiment) #Apply sentiment labels to reviews\n", - "new_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Load the df in and preprocess it according to the BERT model.\n", - "(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(new_df, 'Review',label_columns=['Sentiment'], preprocess_mode='bert')\n", - "\n", - "#Load model, use text classifier bert.\n", - "model = text.text_classifier(name='bert', train_data=(x_train,y_train), preproc=preproc)\n", - "\n", - "#Create an instance of a learner that will learn the model\n", - "learner = ktrain.get_learner(model=model,train_data=(x_train, y_train),val_data=(x_test, y_test),batch_size=6)\n", - "\n", - "#Set learning rate and cycle learning policy\n", - "learner.fit_onecycle(lr=2e-5,\n", - " epochs=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learner.validate(val_data=(x_test,y_test))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": 
"ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}