From 0040cc1545e58f55e7e48924d689771de43cb846 Mon Sep 17 00:00:00 2001
From: jackkolm <jack.kolm@outlook.com>
Date: Fri, 21 Mar 2025 01:55:36 +0100
Subject: [PATCH] accidentally removed dummy regressor code, re-added
 (project.py)

---
 project/project.py | 176 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 project/project.py

diff --git a/project/project.py b/project/project.py
new file mode 100644
index 0000000..57e640a
--- /dev/null
+++ b/project/project.py
@@ -0,0 +1,176 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Workaround: some NumPy builds no longer expose _no_nep50_warning, which made
+# importing sklearn.dummy crash; fall back to a no-op decorator factory.
+def dummy_npwarn_decorator_factory():
+    def npwarn_decorator(x):
+        return x
+    return npwarn_decorator
+np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)
+
+from sklearn.dummy import DummyClassifier, DummyRegressor
+from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.metrics import classification_report, precision_score, r2_score, mean_absolute_error, mean_squared_error
+
+
+def plot_games_on_meta_score(df):
+    """Plot a stacked histogram of games, split at a meta score of 75."""
+    scores = np.array(df["meta_score"])
+    scores_big = scores[scores > 75]
+    scores_small = scores[scores <= 75]
+    two_sets = [scores_big, scores_small]
+    plt.hist(two_sets, bins=10, stacked=True)
+    plt.xlabel("Meta Score")
+    plt.ylabel("Number of Games")
+    plt.show()
+
+
+def split_data(df):
+    """
+    Split the data into two parts: a training dataset and a test dataset.
+    One game per meta score (1-100) goes into the test set; the remaining,
+    shuffled games form the training set. Returns the two parts as a tuple.
+    """
+    test_data = pd.DataFrame()
+    drop_indexes = []
+    for i in range(1, 101):
+        rows = df.loc[df['meta_score'] == i]
+        if rows.empty:
+            continue
+        # Move the first game with this score into the test set.
+        row = rows.iloc[[0]]
+        if row.isnull().values.any():
+            print("null")
+        drop_indexes.append(row.index[0])
+        test_data = pd.concat([test_data, row])
+
+    training_data = df.drop(drop_indexes).sample(frac=1.0, random_state=200)
+    return training_data, test_data
+
+
+def make_classes(df):
+    """
+    Label each game as "bad" (meta score < 70), "average" (70-79) or "good"
+    (>= 80), then downsample so all three classes contain the same number of
+    games. Rows without a summary are dropped.
+    """
+    for index, row in df.iterrows():
+        if row["meta_score"] < 70:
+            df.at[index, "class"] = "bad"
+        elif row["meta_score"] < 80:
+            df.at[index, "class"] = "average"
+        else:
+            df.at[index, "class"] = "good"
+
+    bad_data = df.loc[df['class'] == "bad"]
+    good_data = df.loc[df['class'] == "good"]
+    average_data = df.loc[df['class'] == "average"]
+    least_amount = min([len(bad_data), len(good_data), len(average_data)])
+
+    bad_data = bad_data.sample(frac=1.0, random_state=200)[:least_amount]
+    good_data = good_data.sample(frac=1.0, random_state=200)[:least_amount]
+    average_data = average_data.sample(frac=1.0, random_state=200)[:least_amount]
+    classified_data = pd.concat([bad_data, good_data, average_data])
+    randomised_data = classified_data.sample(frac=1.0, random_state=201)
+    classified_data = randomised_data.dropna(subset=["summary"])
+
+    return classified_data
+
+
+def make_binary_classes(data):
+    """
+    Label each game as "bad" (meta score < 75) or "good" (>= 75), then
+    downsample so both classes contain the same number of games.
+    Rows without a summary are dropped.
+    """
+    for index, row in data.iterrows():
+        if row["meta_score"] < 75:
+            data.at[index, "class"] = "bad"
+        else:
+            data.at[index, "class"] = "good"
+
+    bad_data = data.loc[data['class'] == "bad"]
+    good_data = data.loc[data['class'] == "good"]
+    least_amount = min([len(bad_data), len(good_data)])
+
+    bad_data = bad_data.sample(frac=1.0, random_state=200)
+    good_data = good_data.sample(frac=1.0, random_state=200)
+    bad_data = bad_data[:least_amount]
+    good_data = good_data[:least_amount]
+    data = pd.concat([bad_data, good_data])
+    randomised_data = data.sample(frac=1.0, random_state=201)
+    data = randomised_data.dropna(subset=["summary"])
+
+    return data
+
+
+def plot_classified_data(data):
+    """
+    Plot the class distribution of the data returned from make_classes():
+    one bar per class (good, average, bad) showing how many games fall
+    into each class.
+    """
+    counts = data['class'].value_counts()
+    good = counts["good"]
+    average = counts["average"]
+    bad = counts["bad"]
+    plt.bar(["Good", "Average", "Bad"], [good, average, bad])
+    plt.xlabel("Class")
+    plt.ylabel("Number of Games")
+    plt.show()
+
+
+def dummy_regressor(train_X, train_Y, test_X, test_Y):
+    """
+    Evaluate two baselines on the test split: a DummyRegressor that always
+    predicts the mean meta score, and a stratified DummyClassifier.
+    """
+    dummy_regr = DummyRegressor(strategy="mean")
+    dummy_regr.fit(train_X, train_Y)
+
+    pred = dummy_regr.predict(test_X)
+    r2 = r2_score(test_Y, pred)
+    mae = mean_absolute_error(test_Y, pred)
+    mse = mean_squared_error(test_Y, pred)
+    print(f"MSE: {mse}, MAE: {mae}, R2: {r2}")
+
+    dc_stratified = DummyClassifier(strategy='stratified')
+    dc_model = dc_stratified.fit(train_X, train_Y)
+    dc_predicted = dc_model.predict(test_X)
+    print(classification_report(test_Y, dc_predicted))
+
+
+def predict_against_test_data(test_data, model):
+    """Report macro precision and a classification report for a fitted model."""
+    test_X = np.array(test_data["summary"])
+    test_Y = np.array(test_data["class"])
+    predicted = model.predict(test_X)
+    score = precision_score(test_Y, predicted, average='macro')
+    print(f'Macro precision score against test data: {score}')
+    print("Classification report against test data:")
+    print(classification_report(test_Y, predicted))
+
+
+if __name__ == "__main__":
+    file_path = 'C:\\repos\\text-mining\\project\\all_games.csv'
+    df = pd.read_csv(file_path)
+
+    print(df.head())
+    plot_games_on_meta_score(df)
+
+    from load_data import prep_data
+    train_X, train_Y, test_X, test_Y = prep_data(df)
+    dummy_regressor(train_X, train_Y, test_X, test_Y)
+
--
GitLab
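
Review note: the __main__ block imports prep_data from load_data, which is not part of
this patch. For running project.py on its own, a minimal stand-in is sketched below.
This is an assumption about prep_data's contract (summary text as X, numeric meta score
as y, with a held-out split), not the project's actual implementation.

import pandas as pd
from sklearn.model_selection import train_test_split

def prep_data(df):
    # Hypothetical stand-in for load_data.prep_data: keep rows that have both
    # a summary and a meta score, use the summary text as X and the numeric
    # meta score as y, and hold out 20% of the games for testing.
    df = df.dropna(subset=["summary", "meta_score"])
    X = df["summary"].to_numpy()
    y = df["meta_score"].to_numpy()
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, y, test_size=0.2, random_state=200)
    return train_X, train_Y, test_X, test_Y

The dummy baselines in dummy_regressor ignore the feature values entirely, so passing
the raw summary strings through without vectorising them is enough for this patch's
code to run under that assumption.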