Commit 0040cc15 authored by jackkolm

accidentally removed dummy regressor code, readded (project.py)

parent 1af473d8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# This was needed for sklearn not to crash when importing the dummy for some reason
def dummy_npwarn_decorator_factory():
    def npwarn_decorator(x):
        return x
    return npwarn_decorator

np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, precision_score, r2_score, mean_absolute_error, mean_squared_error

def plot_games_on_meta_score(df):
    scores = np.array(df["meta_score"])
    scores_big = scores[scores > 75]
    print(scores_big)
    scores_small = scores[scores <= 75]
    two_sets = [scores_big, scores_small]
    plt.hist(two_sets, bins=10, stacked=True)
    plt.xlabel("Meta Score")
    plt.ylabel("Number of Games")
    plt.show()

def split_data(df):
    """
    Split data into two parts; a training dataset, and a test dataset.
    Returns the two parts as a tuple.
    """
    test_data = pd.DataFrame()
    drop_indexes = []
    for i in range(1, 101):
        row = df.loc[df['meta_score'] == i]
        if row.empty:
            continue
        # Take one example game per score value for the test set.
        row = row.iloc[[0]]
        if row.isnull().values.any():
            print("null")
        drop_indexes.append(row.index[0])
        test_data = pd.concat([test_data, row])
    df.drop(drop_indexes, inplace=True)
    df = df.sample(frac=1.0, random_state=200)
    training_data = df
    return training_data, test_data

def make_classes(df):
    """
    Label each game as "bad" (meta score below 70), "average" (70-79) or "good"
    (80 and above), then downsample so all three classes are the same size.
    """
    for index, row in df.iterrows():
        if row["meta_score"] < 70:
            df.at[index, "class"] = "bad"
        elif row["meta_score"] < 80:
            df.at[index, "class"] = "average"
        else:
            df.at[index, "class"] = "good"
    bad_data = df.loc[df['class'] == "bad"]
    good_data = df.loc[df['class'] == "good"]
    average_data = df.loc[df['class'] == "average"]
    least_amount = min([len(bad_data), len(good_data), len(average_data)])
    bad_data = bad_data.sample(frac=1.0, random_state=200)
    good_data = good_data.sample(frac=1.0, random_state=200)
    average_data = average_data.sample(frac=1.0, random_state=200)
    bad_data = bad_data[:least_amount]
    good_data = good_data[:least_amount]
    average_data = average_data[:least_amount]
    classified_data = pd.concat([bad_data, good_data, average_data])
    randomised_data = classified_data.sample(frac=1.0, random_state=201)
    classified_data = randomised_data.dropna(subset=["summary"])
    return classified_data

def make_binary_classes(data):
    """
    Label each game as "bad" (meta score below 75) or "good" (75 and above),
    then downsample so both classes are the same size.
    """
    for index, row in data.iterrows():
        if row["meta_score"] < 75:
            data.at[index, "class"] = "bad"
        else:
            data.at[index, "class"] = "good"
    bad_data = data.loc[data['class'] == "bad"]
    good_data = data.loc[data['class'] == "good"]
    least_amount = min([len(bad_data), len(good_data)])
    bad_data = bad_data.sample(frac=1.0, random_state=200)
    good_data = good_data.sample(frac=1.0, random_state=200)
    bad_data = bad_data[:least_amount]
    good_data = good_data[:least_amount]
    data = pd.concat([bad_data, good_data])
    randomised_data = data.sample(frac=1.0, random_state=201)
    data = randomised_data.dropna(subset=["summary"])
    return data

def plot_classified_data(data):
    """
    Plot the data returned from and classified in make_classes().
    Only the number of games in each of the three classes (good, average, bad)
    is displayed, as one bar per class.
    """
    good = data['class'].value_counts()["good"]
    average = data['class'].value_counts()["average"]
    bad = data['class'].value_counts()["bad"]
    #print(good, average, bad)
    counts = [good, average, bad]
    plt.bar(["Good", "Average", "Bad"], counts)
    plt.xlabel("Class")
    plt.ylabel("Number of Games")
    plt.show()

def dummy_regressor(train_X, train_Y, test_X, test_Y):
    # Baseline regressor that always predicts the mean of the training targets.
    dummy_regr = DummyRegressor(strategy="mean")
    dummy_regr.fit(train_X, train_Y)
    pred = dummy_regr.predict(test_X)
    scr = dummy_regr.score(test_X, test_Y)
    r2 = r2_score(test_Y, pred)
    mar = mean_absolute_error(test_Y, pred)
    msq = mean_squared_error(test_Y, pred)
    print(f"MSE: {msq}, MAE: {mar}, R2: {r2}")
    #print(msq)
    #print(pred)
    #print(test_Y)
    #print(r2)
    #classification_report(test_Y, pred)
    #dummy_regr.score(X, y)
    #model = multinomial_naive_bayes_classifier_model(train_X, train_Y)

    # Baseline classifier that predicts labels by sampling the training class distribution.
    dc_stratified = DummyClassifier(strategy='stratified')
    dc_model = dc_stratified.fit(train_X, train_Y)
    #print(model.score(test_X, test_Y))
    dc_predicted = dc_model.predict(test_X)
    #print(classification_report(test_Y, dc_predicted))

def predict_against_test_data(test_data, model):
    test_X = np.array(test_data["summary"])
    test_Y = np.array(test_data["class"])
    predicted = model.predict(test_X)
    score = precision_score(test_Y, predicted, average='macro')
    print(f'Macro precision score against test data: {score}')
    print("Classification report against test data:")
    print(classification_report(test_Y, predicted))
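
# NOTE: multinomial_naive_bayes_classifier_model() is referenced (commented out) in
# dummy_regressor() above but is not defined in this file. The sketch below is an
# assumption about what it could look like -- tf-idf features over the raw summary
# strings fed to a multinomial naive Bayes inside a Pipeline -- not the project's
# actual implementation.
def multinomial_naive_bayes_classifier_model(train_X, train_Y):
    model = Pipeline([
        ("tfidf", TfidfVectorizer()),  # turn summary strings into tf-idf features
        ("clf", MultinomialNB()),      # naive Bayes over the tf-idf features
    ])
    model.fit(train_X, train_Y)
    return model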

if __name__ == "__main__":
    file_path = 'C:\\repos\\text-mining\\project\\all_games.csv'
    df = pd.read_csv(file_path)
    print(df.head())
    plot_games_on_meta_score(df)

    from load_data import prep_data
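    # NOTE (assumption): prep_data() comes from load_data.py, which is not part of
    # this file. Based on how its return values are used below, it is expected to
    # return a (train_X, train_Y, test_X, test_Y) tuple where the Y arrays hold the
    # meta_score targets for the dummy baselines.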
    train_X, train_Y, test_X, test_Y = prep_data(df)
    dummy_regressor(train_X, train_Y, test_X, test_Y)
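
    # Example (assumption, not part of the original flow): how the classification
    # helpers above could be exercised with a text-classification model such as the
    # multinomial_naive_bayes_classifier_model() sketched earlier.
    # classified = make_classes(df)
    # training_data, class_test = split_data(classified)
    # clf = multinomial_naive_bayes_classifier_model(
    #     np.array(training_data["summary"]), np.array(training_data["class"]))
    # predict_against_test_data(class_test, clf)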