Commit 0040cc15, authored 2 months ago by jackkolm (parent: 1af473d8)

    accidentally removed dummy regressor code, readded (project.py)

Showing 1 changed file: project/project.py (new file, mode 100644), with 176 additions and 0 deletions.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# This was needed for sklearn not to crash when importing the dummy for some reason
def dummy_npwarn_decorator_factory():
    def npwarn_decorator(x):
        return x
    return npwarn_decorator


np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import (classification_report, precision_score, r2_score,
                             mean_absolute_error, mean_squared_error)
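# Why the getattr trick above works: if this NumPy build already defines
# np._no_nep50_warning, getattr returns the existing attribute unchanged;
# otherwise the no-op decorator factory is installed in its place, so the
# sklearn imports above can look the attribute up without crashing.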
def plot_games_on_meta_score(df):
    """Plot a stacked histogram of meta scores, split into games above and below 75."""
    scores = np.array(df["meta_score"])
    scores_big = scores[scores > 75]
    print(scores_big)
    scores_small = scores[scores <= 75]
    two_sets = [scores_big, scores_small]
    plt.hist(two_sets, bins=10, stacked=True)
    plt.xlabel("Meta Score")
    plt.ylabel("Number of Games")
    plt.show()
def split_data(df):
    """
    Split data into two parts: a training dataset and a test dataset.
    One game is held out for each meta score from 1 to 100.
    Returns the two parts as a tuple.
    """
    test_data = pd.DataFrame()
    drop_indexes = []
    for i in range(1, 101):
        row = df.loc[df['meta_score'] == i]
        if row.empty:
            continue
        # several games can share a score; hold out only the first one
        row = row.iloc[[0]]
        if row.isnull().values.any():
            print("null")
        drop_indexes.append(row.index[0])
        test_data = pd.concat([test_data, row])
    # remove the held-out rows from the training data, then shuffle it
    df = df.drop(drop_indexes)
    training_data = df.sample(frac=1.0, random_state=200)
    return training_data, test_data
def make_classes(df):
    """
    Label each game as "bad" (meta score < 70), "average" (70-79) or
    "good" (>= 80), then balance the three classes by downsampling.
    """
    for index, row in df.iterrows():
        if row["meta_score"] < 70:
            df.at[index, "class"] = "bad"
        elif row["meta_score"] < 80:
            df.at[index, "class"] = "average"
        else:
            df.at[index, "class"] = "good"
    bad_data = df.loc[df['class'] == "bad"]
    good_data = df.loc[df['class'] == "good"]
    average_data = df.loc[df['class'] == "average"]
    # shuffle each class, then cut all of them down to the size of the smallest
    least_amount = min([len(bad_data), len(good_data), len(average_data)])
    bad_data = bad_data.sample(frac=1.0, random_state=200)[:least_amount]
    good_data = good_data.sample(frac=1.0, random_state=200)[:least_amount]
    average_data = average_data.sample(frac=1.0, random_state=200)[:least_amount]
    classified_data = pd.concat([bad_data, good_data, average_data])
    randomised_avg_data = classified_data.sample(frac=1.0, random_state=201)
    classified_data = randomised_avg_data.dropna(subset=["summary"])
    return classified_data
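# Worked example of the labelling in make_classes(): meta_score 69 -> "bad",
# 70-79 -> "average", 80 and up -> "good". If the raw data held, say, 500 bad,
# 800 average and 300 good games, each class would be shuffled and cut down to
# 300 rows, and rows with a missing summary would then be dropped.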
def make_binary_classes(data):
    """
    Label each game as "bad" (meta score < 75) or "good" (>= 75),
    then balance the two classes by downsampling.
    """
    for index, row in data.iterrows():
        if row["meta_score"] < 75:
            data.at[index, "class"] = "bad"
        else:
            data.at[index, "class"] = "good"
    bad_data = data.loc[data['class'] == "bad"]
    good_data = data.loc[data['class'] == "good"]
    # shuffle both classes, then cut them down to the size of the smaller one
    least_amount = min([len(bad_data), len(good_data)])
    bad_data = bad_data.sample(frac=1.0, random_state=200)[:least_amount]
    good_data = good_data.sample(frac=1.0, random_state=200)[:least_amount]
    data = pd.concat([bad_data, good_data])
    randomised_data = data.sample(frac=1.0, random_state=201)
    data = randomised_data.dropna(subset=["summary"])
    return data
def plot_classified_data(data):
    """
    Plot the data returned from and classified in make_classes().
    Only the count for each of the three classes (good, bad, average)
    is displayed: one bar per class.
    """
    good = data['class'].value_counts()["good"]
    average = data['class'].value_counts()["average"]
    bad = data['class'].value_counts()["bad"]
    #print(good, average, bad)
    counts = [good, average, bad]
    plt.bar(["Good", "Average", "Bad"], counts)
    plt.xlabel("Class")
    plt.ylabel("Number of Games")
    plt.show()
def dummy_regressor(train_X, train_Y, test_X, test_Y):
    """Fit and score baseline models: a mean regressor and a stratified classifier."""
    dummy_regr = DummyRegressor(strategy="mean")
    dummy_regr.fit(train_X, train_Y)
    pred = dummy_regr.predict(test_X)
    scr = dummy_regr.score(test_X, test_Y)
    r2 = r2_score(test_Y, pred)
    mar = mean_absolute_error(test_Y, pred)
    msq = mean_squared_error(test_Y, pred)
    print(f"MSE: {msq}, MAE: {mar}, R2: {r2}")
    #print(msq)
    #print(pred)
    #print(test_Y)
    #print(r2)
    #classification_report(test_Y, pred)
    #dummy_regr.score(X, y)
    #model = multinomial_naive_bayes_classifier_model(train_X, train_Y)
    dc_stratified = DummyClassifier(strategy='stratified')
    dc_model = dc_stratified.fit(train_X, train_Y)
    #print(model.score(test_X, test_Y))
    dc_predicted = dc_model.predict(test_X)
    #print(classification_report(test_Y, dc_predicted))
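# Baseline sanity check: with strategy="mean" the DummyRegressor always
# predicts the training-set mean, so its R2 on unseen data should sit around
# zero (or slightly below), and its MAE/MSE mark the error floor a real model
# has to beat. The stratified DummyClassifier, which samples predictions from
# the training label distribution, plays the same role for classification.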
def predict_against_test_data(test_data, model):
    """Evaluate a fitted classifier on the held-out test data and print its scores."""
    test_X = np.array(test_data["summary"])
    test_Y = np.array(test_data["class"])
    predicted = model.predict(test_X)
    score = precision_score(test_Y, predicted, average='macro')
    print(f'Macro precision score against test data: {score}')
    print("Classification report against test data:")
    print(classification_report(test_Y, predicted))
if __name__ == "__main__":
    file_path = 'C:\\repos\\text-mining\\project\\all_games.csv'
    df = pd.read_csv(file_path)
    print(df.head())
    plot_games_on_meta_score(df)
    from load_data import prep_data
    train_X, train_Y, test_X, test_Y = prep_data(df)
    dummy_regressor(train_X, train_Y, test_X, test_Y)
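The __main__ block imports prep_data from a load_data module that is not part of this diff. Judging by how its return values feed dummy_regressor, it presumably splits the summary texts and meta scores into training and test arrays. A minimal sketch under that assumption (the real load_data.prep_data may well differ):

import numpy as np
import pandas as pd

def prep_data(df: pd.DataFrame):
    # Hypothetical stand-in for load_data.prep_data, which is not in this diff.
    # Assumed contract: the X arrays hold summary strings, the Y arrays hold
    # the numeric meta scores, split 80/20 into train and test.
    df = df.dropna(subset=["summary", "meta_score"])
    shuffled = df.sample(frac=1.0, random_state=200)
    cutoff = int(len(shuffled) * 0.8)
    train, test = shuffled[:cutoff], shuffled[cutoff:]
    return (np.array(train["summary"]), np.array(train["meta_score"]),
            np.array(test["summary"]), np.array(test["meta_score"]))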
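The commented-out multinomial_naive_bayes_classifier_model(train_X, train_Y) call in dummy_regressor(), together with the so-far unused Pipeline, TfidfVectorizer and MultinomialNB imports, suggests a TF-IDF plus multinomial naive Bayes classifier defined elsewhere in the project. A sketch of what such a model could look like; the name and wiring are assumptions, not the project's actual implementation:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def multinomial_naive_bayes_classifier_model(train_X, train_Y):
    # Assumed inputs: train_X holds summary strings, train_Y the class labels
    # ("bad"/"average"/"good") produced by make_classes().
    model = Pipeline([
        ("tfidf", TfidfVectorizer()),   # turn summaries into TF-IDF vectors
        ("nb", MultinomialNB()),        # naive Bayes over those counts
    ])
    model.fit(train_X, train_Y)
    return model

Fitted this way, the pipeline consumes raw summary strings, which matches how predict_against_test_data() above passes np.array(test_data["summary"]) straight into model.predict().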