Gustaf Lindgren Bodemar / ChatGPT Sentiment Scraper / Commits

Commit c438b3a5, authored 2 years ago by Gustaf Lindgren Bodemar
Commit message: almost done

Parent: d8b07b1e
No related branches, tags, or merge requests found.
Showing 2 changed files with 69 additions and 25 deletions:
  src/bert.py        +51 −19
  src/sent_analys.py +18 −6
src/bert.py (+51 −19)
@@ -63,12 +63,12 @@ keywords = [ ["security"],
 #["development"],
 #["improvement"],
 #["economy"],
-["chatgpt"],
+#["chatgpt"],
 ["algorithms"],
 #["bussinesses"],
 #["prospects"],
 ["intelligence"],
-["openai"],
+#["openai"],
 ["generate"],
 ["automation"],
 ["secured"],
@@ -122,9 +122,6 @@ def un_pickelize(file_path):
 def remove_urls(text):
     return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
 
-def remove_whitespace_chars(text):
-    return re.sub(r"[\n\t\r]*", " ", text)
-
 def remove_multispace(text):
     return " ".join(text.split())
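The cleaning helpers touched by this hunk are pure functions over strings, so their behaviour is easy to check in isolation. A minimal sketch using the same regex as src/bert.py (the sample text below is invented for illustration):

import re

def remove_urls(text):
    # Strip http/https URLs, same pattern as in src/bert.py
    return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)

def remove_multispace(text):
    # Collapse any run of whitespace (spaces, tabs, newlines) to one space
    return " ".join(text.split())

sample = "ChatGPT   thread: https://example.com/post?id=1 looks\tpromising"  # hypothetical input
print(remove_multispace(remove_urls(sample)))
# -> "ChatGPT thread: looks promising"

Since remove_multispace already collapses tabs and newlines, the separate remove_whitespace_chars pass is largely redundant for this pipeline, which may be why the call to it is commented out below.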
@@ -135,31 +132,41 @@ def preprocess_text(text):
     words = [word for word in words if word not in set(stopwords.words("english"))]
     return " ".join(words)
 
+def remove_whitespace_chars(text):
+    return re.sub(r"[\n\t\r]*", "", text)
+
 def filter_docs():
     print("Filtering docs")
     data = load_raw_docs()
     filtered = []
     selected = []
+    idx = 0
     for i in data:
+        idx += 1
+        print(idx)
         select = i
         #Clean data
         s = i
         s = s.lower()
         s = remove_urls(s)
+        #s = remove_whitespace_chars(s)
         s = remove_multispace(s)
         s = preprocess_text(s)
         #Keyword search data
         if "gpt" in s:
             for kw_lst in keywords:
+                #print("kw_lst:",kw_lst, "\ns:",s)
                 if all((kw in s for kw in kw_lst)):
                     selected.append(select)
                     filtered.append(s)
                     break
+        #print("###",s,select)
         #break #REMOVE
-    print("Filtred out {} docs".format(len(filtred)))
+    print("filtered out {} docs".format(len(filtered)))
     save_filtered_docs(filtered)
     save_selected_docs(selected)
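Note the structure this gives the keyword search: the outer keywords list acts as an OR over groups, all(kw in s for kw in kw_lst) makes each inner list an AND, and everything is gated on "gpt" appearing in the cleaned text. Because the check is plain substring containment, "gpt" also matches inside "chatgpt". A self-contained sketch of that matching logic, using the keyword groups left active by this commit (the matches helper and sample documents are invented for illustration):

keywords = [["algorithms"], ["intelligence"], ["generate"], ["automation"], ["secured"]]

def matches(s, keywords):
    # Document must mention "gpt" AND satisfy at least one keyword group;
    # a group matches only if every keyword in it occurs as a substring.
    return "gpt" in s and any(all(kw in s for kw in group) for group in keywords)

print(matches("chatgpt can generate text", keywords))     # True: "gpt" + the ["generate"] group
print(matches("machine intelligence is here", keywords))  # False: no "gpt" anywhere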
@@ -266,8 +273,12 @@ def calc_prob():
     topics = get_topic_dict(topic_model)
     sums["topic"] = pd.Series([x[1] for x in topics.values()])
-    print(sums.to_string())
-    print(all_sum)
+    #print(sums.to_string())
+    #print(all_sum)
+    with open(f"probability.txt", "w") as outfile:
+        outfile.write(sums.to_string() + "\n")
+        outfile.write(f"All sum: {all_sum}\n")
 
 ##################################################
@@ -275,21 +286,42 @@ def calc_prob():
...
@@ -275,21 +286,42 @@ def calc_prob():
def
visualize_barchart
():
def
visualize_barchart
():
print
(
"
visualize_barchart
"
)
print
(
"
visualize_barchart
"
)
topic_model
=
BERTopic
.
load
(
"
filtered_model
"
)
topic_model
=
BERTopic
.
load
(
"
filtered_model
"
)
fig
=
topic_model
.
visualize_barchart
(
top_n_topics
=
10
)
fig
=
topic_model
.
visualize_barchart
(
top_n_topics
=
12
)
fig
.
write_html
(
"
barchart.html
"
)
fig
.
write_image
(
"
barchart.svg
"
)
def
visualize_topics
():
def
visualize_topics
():
print
(
"
visualize_topics
"
)
print
(
"
visualize_topics
"
)
topic_model
=
BERTopic
.
load
(
"
filtered_model
"
)
topic_model
=
BERTopic
.
load
(
"
filtered_model
"
)
fig
=
topic_model
.
visualize_topics
()
fig
=
topic_model
.
visualize_topics
()
fig
.
write_html
(
"
visual.html
"
)
fig
.
write_image
(
"
visual.svg
"
)
def
visualize_distribution
():
print
(
"
visualize_distribution
"
)
topic_model
=
BERTopic
.
load
(
"
filtered_model
"
)
probs
=
un_pickelize
(
"
probs
"
)
for
i
in
range
(
len
(
probs
)):
path
=
os
.
path
.
join
(
"
/home/gusbo010/reddit-scraper/src/distribu
"
,
f
"
{
i
}
_distribution.svg
"
)
fig
=
topic_model
.
visualize_distribution
(
probs
[
i
],
min_probability
=
0.000001
)
fig
.
write_image
(
path
)
def
visualize_hierarchy
():
print
(
"
visualize_hierarchy
"
)
topic_model
=
BERTopic
.
load
(
"
filtered_model
"
)
probs
=
un_pickelize
(
"
probs
"
)
fig
=
topic_model
.
visualize_hierarchy
()
fig
.
write_image
(
"
hierarchy.svg
"
)
if
__name__
==
"
__main__
"
:
if
__name__
==
"
__main__
"
:
filter_docs
()
#filter_docs()
train_model
()
#train_model()
write_topics_str
()
#write_topics_str()
write_topics_docs
()
#write_topics_docs()
calc_prob
()
#calc_prob()
visualize_barchart
()
#visualize_barchart()
visualize_topics
()
#visualize_topics()
#visualize_hierarchy()
visualize_distribution
()
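The switch from write_html to write_image changes the export dependency: BERTopic's visualize_* methods return Plotly figures, and Plotly's write_image needs a static-image backend (the kaleido package) installed, while write_html does not. A minimal sketch of both export routes, assuming a trained model saved as "filtered_model" as in this repository:

from bertopic import BERTopic

topic_model = BERTopic.load("filtered_model")
fig = topic_model.visualize_barchart(top_n_topics=12)  # returns a plotly Figure
fig.write_html("barchart.html")  # works with plotly alone
fig.write_image("barchart.svg")  # requires `pip install kaleido`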
src/sent_analys.py (+18 −6)
 #!/usr/bin/python3
+from bertopic import BERTopic
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
 
-################################################## WHAT DATA ?
+# Load data
 def get_topics_strs(topic_model): #-> list[topic_str]
-    to_ret = {}
+    to_ret = []
     for k, v in topic_model.get_topics().items():
         words = [x[0] for x in v]
         topic_str = " ".join(words)
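For context on the to_ret = {} → to_ret = [] change: BERTopic's get_topics() returns a dict mapping each topic id to a list of (word, weight) pairs, so collecting one space-joined string per topic naturally wants a list. A sketch of the shape this loop consumes (the topic data below is invented):

# Shape of topic_model.get_topics(): {topic_id: [(word, weight), ...], ...};
# topic -1 is BERTopic's outlier topic.
topics = {
    -1: [("reddit", 0.02), ("post", 0.01)],
     0: [("chatgpt", 0.09), ("openai", 0.05), ("generate", 0.03)],
}

to_ret = []
for k, v in topics.items():
    words = [x[0] for x in v]        # keep the word, drop the weight
    to_ret.append(" ".join(words))   # one string per topic

print(to_ret)  # ['reddit post', 'chatgpt openai generate']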
@@ -15,15 +19,23 @@ def load_data():
     topic_model = BERTopic.load("filtered_model")
     return get_topics_strs(topic_model)
 
-##################################################
+# Sentiment analyser
 def main():
+    # https://huggingface.co/siebert/sentiment-roberta-large-english/tree/main
     tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
     model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")
-    data = load_data()
     sentiment_classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+    data = load_data() #What data?
     results = sentiment_classifier(data)
-    print(results)
+    #for i in range(len(data)):
+    #    print(data[i],results[i])
+    with open(f"sentiment.txt", "w") as outfile:
+        for i in range(len(data)):
+            outfile.write("{:2} {} -- {}\n".format(i, results[i]["label"], data[i]))
 
 if __name__ == "__main__":
     main()
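One caveat when feeding whole topic strings into this classifier: siebert/sentiment-roberta-large-english is a RoBERTa model with a 512-token input limit, so longer inputs should be truncated. A minimal usage sketch, assuming the checkpoint is fetched from the Hugging Face Hub; the sample docs are invented, and passing truncation=True through the pipeline call is standard transformers behaviour:

from transformers import pipeline

# pipeline() can resolve both tokenizer and model from the checkpoint name
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)

docs = ["chatgpt openai generate", "secured automation intelligence"]  # hypothetical inputs
results = sentiment_classifier(docs, truncation=True)  # truncate to the model's max length
print(results)  # e.g. [{'label': 'POSITIVE', 'score': 0.99...}, ...]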