Commit c438b3a5 authored by Gustaf Lindgren Bodemar

almost done

parent d8b07b1e
@@ -63,12 +63,12 @@ keywords = [ ["security"],
     #["development"],
     #["improvement"],
     #["economy"],
-    ["chatgpt"],
+    #["chatgpt"],
     ["algorithms"],
     #["bussinesses"],
     #["prospects"],
     ["intelligence"],
-    ["openai"],
+    #["openai"],
     ["generate"],
     ["automation"],
     ["secured"],
@@ -122,9 +122,6 @@ def un_pickelize(file_path):
 
 def remove_urls(text):
     return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', ' ', text, flags=re.MULTILINE)
 
-def remove_whitespace_chars(text):
-    return re.sub(r"[\n\t\r]*", " ", text)
-
 def remove_multispace(text):
     return " ".join(text.split())
@@ -135,31 +132,41 @@ def preprocess_text(text):
     words = [word for word in words if word not in set(stopwords.words("english"))]
     return " ".join(words)
 
+def remove_whitespace_chars(text):
+    return re.sub(r"[\n\t\r]*", "", text)
+
 def filter_docs():
     print("Filtering docs")
     data = load_raw_docs()
     filtered = []
     selected = []
+    idx = 0
     for i in data:
+        idx += 1
+        print(idx)
         select = i
         #Clean data
         s = i
         s = s.lower()
         s = remove_urls(s)
+        #s = remove_whitespace_chars(s)
         s = remove_multispace(s)
         s = preprocess_text(s)
         #Keyword search data
         if "gpt" in s:
             for kw_lst in keywords:
+                #print("kw_lst:",kw_lst, "\ns:",s)
                 if all((kw in s for kw in kw_lst)):
                     selected.append(select)
                     filtered.append(s)
                     break
-        #break #REMOVE
-    print("Filtred out {} docs".format(len(filtred)))
+        #print("###",s,select)
+        #break #REMOVE
+    print("filtered out {} docs".format(len(filtered)))
     save_filtered_docs(filtered)
     save_selected_docs(selected)
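Side note on the relocated remove_whitespace_chars: the removed version replaced matches of [\n\t\r]* with a space, and because * also matches the empty string at every position, it inserts a space between every pair of characters. The re-added version replaces with "", which only deletes the target characters (though words split across a newline get glued together); the call site stays commented out either way. A small demonstration (output shown for Python 3.7+), plus the conventional + form as a hypothetical fix not part of this commit:

    import re

    text = "ab\ncd"

    # Removed version: "*" also matches the empty string at every position,
    # so the replacement space lands between every pair of characters.
    print(re.sub(r"[\n\t\r]*", " ", text))  # ' a b  c d '

    # Re-added version: empty matches are replaced with nothing, so only the
    # actual \n, \t, \r characters are deleted ('ab' and 'cd' get glued).
    print(re.sub(r"[\n\t\r]*", "", text))   # 'abcd'

    # Hypothetical fix: require at least one whitespace character and
    # collapse the run to a single space.
    print(re.sub(r"[\n\t\r]+", " ", text))  # 'ab cd'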
@@ -266,8 +273,12 @@ def calc_prob():
     topics = get_topic_dict(topic_model)
     sums["topic"] = pd.Series([x[1] for x in topics.values()])
-    print(sums.to_string())
-    print(all_sum)
+    #print(sums.to_string())
+    #print(all_sum)
+    with open(f"probability.txt", "w") as outfile:
+        outfile.write(sums.to_string()+"\n")
+        outfile.write(f"All sum: {all_sum}\n")
+
 
 ##################################################
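The new probability.txt writer benefits from DataFrame.to_string(), which renders every row and column rather than the truncated view pandas prints for large frames. A self-contained sketch of the same pattern, with made-up numbers standing in for the per-topic sums:

    import pandas as pd

    # Made-up numbers standing in for the per-topic probability sums.
    sums = pd.DataFrame({"prob_sum": [3.2, 1.7, 0.4],
                         "topic": ["security", "automation", "algorithms"]})
    all_sum = sums["prob_sum"].sum()

    # to_string() renders the whole frame; print(df) truncates large frames.
    with open("probability.txt", "w") as outfile:
        outfile.write(sums.to_string() + "\n")
        outfile.write(f"All sum: {all_sum}\n")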
@@ -275,21 +286,42 @@ def calc_prob():
 def visualize_barchart():
     print("visualize_barchart")
     topic_model = BERTopic.load("filtered_model")
-    fig = topic_model.visualize_barchart(top_n_topics=10)
-    fig.write_html("barchart.html")
+    fig = topic_model.visualize_barchart(top_n_topics=12)
+    fig.write_image("barchart.svg")
 
 def visualize_topics():
     print("visualize_topics")
     topic_model = BERTopic.load("filtered_model")
     fig = topic_model.visualize_topics()
-    fig.write_html("visual.html")
+    fig.write_image("visual.svg")
+
+def visualize_distribution():
+    print("visualize_distribution")
+    topic_model = BERTopic.load("filtered_model")
+    probs = un_pickelize("probs")
+    for i in range(len(probs)):
+        path = os.path.join("/home/gusbo010/reddit-scraper/src/distribu", f"{i}_distribution.svg")
+        fig = topic_model.visualize_distribution(probs[i], min_probability=0.000001)
+        fig.write_image(path)
+
+def visualize_hierarchy():
+    print("visualize_hierarchy")
+    topic_model = BERTopic.load("filtered_model")
+    probs = un_pickelize("probs")
+    fig = topic_model.visualize_hierarchy()
+    fig.write_image("hierarchy.svg")
 
 if __name__ == "__main__":
-    filter_docs()
-    train_model()
-    write_topics_str()
-    write_topics_docs()
-    calc_prob()
-    visualize_barchart()
-    visualize_topics()
+    #filter_docs()
+    #train_model()
+    #write_topics_str()
+    #write_topics_docs()
+    #calc_prob()
+    #visualize_barchart()
+    #visualize_topics()
+    #visualize_hierarchy()
+    visualize_distribution()
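On the write_html to write_image switch throughout this hunk: BERTopic's visualize_* methods return Plotly figures, and Plotly's static image export requires the kaleido package (pip install kaleido), while the HTML export has no extra dependency. A hedged sketch of the pattern, assuming a trained model saved under "filtered_model" as in this commit:

    from bertopic import BERTopic

    # Assumes a trained model saved under "filtered_model", as in this commit.
    topic_model = BERTopic.load("filtered_model")
    fig = topic_model.visualize_barchart(top_n_topics=12)  # a plotly Figure

    try:
        fig.write_image("barchart.svg")   # static export; needs `pip install kaleido`
    except ValueError:
        fig.write_html("barchart.html")   # dependency-free fallback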
 #!/usr/bin/python3
+from bertopic import BERTopic
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
+
+################################################## WHAT DATA ?
+# Load data
 def get_topics_strs(topic_model): #-> list[topic_str]
-    to_ret = {}
+    to_ret = []
     for k,v in topic_model.get_topics().items():
         words = [x[0] for x in v]
         topic_str = " ".join(words)
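For context on the to_ret = {} to to_ret = [] fix: BERTopic.get_topics() returns a dict mapping topic id to a list of (word, score) pairs (topic -1 collects outliers), so a list is the right accumulator for the advertised list[topic_str] return. A sketch with a toy dict standing in for the model output:

    # Toy stand-in for BERTopic.get_topics(), which maps
    # topic id -> [(word, c-TF-IDF score), ...]; id -1 collects outliers.
    fake_topics = {
        -1: [("the", 0.01), ("a", 0.01)],
        0: [("gpt", 0.42), ("security", 0.31), ("risk", 0.22)],
    }

    def get_topics_strs(topics):
        to_ret = []
        for k, v in topics.items():
            words = [x[0] for x in v]
            to_ret.append(" ".join(words))
        return to_ret

    print(get_topics_strs(fake_topics))  # ['the a', 'gpt security risk']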
@@ -15,15 +19,23 @@ def load_data():
     topic_model = BERTopic.load("filtered_model")
     return get_topics_strs(topic_model)
 
+##################################################
+# Sentiment analyser
 def main ():
+    # https://huggingface.co/siebert/sentiment-roberta-large-english/tree/main
     tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
     model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")
-    data = load_data()
     sentiment_classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+    data = load_data() #What data?
     results = sentiment_classifier(data)
-    print(results)
+    #for i in range(len(data)):
+    #    print(data[i],results[i])
+    with open(f"sentiment.txt", "w") as outfile:
+        for i in range(len(data)):
+            outfile.write("{:2} {} -- {}\n".format(i,results[i]["label"], data[i]))
 
 if __name__ == "__main__":
     main()
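The new sentiment.txt writer depends on the pipeline's output shape: a transformers pipeline("sentiment-analysis", ...) call returns one {"label": ..., "score": ...} dict per input string, and for this checkpoint the labels are POSITIVE/NEGATIVE. A minimal sketch with made-up inputs standing in for the topic strings produced by load_data():

    from transformers import pipeline

    # Same checkpoint as the commit; the inputs are made-up stand-ins for the
    # topic strings produced by load_data().
    clf = pipeline("sentiment-analysis",
                   model="siebert/sentiment-roberta-large-english")
    data = ["gpt security risk breach", "automation jobs growth opportunity"]
    results = clf(data)  # e.g. [{'label': 'NEGATIVE', 'score': 0.99}, ...]

    with open("sentiment.txt", "w") as outfile:
        for i in range(len(data)):
            outfile.write("{:2} {} -- {}\n".format(i, results[i]["label"], data[i]))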