Skip to content
Snippets Groups Projects
Commit abd74584 authored by Gustaf Lindgren Bodemar's avatar Gustaf Lindgren Bodemar
Browse files

done..?

parent c438b3a5
No related branches found
No related tags found
No related merge requests found
......@@ -156,14 +156,10 @@ def filter_docs():
#Keyword search data
if "gpt" in s:
for kw_lst in keywords:
#print("kw_lst:",kw_lst, "\ns:",s)
if all((kw in s for kw in kw_lst)):
selected.append(select)
filtered.append(s)
break
#break #REMOVE
#print("###",s,select)
#break #REMOVE
print("filtered out {} docs".format(len(filtered)))
......@@ -273,9 +269,6 @@ def calc_prob():
topics = get_topic_dict(topic_model)
sums["topic"] = pd.Series([x[1] for x in topics.values()])
#print(sums.to_string())
#print(all_sum)
with open(f"probability.txt", "w") as outfile:
outfile.write(sums.to_string()+"\n")
outfile.write(f"All sum: {all_sum}\n")
......
#!/usr/bin/python3
import json
from bertopic import BERTopic
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import pickle as pk
################################################## WHAT DATA ?
# Load data
def get_topics_strs(topic_model):
    """Return a list of space-joined keyword strings, one per topic.

    topic_model.get_topics() maps topic id -> [(word, weight), ...]; only
    the words are kept.
    """
    to_ret = []
    for topic_words in topic_model.get_topics().values():
        to_ret.append(" ".join(word for word, _weight in topic_words))
    # BUG fix: the original built to_ret but fell off the end, returning None.
    return to_ret
def get_topic_dict(topic_model):
    """Return a dict of topics: {topic_id: (count, name)}.

    Built from the model's topic-info table; only the first three columns
    (id, count, name) are used.
    """
    info = {}
    for _, row in topic_model.get_topic_info().iterrows():
        topic_id, count, name = tuple(row)[:3]
        info[topic_id] = (count, name)
    return info
def load_data():
def load_topics():
    """Load the saved BERTopic model and return {topic_id: (count, name)}.

    BUG fix: the original had two consecutive returns; the first
    (`return get_topics_strs(topic_model)`) made the dict return unreachable.
    calc_sentiment_topic() indexes topics[i][1] to get the topic *name*,
    which requires the {id: (count, name)} mapping, so that is what we return.
    """
    topic_model = BERTopic.load("filtered_model")
    return get_topic_dict(topic_model)
def pickelize(obj, file_path):
    """Serialize *obj* to *file_path* using pickle."""
    with open(file_path, "wb") as sink:
        pk.dump(obj, sink)
def un_pickelize(file_path):
    """Deserialize and return the object pickled at *file_path*."""
    with open(file_path, "rb") as source:
        obj = pk.load(source)
    return obj
#Same as load_filtered_docs except strings saved unclean
def load_filtered_docs():
    """Load documents from filtered_docs.json, each truncated to 512 chars.

    Document index 1759 is replaced by a hard-coded English summary
    (original workaround for Chinese characters the pipeline can't handle).
    """
    with open("filtered_docs.json", "r") as source:
        raw_docs = json.load(source)
    docs = []
    for idx, doc in enumerate(raw_docs):
        if idx == 1759:
            docs.append("original article wsj article chinese version wsj article english version google translation chinese article chat gpt translation chinese article")
        else:
            docs.append(doc[:512])
    return docs
def save_sentiment(data):
    """Write sentiment results to sentiment.json (overwrites any existing file)."""
    with open("sentiment.json", "w") as sink:
        json.dump(data, sink)
def load_sentiment():
    """Read and return the sentiment results stored in sentiment.json."""
    with open("sentiment.json", "r") as source:
        return json.load(source)
##################################################
# Sentiment analyser
def main ():
    """Classify the filtered docs with a RoBERTa sentiment model, checkpointing
    per-document results to sentiment.json as it goes."""
    # Model card: https://huggingface.co/siebert/sentiment-roberta-large-english/tree/main
    #https://huggingface.co/siebert/sentiment-roberta-large-english/tree/main
    tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
    model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")
    sentiment_classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    # NOTE(review): both `data` and `results` below are immediately overwritten,
    # so this classification run looks like dead work — confirm load_data() is
    # still needed (its body is not visible in this view).
    data = load_data() #What data?
    results = sentiment_classifier(data)
    print("loaded sentiment_classifier")
    #for i in range(len(data)):
    #    print(data[i],results[i])
    data = load_filtered_docs()
    results = None
    try:
        # Resume from a previous partial run if sentiment.json exists.
        results = load_sentiment()
        print("loaded {} results".format(len(results)))
    except:
        # NOTE(review): bare except also hides real errors (e.g. corrupt JSON).
        print("result bad loading null set")
        results = {}
    for i in range(0,len(data)):
        try:
            print(i)
            if i in results:
                # NOTE(review): results loaded back from JSON have *string* keys,
                # so this int-key membership check never fires on resumed runs —
                # confirm intended behavior.
                raise Exception(f"{i} in results already")
            # Pipeline returns a one-element list like [{"label": ..., "score": ...}].
            results[i] = sentiment_classifier( data[i] )
            #print(results)
        except:
            # On any failure, checkpoint what we have so far and stop.
            save_sentiment(results)
            break
    save_sentiment(results)
##################################################
# Calc what people feel about it
def calc_sentiment_topic():
    """Aggregate per-document sentiment into per-topic sentiment and write
    sentiment.txt (one line per doc, then one tilt line per topic).

    Inputs (all produced by earlier pipeline stages):
      - load_topics():        {topic_id: (count, name)}
      - un_pickelize("probs"): doc x topic probability matrix
      - load_sentiment():     JSON dict str(doc_idx) -> [{"label": ..., ...}]
    """
    topics = load_topics()
    probs = un_pickelize("probs")
    results = load_sentiment()
    # BUG fix: `data` was referenced below but never defined in this function
    # (NameError); the filtered docs are what the result indices refer to.
    data = load_filtered_docs()
    print("loaded results")
    n_topics = len(probs[0])
    topic_sentiments = [0 for _ in range(n_topics)]  # signed sum per topic
    abs_sentiments = [0 for _ in range(n_topics)]    # magnitude sum per topic
    for i in range(len(probs)):  # loop through doc sentiments
        # +1 for positive documents, -1 for negative
        doc_sent = 1 if results[str(i)][0]["label"] == "POSITIVE" else -1
        doc_topic_probs = probs[i]
        for j in range(n_topics):  # loop over each topic
            contribution = doc_sent * doc_topic_probs[j]
            topic_sentiments[j] += contribution
            abs_sentiments[j] += abs(contribution)
    with open("sentiment.txt", "w") as outfile:
        for i in range(len(data)):
            # BUG fix: results keys are strings after the JSON round-trip and
            # each value is a one-element list from the HF pipeline, so the
            # original `results[i]["label"]` access could never work.
            outfile.write("{:2} {} -- {}\n".format(i, results[str(i)][0]["label"], data[i]))
        for i in range(n_topics):
            # Tilt: signed sentiment as a percentage of total sentiment magnitude.
            tilt = round((topic_sentiments[i] / abs_sentiments[i]) * 100, 1)
            outfile.write("{:2} {:19} //{:19} {:5}% -- {}\n".format(
                i, topic_sentiments[i], abs_sentiments[i], tilt, topics[i][1]))
if __name__ == "__main__":
    # NOTE(review): the scrape lost indentation; assuming both calls run under
    # the main guard (calc_sentiment_topic at top level would run on import) —
    # confirm against the original file.
    main()
    #main()
    calc_sentiment_topic()
#"POSITIVE"
#"NEGATIVE"
#"label"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment