diff --git a/src/bert.py b/src/bert.py
index 43c6d8e4c44393af9889a2eec7217ed47f060fc4..7ac141dfeb3f6723cf8e4b1725839de08fef5ff9 100644
--- a/src/bert.py
+++ b/src/bert.py
@@ -156,14 +156,10 @@ def filter_docs():
         #Keyword search data
         if "gpt" in s:
             for kw_lst in keywords:
-                #print("kw_lst:",kw_lst, "\ns:",s)
                 if all((kw in s for kw in kw_lst)):
                     selected.append(select)
                     filtered.append(s)
                     break
-            #break #REMOVE
-            #print("###",s,select)
-        #break #REMOVE
 
     print("filtered out {} docs".format(len(filtered)))
 
@@ -273,9 +269,6 @@ def calc_prob():
     topics = get_topic_dict(topic_model)
     sums["topic"] = pd.Series([x[1] for x in topics.values()])
 
-    #print(sums.to_string())
-    #print(all_sum)
-
     with open(f"probability.txt", "w") as outfile:
         outfile.write(sums.to_string()+"\n")
         outfile.write(f"All sum: {all_sum}\n")
diff --git a/src/sent_analys.py b/src/sent_analys.py
index be544a24362e5c9d44ff2e1ad3b2fb4972acc642..b289012e8f92dba720dcb7388229202f519e79ff 100644
--- a/src/sent_analys.py
+++ b/src/sent_analys.py
@@ -1,41 +1,119 @@
 #!/usr/bin/python3
+import json
 from bertopic import BERTopic
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
+import pickle as pk
+
 ################################################## WHAT DATA ?
 # Load data
-def get_topics_strs(topic_model): #-> list[topic_str]
-    to_ret = []
-    for k,v in topic_model.get_topics().items():
-        words = [x[0] for x in v]
-        topic_str = " ".join(words)
-        to_ret.append(topic_str)
+#Get a dict of topics {id: (count, name)}
+def get_topic_dict(topic_model):
+    to_ret = {}
+    for _, row in topic_model.get_topic_info().iterrows():
+        to_ret[row["Topic"]] = (row["Count"], row["Name"])
     return to_ret
 
-def load_data():
+def load_topics():
     topic_model = BERTopic.load("filtered_model")
-    return get_topics_strs(topic_model)
+    return get_topic_dict(topic_model)
+
+def save_pickle(obj, file_path):
+    with open(file_path, "wb") as f:
+        pk.dump(obj, f)
+
+def load_pickle(file_path):
+    with open(file_path, "rb") as f:
+        return pk.load(f)
+
+#Load the filtered docs saved by bert.py; the strings are stored uncleaned,
+#so each doc is truncated to 512 characters before classification
+def load_filtered_docs():
+    to_ret = []
+    with open("filtered_docs.json", "r") as f:
+        docs = json.load(f)
+    for i in range(len(docs)):
+        if i == 1759: #special case: mostly Chinese characters, which the English model cannot score, so substitute a hand-made English summary
+            to_ret.append("original article wsj article chinese version wsj article english version google translation chinese article chat gpt translation chinese article")
+        else:
+            to_ret.append(docs[i][:512])
+    return to_ret
+
+def save_sentiment(data):
+    with open("sentiment.json", "w") as outfile:
+        json.dump(data, outfile)
+
+def load_sentiment():
+    with open("sentiment.json", "r") as f:
+        return json.load(f)
+
 ##################################################
 # Sentiment analyser
 
 def main ():
     # https://huggingface.co/siebert/sentiment-roberta-large-english/tree/main
     tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
     model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")
     sentiment_classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
-    data = load_data() #What data?
-    results = sentiment_classifier(data)
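+    # Assumption (added note, no behavior change): the RoBERTa model accepts at
+    # most 512 tokens per input, which is why load_filtered_docs() pre-truncates
+    # each doc to 512 characters. If the installed transformers version forwards
+    # tokenizer kwargs through the pipeline call, an alternative sketch would be:
+    #   sentiment_classifier(text, truncation=True, max_length=512)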
+    print("loaded sentiment_classifier")
 
-    #for i in range(len(data)):
-    #    print(data[i],results[i])
+
+    data = load_filtered_docs()
+    try:
+        results = load_sentiment()
+        print("loaded {} results".format(len(results)))
+    except (FileNotFoundError, json.JSONDecodeError):
+        print("no saved sentiment found, starting with an empty set")
+        results = {}
+
+    for i in range(len(data)):
+        key = str(i) #JSON object keys are strings after a reload
+        if key in results:
+            continue #already classified on an earlier run
+        try:
+            print(i)
+            results[key] = sentiment_classifier(data[i])
+        except (Exception, KeyboardInterrupt):
+            save_sentiment(results) #checkpoint what we have, then stop
+            break
+    save_sentiment(results)
+
+
+##################################################
+# Calc what people feel about each topic
+
+def calc_sentiment_topic():
+    topics = load_topics()
+    probs = load_pickle("probs")
+    results = load_sentiment()
+    print("loaded results")
+
+    n_topics = len(probs[0])
+    topic_sentiments = [0] * n_topics #signed sentiment mass per topic
+    abs_sentiments = [0] * n_topics #total (unsigned) mass per topic
+
+    for i in range(len(probs)): #loop over the documents
+        doc_sent = 1 if results[str(i)][0]["label"] == "POSITIVE" else -1
+        doc_topic_probs = probs[i]
+        for j in range(n_topics): #spread the doc's sentiment over its topics
+            topic_sentiments[j] += doc_sent * doc_topic_probs[j]
+            abs_sentiments[j] += abs(doc_sent * doc_topic_probs[j])
 
     with open(f"sentiment.txt", "w") as outfile:
-        for i in range(len(data)):
-            outfile.write("{:2} {} -- {}\n".format(i,results[i]["label"], data[i]))
+        for i in range(n_topics):
+            tilt = round(topic_sentiments[i] / abs_sentiments[i] * 100, 1) if abs_sentiments[i] else 0.0
+            outfile.write("{:2} {:19} //{:19} {:5}% -- {}\n".format(i, topic_sentiments[i], abs_sentiments[i], tilt, topics[i][1]))
 
 if __name__ == "__main__":
-    main()
+    #main() #run first to build sentiment.json
+    calc_sentiment_topic()
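
A minimal sketch (not part of the patch) of the aggregation that
calc_sentiment_topic() performs, assuming probs is the (n_docs, n_topics)
probability matrix produced by BERTopic and that numpy is available:

    import numpy as np

    # toy input: 3 docs, 2 topics
    probs = np.array([[0.8, 0.2],
                      [0.1, 0.9],
                      [0.5, 0.5]])
    doc_sent = np.array([1, -1, 1])            # +1 POSITIVE, -1 NEGATIVE per doc

    topic_sentiments = doc_sent @ probs        # signed sentiment mass per topic
    abs_sentiments = np.abs(doc_sent) @ probs  # total mass; equals probs.sum(axis=0) since |±1| == 1
    tilt = 100 * topic_sentiments / abs_sentiments
    print(tilt)                                # per-topic tilt in [-100, 100], here about [85.7, -12.5]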