Commit 7fe740cf authored by Love Arreborn

SCREAM operating with Stanza output

parent 1a530f5a
Pipeline #131688 skipped
......@@ -19,6 +19,8 @@ Aaron Smith <aaron.smith@lingfil.uu.se>
import pprint
import time
import stanza
from math import isfinite
# Import Scream
from scream.metrics import LexicalMetrics
......@@ -47,14 +49,51 @@ from scream.document_parts import DependencyParsedDocument
MAX_TOKEN = 256
nlp = stanza.Pipeline(lang="sv", processors="tokenize,pos,lemma,depparse,ner")
nlp = stanza.Pipeline(
lang="sv",
processors="tokenize,pos,lemma,depparse,ner",
download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)
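# NOTE: REUSE_RESOURCES makes Stanza reuse already-downloaded Swedish models
# instead of re-checking for (and re-downloading) them on every startup, which
# keeps pipeline initialization fast once the models are in place.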
# Simple table for conversion from Stanza's UPOS tags to SUC tags
upos_to_suc = {
"NOUN": "NN",
"VERB": "VB",
"ADJ": "JJ",
"ADV": "AB",
"PRON": "PN",
"DET": "DT",
"ADP": "PP",
"CONJ": "KN",
"NUM": "RG",
"PART": "PC",
"INTJ": "IE",
"PUNCT": "MAD",
"X": "XX",
"SYM": "MAD",
"SCONJ": "SN",
}
# Likewise, the dependency relation tags need to be mapped to match efselab's output
dep_rel_mapping = {
"case": "RA",
"obl": "PA",
"root": "ROOT",
"nsubj": "SS",
"det": "DT",
"amod": "AT",
"obj": "OO",
"punct": "IP",
}
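# Illustrative sketch (not part of the pipeline logic): both tables are applied
# with dict.get() so that any tag without an entry falls back to the raw Stanza
# label, e.g. for a single Stanza word:
#
#   suc_tag = upos_to_suc.get(word.upos, word.upos)          # "NOUN" -> "NN"
#   rel_tag = dep_rel_mapping.get(word.deprel, word.deprel)  # "nsubj" -> "SS"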
class AttrOptions:
"""A class to hold the options for the pipeline. Used in favor of a dict
to convert the keys to attributes. NOTE: Might be unnecessary."""
to convert the keys to attributes. NOTE: Might be unnecessary, could likely
just be a dict.
"""
def __init__(self, d):
def __init__(self, d: dict):
self.__dict__ = d
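# Minimal usage sketch (hypothetical option values; the rest of the pipeline only
# assumes that flags such as options.tagged, options.lemmatized and options.parsed
# exist as attributes):
#
#   opts = AttrOptions({"tagged": True, "lemmatized": True, "parsed": True})
#   opts.parsed  # -> True, read as an attribute instead of a dict key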
......@@ -99,47 +138,64 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
list: A list of dictionaries containing the processed data.
"""
start_time = time.time()
pipeline_start_time = time.time()
result = []
for filename in args:
# ================== STANZA-TAGGING ==================
start_time = time.time()
proc = process_file(options, filename)
time_checker(start_time, "Stanza")
# ================== SCREAM ==================
start_time = time.time()
depdoc = DependencyParsedDocument()
# depdoc.build_document(
# split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
# ) # proc[parsed] is the output from EFSELAB
# scream, additional_metrics = split_measures_to_scream_headings(
# StructuralMetrics(depdoc)
# )
depdoc.build_document(
split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
) # proc[parsed] is the Stanza output converted to efselab-style columns
scream, additional_metrics = split_measures_to_scream_headings(
StructuralMetrics(depdoc)
)
time_checker(start_time, "SCREAM")
print(f"FILENAME: {filename}")
# ================== STILETT ==================
# start_time = time.time()
# simplified_text, sentence_transformations = ud_text_simplifier.sapis_wrapper(
# filename
# )
# time_checker(start_time, "Stilett")
# print(simplified_text, sentence_transformations)
# print(f"proc {proc}")
# print(f"PARAGRAPHS {proc['paragraphs']}" )
elapsed_time1 = time.time() - start_time
# ================== COH-METRIX ==================
# start_time = time.time()
# coh_metrix_cohesion = cohesion.run_cohesion(proc['parsed'])
# coh_metrix_lsa = lsa.run_LSA(proc['parsed'], proc['paragraphs'])
# coh_metrix_connectives = connectives.run_connectives(proc['parsed'])
# synonym_dict = synonyms.run_synonyms(proc['parsed'])
# coh_metrix_cohesion = {"cohesion": coh_metrix_cohesion}
# coh_metrix_connectives = {"connectives": coh_metrix_connectives}
# coh_metrix_lsa = {"LSA" : coh_metrix_lsa}
# coh_metrix = {**coh_metrix_cohesion, **coh_metrix_connectives, **coh_metrix_lsa }
# coh_metrix = {}
# time_checker(start_time, "Coh-Metrix")
# ================== SYNONYMS ==================
# start_time = time.time()
# synonym_dict = synonyms.run_synonyms(proc['parsed'])
# time_checker(start_time, "Synonyms")
result.append(
{
"input": filename,
"efselab": proc,
# "scream": scream,
# "additional_metrics": additional_metrics,
"scream": scream,
"additional_metrics": additional_metrics,
# "stillett": {
# "simplified_text": simplified_text,
# "sentence_transformations": sentence_transformations,
......@@ -157,11 +213,21 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
# print(d['coh-metrix'])
# print(l[-2])
time_checker(pipeline_start_time, "Pipeline")
pprint.pp(result)
return result
def prep_parsed_for_build_doc(parsed):
def prep_parsed_for_build_doc(parsed: list) -> str:
"""
Prepare parsed data for building a DependencyParsedDocument. Required for SCREAM.
Args:
parsed (list): The parsed data.
Returns:
str: The parsed data serialized for building a DependencyParsedDocument.
"""
return "\n".join(
[
"\t".join(map(str, word_info))
......@@ -196,7 +262,10 @@ def process_file(options: AttrOptions, filename: str) -> dict:
if options.tagged or options.lemmatized or options.parsed:
tagged = [
[(word.text, word.upos) for word in sentence.words]
[
(word.text, upos_to_suc.get(word.upos, word.upos))
for word in sentence.words
]
for sentence in doc.sentences
]
lemmas = [[word.lemma for word in sentence.words] for sentence in doc.sentences]
......@@ -211,13 +280,14 @@ def process_file(options: AttrOptions, filename: str) -> dict:
parsed = [
[
(
word.id,
str(word.id),
word.text,
word.lemma,
word.upos,
word.xpos,
word.head,
word.deprel,
upos_to_suc.get(word.upos, word.upos),
upos_to_suc.get(word.upos, word.upos),
"|".join(upos_to_suc.get(word.xpos, word.xpos).split("|")[1:]),
str(word.head) if word.head != 0 else "0",
dep_rel_mapping.get(word.deprel, word.deprel),
)
for word in sentence.words
]
......@@ -233,7 +303,16 @@ def process_file(options: AttrOptions, filename: str) -> dict:
}
def split_sentences_from_tagger(resp) -> list:
def split_sentences_from_tagger(resp: str) -> list:
"""Split the sentences from the tagger output.
Args:
resp (str): The tagger output.
Returns:
list: The sentences split from the tagger output.
"""
sentences = []
sentence = []
for line in resp.split("\n"):
......@@ -251,7 +330,14 @@ def split_sentences_from_tagger(resp) -> list:
def split_measures_to_scream_headings(structural_instance) -> dict:
from math import isfinite
"""Split the measures to SCREAM headings.
Args:
structural_instance: The structural instance.
Returns:
dict: The measures split to SCREAM headings.
"""
calculated_metrics = vars(structural_instance)
structural_vars = [
......@@ -340,5 +426,21 @@ def split_measures_to_scream_headings(structural_instance) -> dict:
return fixed_dict, extra_dict
def time_checker(start_time: float, method: str) -> None:
"""Check the time elapsed since the start time.
Args:
start_time (float): The start time.
method (str): The method being run.
"""
elapsed_time = time.time() - start_time
print(f"{method}, elapsed time: {elapsed_time}")
if __name__ == "__main__":
main("I skolan äter jag ett rött äpple.".encode("utf-8"))
# main(
# "Det finns ett stort antal meningar som är onödigt långa, och vi behöver se till att dessa kan taggas godtyckligt. Denna, till synes, enkla uppgift är inte alltid så enkel.".encode(
# "utf-8"
# )
# )
......@@ -3,6 +3,7 @@ import os
from scream import conf
from scream import helper_methods
class FinalizeError(Exception):
def __init__(self, *args):
"""
......@@ -11,6 +12,7 @@ class FinalizeError(Exception):
"""
super().__init__(*args)
# Document parts
class Sentence:
def __init__(self):
......@@ -51,7 +53,9 @@ class Sentence:
:return: maximum tree depth
"""
if not self.finalized:
raise FinalizeError("The sentence is not finalized, please finalize sentence before calculating the depth.")
raise FinalizeError(
"The sentence is not finalized, please finalize sentence before calculating the depth."
)
return self.root.get_depth()
def assign_root(self) -> None:
......@@ -109,7 +113,8 @@ class Sentence:
if len(self.verb_arities) == 0:
if not self.finalized:
raise FinalizeError(
"The sentence is not finalized, please finalize sentence before calculating the total verb arity.")
"The sentence is not finalized, please finalize sentence before calculating the total verb arity."
)
return self.root.calculate_verb_arities(self)
def get_tokens(self):
......@@ -117,12 +122,17 @@ class Sentence:
return self.unigram_representation + [self.root]
return self.unigram_representation
class SwevocManager:
def __init__(self):
""" A manager for SweVoc."""
"""A manager for SweVoc."""
self._swe_voc = dict()
self._categories = set()
self._load_swe_voc(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file))
self._load_swe_voc(
os.path.join(
os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file
)
)
def _load_swe_voc(self, path):
"""
......@@ -139,7 +149,9 @@ class SwevocManager:
tag = conf.parole_to_suc_conversion[split_line[1]]
word = split_line[2]
categories = {category.strip() for category in split_line[3].split(",")}
categories = {
category.strip() for category in split_line[3].split(",")
}
if word not in self._swe_voc.keys():
self._swe_voc[word] = dict()
......@@ -176,8 +188,11 @@ class SwevocManager:
except KeyError:
return set()
class Token:
def __init__(self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None):
def __init__(
self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None
):
"""
A class representing a token. Mainly used for the tree based syntactic
representation of sentences used in the Sentence class.
......@@ -200,10 +215,10 @@ class Token:
def __str__(self) -> str:
print_list = [
"Lemma: " + self.lemma,
"PoS tag: " + self.pos_tag,
"Dependency relation: " + self.dep_rel,
"Ref: " + str(self.ref)
f"Lemma: {self.lemma}",
f"PoS tag: {self.pos_tag}",
f"Dependency relation: {self.dep_rel}",
f"Ref: {self.ref}",
]
return "\n".join(print_list)
......@@ -259,6 +274,7 @@ class Token:
return 0
return tmp
# Documents
class PosTaggedDocument:
def __init__(self):
......@@ -320,25 +336,29 @@ class PosTaggedDocument:
"""
for sentence in pos_tagged_sentences:
sentence_object = Sentence()
#print("BUILD_DOC", sentence)
# print("BUILD_DOC", sentence)
for token in sentence:
#print(token)
# print(token)
token_string = token[1]
pos_tag = token[4].split('|')[0]
pos_tag = token[4].split("|")[0]
try:
lemma = token[2]
dep_rel = token[7]
ref = int(token[0])
dep_head_ref = int(token[6])
except IndexError:
lemma = token[2] #None #TODO: implement lemmatization or remove the lemma statistics from PosTaggedDocument
lemma = token[
2
] # None #TODO: implement lemmatization or remove the lemma statistics from PosTaggedDocument
dep_rel = None
ref = None
dep_head_ref = None
except ValueError:
print(token[6], '\n', sentence)
print(token[6], "\n", sentence)
token_object = self.create_token(token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref)
token_object = self.create_token(
token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref
)
sentence_object.add_token(token_object)
self.add_token_statistics(token_object)
......@@ -383,15 +403,14 @@ class PosTaggedDocument:
if token.token not in self.word_dict.keys():
self.word_dict[token.token] = 0
self.n_unique_words += 1
if token.token.lower() not in self.lower_token_dict.keys():
self.lower_token_dict[token.token.lower()] = 0
if token.lemma not in self.lemma_dict.keys():
self.lemma_dict[token.lemma] = 0
self.n_unique_lemma += 1
self.word_dict[token.token] += 1
self.lemma_dict[token.lemma] += 1
self.n_words += 1
......@@ -399,7 +418,7 @@ class PosTaggedDocument:
self.total_word_length += len(token.lemma)
self.n_syllables += token.syllables
#if len(token.lemma) > conf.lix_limit: #changed this to token.token
# if len(token.lemma) > conf.lix_limit: #changed this to token.token
if len(token.token) > conf.lix_limit:
self.n_lix_long_words += 1
......@@ -450,6 +469,7 @@ class PosTaggedDocument:
self.n_content_words -= 1
self.n_verbs -= 1
class DependencyParsedDocument(PosTaggedDocument):
def __init__(self):
PosTaggedDocument.__init__(self)
......@@ -521,30 +541,32 @@ class DependencyParsedDocument(PosTaggedDocument):
"""
PosTaggedDocument.add_sentence_statistics(self, sentence)
#print('================')
#print(sentence)
#print('================')
# print('================')
# print(sentence)
# print('================')
sentence.finalize()
self.total_sentence_depth += sentence.get_depth()
self.total_verb_arity += sentence.get_total_verb_arity()
#print(self.total_verb_arity)
#print(f"sentence verb arities{sentence.verb_arities}")
for arity, number in zip(sentence.verb_arities.keys(), sentence.verb_arities.values()):
# print(self.total_verb_arity)
# print(f"sentence verb arities{sentence.verb_arities}")
for arity, number in zip(
sentence.verb_arities.keys(), sentence.verb_arities.values()
):
# HAVE COMMENTED OUT VERB ARITIES FOR NOW BECAUSE IT DOESN'T WORK, BUT DON'T KNOW WHY
#Works now?
# Works now?
if int(arity) >= len(conf.verb_arities):
self.verb_arities_dict[9] += 1
else:
else:
self.verb_arities_dict[arity] += 1
if sentence.has_verbial_root():
self.n_verbal_roots += 1
def add_dep_statistics(self, token) -> None:
def add_dep_statistics(self, token: Token) -> None:
"""
Incremebts the following attributes:
Increments the following attributes:
The amount of dependency tags
The amount of dependencies
The amount of right dependencies (given positive relation distance)
......@@ -555,16 +577,21 @@ class DependencyParsedDocument(PosTaggedDocument):
The amount of post modifiers (if the relation equals the predefined relation)
The amount of pre modifiers (if the relation equals the predefined relation)
The amount of preposition compositions (if the relation equals the predefined relation)
:param token: Token
Args:
token: Token
"""
dep_distance = token.ref - token.dep_head_ref # TODO: be careful here, this can go wrong. Check extractor: handleDependency
print(token)
dep_distance = (
token.ref - token.dep_head_ref
) # TODO: be careful here, this can go wrong. Check extractor: handleDependency
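# A positive dep_distance means the token stands to the right of its head
# (a right dependency); a negative distance means it precedes its head.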
self.n_dep_tags += 1
self.n_dependencies += 1
#print("HITME!")
#print("t d l", token.dep_rel)
# print("HITME!")
# print("t d l", token.dep_rel)
if token.dep_rel in conf.dep_types:
self.dep_type_dict[token.dep_rel] += 1
#print("HITME!")
# print("HITME!")
if token.dep_rel == conf.subclause_dep:
self.n_sub_clauses += 1
if dep_distance > 0:
......