diff --git a/pipeline.py b/pipeline.py
index 4d1dd3c208429e2cc23d648184361960bf6af866..8eea3bb0c2c7b58ae70ca5ae5fd2ef727e610a0c 100755
--- a/pipeline.py
+++ b/pipeline.py
@@ -19,6 +19,8 @@ Aaron Smith <aaron.smith@lingfil.uu.se>
 import pprint
 import time
 import stanza
+from math import isfinite
+
 
 # Import Scream
 from scream.metrics import LexicalMetrics
@@ -47,14 +49,51 @@ from scream.document_parts import DependencyParsedDocument
 
 MAX_TOKEN = 256
 
-nlp = stanza.Pipeline(lang="sv", processors="tokenize,pos,lemma,depparse,ner")
+nlp = stanza.Pipeline(
+    lang="sv",
+    processors="tokenize,pos,lemma,depparse,ner",
+    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
+)
+
+# Simple table for conversion from Stanza's UPOS tags to SUC tags
+upos_to_suc = {
+    "NOUN": "NN",
+    "VERB": "VB",
+    "ADJ": "JJ",
+    "ADV": "AB",
+    "PRON": "PN",
+    "DET": "DT",
+    "ADP": "PP",
+    "CCONJ": "KN",  # UD v2 uses CCONJ; Stanza does not emit the older CONJ tag
+    "NUM": "RG",
+    "PART": "PC",
+    "INTJ": "IE",
+    "PUNCT": "MAD",
+    "X": "XX",
+    "SYM": "MAD",
+    "SCONJ": "SN",
+}
+
+# Likewise, the dependency relation tags need to be mapped to match efselab's output
+dep_rel_mapping = {
+    "case": "RA",
+    "obl": "PA",
+    "root": "ROOT",
+    "nsubj": "SS",
+    "det": "DT",
+    "amod": "AT",
+    "obj": "OO",
+    "punct": "IP",
+}
 
 
 class AttrOptions:
     """A class to hold the options for the pipeline. Used in favor of a dict
-    to convert the keys to attributes. NOTE: Might be unnecessary."""
+    to convert the keys to attributes. NOTE: Might be unnecessary, could likely
+    just be a dict.
+    """
 
-    def __init__(self, d):
+    def __init__(self, d: dict):
         self.__dict__ = d
 
@@ -99,47 +138,64 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
         list: A list of dictionaries containing the processed data.
     """
 
-    start_time = time.time()
+    pipeline_start_time = time.time()
     result = []
     for filename in args:
+
+        # ================== STANZA-TAGGING ==================
+        start_time = time.time()
         proc = process_file(options, filename)
+        time_checker(start_time, "Stanza")
+
+        # ================== SCREAM ==================
+        start_time = time.time()
         depdoc = DependencyParsedDocument()
-        # depdoc.build_document(
-        #     split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
-        # )  # proc[parsed] is the output from EFSELAB
-        # scream, additional_metrics = split_measures_to_scream_headings(
-        #     StructuralMetrics(depdoc)
-        # )
+        depdoc.build_document(
+            split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
+        )  # proc["parsed"] is the output from EFSELAB
+        scream, additional_metrics = split_measures_to_scream_headings(
+            StructuralMetrics(depdoc)
+        )
+        time_checker(start_time, "SCREAM")
 
         print(f"FILENAME: {filename}")
+        # ================== STILETT ==================
+        # start_time = time.time()
         # simplified_text, sentence_transformations = ud_text_simplifier.sapis_wrapper(
         #     filename
         # )
+        # time_checker(start_time, "Stilett")
         # print(simplified_text, sentence_transformations)
 
         # print(f"proc {proc}")
         # print(f"PARAGRAPHS {proc['paragraphs']}" )
-        elapsed_time1 = time.time() - start_time
 
         # ================== COH-METRIX ==================
+        # start_time = time.time()
         # coh_metrix_cohesion = cohesion.run_cohesion(proc['parsed'])
         # coh_metrix_lsa = lsa.run_LSA(proc['parsed'], proc['paragraphs'])
         # coh_metrix_connectives = connectives.run_connectives(proc['parsed'])
-        # synonym_dict = synonyms.run_synonyms(proc['parsed'])
         # coh_metrix_cohesion = {"cohesion": coh_metrix_cohesion}
         # coh_metrix_connectives = {"connectives": coh_metrix_connectives}
         # coh_metrix_lsa = {"LSA" : coh_metrix_lsa}
         # coh_metrix = {**coh_metrix_cohesion, **coh_metrix_connectives, **coh_metrix_lsa }
         # coh_metrix = {}
+        # time_checker(start_time, "Coh-Metrix")
+
+        # ================== SYNONYMS ==================
+        # start_time = time.time()
+        # synonym_dict = synonyms.run_synonyms(proc['parsed'])
+        # time_checker(start_time, "Synonyms")
+
         result.append(
             {
                 "input": filename,
                 "efselab": proc,
-                # "scream": scream,
-                # "additional_metrics": additional_metrics,
+                "scream": scream,
+                "additional_metrics": additional_metrics,
                 # "stillett": {
                 #     "simplified_text": simplified_text,
                 #     "sentence_transformations": sentence_transformations,
@@ -157,11 +213,21 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
 
     # print(d['coh-metrix'])
     # print(l[-2])
+    time_checker(pipeline_start_time, "Pipeline")
     pprint.pp(result)
 
     return result
 
 
-def prep_parsed_for_build_doc(parsed):
+def prep_parsed_for_build_doc(parsed: list) -> str:
+    """
+    Prepare parsed data for building a DependencyParsedDocument. Required for SCREAM.
+
+    Args:
+        parsed (list): The parsed data.
+
+    Returns:
+        str: The parsed data serialized for building a DependencyParsedDocument.
+    """
     return "\n".join(
         [
             "\t".join(map(str, word_info))
@@ -196,7 +262,10 @@ def process_file(options: AttrOptions, filename: str) -> dict:
 
     if options.tagged or options.lemmatized or options.parsed:
         tagged = [
-            [(word.text, word.upos) for word in sentence.words]
+            [
+                (word.text, upos_to_suc.get(word.upos, word.upos))
+                for word in sentence.words
+            ]
             for sentence in doc.sentences
         ]
         lemmas = [[word.lemma for word in sentence.words] for sentence in doc.sentences]
@@ -211,13 +280,14 @@
         parsed = [
             [
                 (
-                    word.id,
+                    str(word.id),
                     word.text,
                     word.lemma,
-                    word.upos,
-                    word.xpos,
-                    word.head,
-                    word.deprel,
+                    upos_to_suc.get(word.upos, word.upos),
+                    upos_to_suc.get(word.upos, word.upos),
+                    "|".join(upos_to_suc.get(word.xpos, word.xpos).split("|")[1:]),
+                    str(word.head) if word.head != 0 else "0",
+                    dep_rel_mapping.get(word.deprel, word.deprel),
                 )
                 for word in sentence.words
             ]
@@ -233,7 +303,16 @@ def process_file(options: AttrOptions, filename: str) -> dict:
     }
 
 
-def split_sentences_from_tagger(resp) -> list:
+def split_sentences_from_tagger(resp: str) -> list:
+    """Split the sentences from the tagger output.
+
+    Args:
+        resp (str): The tagger output.
+
+    Returns:
+        list: The sentences split from the tagger output.
+    """
+
     sentences = []
     sentence = []
     for line in resp.split("\n"):
@@ -251,7 +330,14 @@
 
 
 def split_measures_to_scream_headings(structural_instance) -> dict:
-    from math import isfinite
+    """Split the measures into SCREAM headings.
+
+    Args:
+        structural_instance: The structural instance.
+
+    Returns:
+        tuple: The SCREAM measures dict and a dict of additional metrics.
+    """
 
     calculated_metrics = vars(structural_instance)
     structural_vars = [
@@ -340,5 +426,21 @@
     return fixed_dict, extra_dict
 
 
+def time_checker(start_time: float, method: str) -> None:
+    """Print the time elapsed since the start time.
+
+    Args:
+        start_time (float): The start time.
+        method (str): The name of the pipeline stage being timed.
+    """
+    elapsed_time = time.time() - start_time
+    print(f"{method}, elapsed time: {elapsed_time}")
+
+
 if __name__ == "__main__":
     main("I skolan äter jag ett rött äpple.".encode("utf-8"))
+    # main(
+    #     "Det finns ett stort antal meningar som är onödigt långa, och vi behöver se till att dessa kan taggas godtyckligt. Denna, till synes, enkla uppgift är inte alltid så enkel.".encode(
+    #         "utf-8"
+    #     )
+    # )
diff --git a/scream/document_parts.py b/scream/document_parts.py
index 7ef4c4aba6f4de0e86df979c93e67609fab5e6b0..11e0548bcbff0f08c3f996b0bbbf47e6f14e7b19 100644
--- a/scream/document_parts.py
+++ b/scream/document_parts.py
@@ -3,6 +3,7 @@ import os
 from scream import conf
 from scream import helper_methods
 
+
 class FinalizeError(Exception):
     def __init__(self, *args):
         """
@@ -11,6 +12,7 @@ class FinalizeError(Exception):
         """
         super().__init__(*args)
 
+
 # Document parts
 class Sentence:
     def __init__(self):
@@ -51,7 +53,9 @@ class Sentence:
         :return: maximum tree depth
         """
         if not self.finalized:
-            raise FinalizeError("The sentence is not finalized, please finalize sentence before calculating the depth.")
+            raise FinalizeError(
+                "The sentence is not finalized, please finalize sentence before calculating the depth."
+            )
         return self.root.get_depth()
 
     def assign_root(self) -> None:
@@ -109,7 +113,8 @@ class Sentence:
         if len(self.verb_arities) == 0:
             if not self.finalized:
                 raise FinalizeError(
-                    "The sentence is not finalized, please finalize sentence before calculating the total verb arity.")
+                    "The sentence is not finalized, please finalize sentence before calculating the total verb arity."
+                )
             return self.root.calculate_verb_arities(self)
 
     def get_tokens(self):
@@ -117,12 +122,17 @@ class Sentence:
             return self.unigram_representation + [self.root]
         return self.unigram_representation
 
+
 class SwevocManager:
     def __init__(self):
-        """ A manager for SweVoc."""
+        """A manager for SweVoc."""
        self._swe_voc = dict()
         self._categories = set()
-        self._load_swe_voc(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file))
+        self._load_swe_voc(
+            os.path.join(
+                os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file
+            )
+        )
 
     def _load_swe_voc(self, path):
         """
@@ -139,7 +149,9 @@ class SwevocManager:
                 tag = conf.parole_to_suc_conversion[split_line[1]]
                 word = split_line[2]
 
-                categories = {category.strip() for category in split_line[3].split(",")}
+                categories = {
+                    category.strip() for category in split_line[3].split(",")
+                }
 
                 if word not in self._swe_voc.keys():
                     self._swe_voc[word] = dict()
@@ -176,8 +188,11 @@ class SwevocManager:
         except KeyError:
             return set()
 
+
 class Token:
-    def __init__(self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None):
+    def __init__(
+        self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None
+    ):
         """
         A class representing a token. Mainly used for the tree based syntactic
         representation of sentences used in the Sentence class.
@@ -200,10 +215,10 @@ class Token:
 
     def __str__(self) -> str:
         print_list = [
-            "Lemma: " + self.lemma,
-            "PoS tag: " + self.pos_tag,
-            "Dependency relation: " + self.dep_rel,
-            "Ref: " + str(self.ref)
+            f"Lemma: {self.lemma}",
+            f"PoS tag: {self.pos_tag}",
+            f"Dependency relation: {self.dep_rel}",
+            f"Ref: {self.ref}",
         ]
         return "\n".join(print_list)
 
@@ -259,6 +274,7 @@ class Token:
             return 0
         return tmp
 
+
 # Documents
 class PosTaggedDocument:
     def __init__(self):
@@ -320,25 +336,29 @@ class PosTaggedDocument:
         """
         for sentence in pos_tagged_sentences:
             sentence_object = Sentence()
-            #print("BUILD_DOC", sentence)
+            # print("BUILD_DOC", sentence)
             for token in sentence:
-                #print(token)
+                # print(token)
                 token_string = token[1]
-                pos_tag = token[4].split('|')[0]
+                pos_tag = token[4].split("|")[0]
                 try:
                     lemma = token[2]
                     dep_rel = token[7]
                     ref = int(token[0])
                     dep_head_ref = int(token[6])
                 except IndexError:
-                    lemma = token[2] #None #TODO implementera lemmaization eller ta bort lemmastatistiken från postaggeddocument
+                    lemma = token[
+                        2
+                    ]  # None  # TODO: implement lemmatization or drop the lemma statistics from PosTaggedDocument
                     dep_rel = None
                     ref = None
                     dep_head_ref = None
                 except ValueError:
-                    print(token[6], '\n', sentence)
+                    print(token[6], "\n", sentence)
 
-                token_object = self.create_token(token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref)
+                token_object = self.create_token(
+                    token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref
+                )
                 sentence_object.add_token(token_object)
 
                 self.add_token_statistics(token_object)
@@ -383,15 +403,14 @@ class PosTaggedDocument:
         if token.token not in self.word_dict.keys():
             self.word_dict[token.token] = 0
             self.n_unique_words += 1
-            
-        
+
         if token.token.lower() not in self.lower_token_dict.keys():
             self.lower_token_dict[token.token.lower()] = 0
 
         if token.lemma not in self.lemma_dict.keys():
             self.lemma_dict[token.lemma] = 0
             self.n_unique_lemma += 1
-            
+
         self.word_dict[token.token] += 1
         self.lemma_dict[token.lemma] += 1
         self.n_words += 1
@@ -399,7 +418,7 @@ class PosTaggedDocument:
         self.total_word_length += len(token.lemma)
         self.n_syllables += token.syllables
 
-        #if len(token.lemma) > conf.lix_limit: #changed tthis to token.token
+        # if len(token.lemma) > conf.lix_limit:  # changed this to token.token
         if len(token.token) > conf.lix_limit:
             self.n_lix_long_words += 1
@@ -450,6 +469,7 @@ class PosTaggedDocument:
             self.n_content_words -= 1
             self.n_verbs -= 1
 
+
 class DependencyParsedDocument(PosTaggedDocument):
     def __init__(self):
         PosTaggedDocument.__init__(self)
@@ -521,30 +541,32 @@
         """
         PosTaggedDocument.add_sentence_statistics(self, sentence)
 
-        #print('================')
-        #print(sentence)
-        #print('================')
+        # print('================')
+        # print(sentence)
+        # print('================')
 
         sentence.finalize()
         self.total_sentence_depth += sentence.get_depth()
         self.total_verb_arity += sentence.get_total_verb_arity()
-        #print(self.total_verb_arity)
-        #print(f"sentence verb arities{sentence.verb_arities}")
-        for arity, number in zip(sentence.verb_arities.keys(), sentence.verb_arities.values()):
-            
+        # print(self.total_verb_arity)
+        # print(f"sentence verb arities{sentence.verb_arities}")
+        for arity, number in zip(
+            sentence.verb_arities.keys(), sentence.verb_arities.values()
+        ):
+
             # HAR KOMMENTERAT UT VERB ARITIES TILLS VIDARE PGA FUNKAR INTE MEN VET EJ VARFÖR
-            #Funkar nu?
-            
+            # Works now?
+
             if int(arity) >= len(conf.verb_arities):
                 self.verb_arities_dict[9] += 1
-            else: 
+            else:
                 self.verb_arities_dict[arity] += 1
 
         if sentence.has_verbial_root():
             self.n_verbal_roots += 1
 
-    def add_dep_statistics(self, token) -> None:
+    def add_dep_statistics(self, token: Token) -> None:
         """
-        Incremebts the following attributes:
+        Increments the following attributes:
         The amount of dependency tags
         The amount of dependencies
         The amount of right dependenceis (given positive relation distance)
@@ -555,16 +577,21 @@ class DependencyParsedDocument(PosTaggedDocument):
         The amount of post modifiers (if the relation equals the predifined relation)
         The amount of pre modifiers (if the relation equals the predifined relation)
         The amount of preposition compositions (if the relations equals the predifined relation)
-        :param token: Token
+        Args:
+            token: Token
         """
-        dep_distance = token.ref - token.dep_head_ref # TODO: var vaksam här, kan bli fel. kolla extractor: handleDependency
+
+        # print(token)
+        dep_distance = (
+            token.ref - token.dep_head_ref
+        )  # TODO: be careful here, this can go wrong; check extractor: handleDependency
         self.n_dep_tags += 1
         self.n_dependencies += 1
-        #print("HITME!")
-        #print("t d l", token.dep_rel)
+        # print("HITME!")
+        # print("t d l", token.dep_rel)
         if token.dep_rel in conf.dep_types:
             self.dep_type_dict[token.dep_rel] += 1
-            #print("HITME!")
+            # print("HITME!")
         if token.dep_rel == conf.subclause_dep:
             self.n_sub_clauses += 1
         if dep_distance > 0:
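
The tag conversion added in pipeline.py leans on dict.get(tag, tag) so that any UPOS or dependency label without a SUC/efselab counterpart passes through unchanged instead of raising a KeyError. A minimal standalone sketch of that behavior (the SimpleNamespace stands in for a Stanza Word object, and the sample values are hypothetical):

    from types import SimpleNamespace

    # Excerpts of the tables defined in pipeline.py
    upos_to_suc = {"NOUN": "NN", "VERB": "VB", "ADJ": "JJ"}
    dep_rel_mapping = {"nsubj": "SS", "obj": "OO", "root": "ROOT"}

    # Stand-in for a stanza Word object (hypothetical values)
    word = SimpleNamespace(id=3, text="äpple", upos="NOUN", deprel="obj", head=5)

    print(upos_to_suc.get(word.upos, word.upos))          # "NN": known tag is converted
    print(upos_to_suc.get("PROPN", "PROPN"))              # "PROPN": unmapped tag passes through
    print(dep_rel_mapping.get(word.deprel, word.deprel))  # "OO"
    print(str(word.head) if word.head != 0 else "0")      # "5": head index as a string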
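
One design note on the new time_checker helper: time.time() reports wall-clock time, which can jump if the system clock is adjusted mid-run. For measuring stage durations, time.perf_counter() is the usual monotonic alternative; a drop-in variant of the helper (a sketch, not part of the patch) could look like this:

    import time

    def time_checker(start_time: float, method: str) -> None:
        # Same printout as the helper in the patch, but measured against the
        # monotonic performance counter; pass time.perf_counter() as start_time.
        elapsed_time = time.perf_counter() - start_time
        print(f"{method}, elapsed time: {elapsed_time:.3f}s")

    start_time = time.perf_counter()
    sum(x * x for x in range(1_000_000))  # stand-in for a stage such as Stanza tagging
    time_checker(start_time, "Stanza")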