Commit 7fe740cf: SCREAM operating with Stanza-output
Authored 1 year ago by Love Arreborn
Parent: 1a530f5a
No related tags found. No related merge requests found.
Pipeline #131688: skipped (stages: push, deploy)

Showing 2 changed files with 189 additions and 60 deletions:
pipeline.py: 125 additions, 23 deletions
scream/document_parts.py: 64 additions, 37 deletions
pipeline.py: +125 −23
@@ -19,6 +19,8 @@ Aaron Smith <aaron.smith@lingfil.uu.se>
 import pprint
 import time
+import stanza
+from math import isfinite

 # Import Scream
 from scream.metrics import LexicalMetrics
@@ -47,14 +49,51 @@ from scream.document_parts import DependencyParsedDocument
 MAX_TOKEN = 256

-nlp = stanza.Pipeline(lang="sv", processors="tokenize,pos,lemma,depparse,ner")
+nlp = stanza.Pipeline(
+    lang="sv",
+    processors="tokenize,pos,lemma,depparse,ner",
+    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
+)
+
+# Simple table for conversion from Stanza's UPOS tags to SUC tags
+upos_to_suc = {
+    "NOUN": "NN",
+    "VERB": "VB",
+    "ADJ": "JJ",
+    "ADV": "AB",
+    "PRON": "PN",
+    "DET": "DT",
+    "ADP": "PP",
+    "CONJ": "KN",
+    "NUM": "RG",
+    "PART": "PC",
+    "INTJ": "IE",
+    "PUNCT": "MAD",
+    "X": "XX",
+    "SYM": "MAD",
+    "SCONJ": "SN",
+}
+
+# Likewise, the dependency relation tags need to be mapped to match efselabs output
+dep_rel_mapping = {
+    "case": "RA",
+    "obl": "PA",
+    "root": "ROOT",
+    "nsubj": "SS",
+    "det": "DT",
+    "amod": "AT",
+    "obj": "OO",
+    "punct": "IP",
+}
+
+
 class AttrOptions:
     """
     A class to hold the options for the pipeline. Used in favor of a dict
-    to convert the keys to attributes. NOTE: Might be unnecessary.
+    to convert the keys to attributes. NOTE: Might be unnecessary, could likely
+    just be a dict.
     """

-    def __init__(self, d):
+    def __init__(self, d: dict):
         self.__dict__ = d
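For reference, a minimal sketch of how the two lookup tables behave; because .get() falls back to the tag itself, anything missing from a table passes through unchanged. The namedtuple below is a stand-in for a Stanza word object and the example values are invented; the sketch assumes upos_to_suc and dep_rel_mapping from the hunk above are in scope.

from collections import namedtuple

# Hypothetical stand-in for a Stanza Word, limited to the fields the mapping uses.
Word = namedtuple("Word", ["text", "upos", "deprel"])

w1 = Word(text="äpple", upos="NOUN", deprel="obj")
w2 = Word(text="hej", upos="INTJ", deprel="discourse")  # "discourse" is not in dep_rel_mapping

print(upos_to_suc.get(w1.upos, w1.upos), dep_rel_mapping.get(w1.deprel, w1.deprel))  # NN OO
print(upos_to_suc.get(w2.upos, w2.upos), dep_rel_mapping.get(w2.deprel, w2.deprel))  # IE discourse (unmapped tag falls through)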
@@ -99,47 +138,64 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
         list: A list of dictionaries containing the processed data.
     """
-    start_time = time.time()
+    pipeline_start_time = time.time()
     result = []
     for filename in args:
+        # ================== STANZA-TAGGING ==================
+        start_time = time.time()
         proc = process_file(options, filename)
+        time_checker(start_time, "Stanza")
+
+        # ================== SCREAM ==================
+        start_time = time.time()
         depdoc = DependencyParsedDocument()
-        # depdoc.build_document(
-        #     split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
-        # )  # proc[parsed] is the output from EFSELAB
-        # scream, additional_metrics = split_measures_to_scream_headings(
-        #     StructuralMetrics(depdoc)
-        # )
+        depdoc.build_document(
+            split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
+        )  # proc[parsed] is the output from EFSELAB
+        scream, additional_metrics = split_measures_to_scream_headings(
+            StructuralMetrics(depdoc)
+        )
+        time_checker(start_time, "SCREAM")
+
+        print(f"FILENAME: {filename}")

         # ================== STILETT ==================
         # start_time = time.time()
         # simplified_text, sentence_transformations = ud_text_simplifier.sapis_wrapper(
         #     filename
         # )
         # time_checker(start_time, "Stilett")
         # print(simplified_text, sentence_transformations)
         # print(f"proc {proc}")
         # print(f"PARAGRAPHS {proc['paragraphs']}" )
-        elapsed_time1 = time.time() - start_time

         # ================== COH-METRIX ==================
         # start_time = time.time()
         # coh_metrix_cohesion = cohesion.run_cohesion(proc['parsed'])
         # coh_metrix_lsa = lsa.run_LSA(proc['parsed'], proc['paragraphs'])
         # coh_metrix_connectives = connectives.run_connectives(proc['parsed'])
         # synonym_dict = synonyms.run_synonyms(proc['parsed'])
         # coh_metrix_cohesion = {"cohesion": coh_metrix_cohesion}
         # coh_metrix_connectives = {"connectives": coh_metrix_connectives}
         # coh_metrix_lsa = {"LSA" : coh_metrix_lsa}
         # coh_metrix = {**coh_metrix_cohesion, **coh_metrix_connectives, **coh_metrix_lsa }
         # coh_metrix = {}
         # time_checker(start_time, "Coh-Metrix")

         # ================== SYNONYMS ==================
         # start_time = time.time()
         # synonym_dict = synonyms.run_synonyms(proc['parsed'])
         # time_checker(start_time, "Synonyms")

         result.append(
             {
                 "input": filename,
                 "efselab": proc,
-                # "scream": scream,
-                # "additional_metrics": additional_metrics,
+                "scream": scream,
+                "additional_metrics": additional_metrics,
                 # "stillett": {
                 #     "simplified_text": simplified_text,
                 #     "sentence_transformations": sentence_transformations,
@@ -157,11 +213,21 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
     #         print(d['coh-metrix'])
     # print(l[-2])

+    time_checker(pipeline_start_time, "Pipeline")
     pprint.pp(result)
     return result


-def prep_parsed_for_build_doc(parsed):
+def prep_parsed_for_build_doc(parsed: list) -> str:
+    """
+    Prepare parsed data for building a DependencyParsedDocument. Required for SCREAM.
+
+    Args:
+        parsed (list): The parsed data.
+
+    Returns:
+        list: Data prepared for building a DependencyParsedDocument.
+    """
     return "\n".join(
         ["\t".join(map(str, word_info))
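The tail of this return expression is collapsed in the diff, but the visible part already shows the idea: each word's fields are tab-joined and the resulting lines newline-joined, producing the CoNLL-like block that split_sentences_from_tagger later re-splits. A minimal hedged illustration for a single sentence; the field values are invented and the exact handling of sentence boundaries is not visible in the hunk.

# Invented word rows, in the field order that process_file (further down) produces.
sentence = [
    ("1", "Jag", "jag", "PN", "PN", "UTR|SIN|DEF|SUB", "2", "SS"),
    ("2", "läser", "läsa", "VB", "VB", "PRS|AKT", "0", "ROOT"),
]

block = "\n".join("\t".join(map(str, word_info)) for word_info in sentence)
print(block)  # prints two tab-separated lines, one per word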
@@ -196,7 +262,10 @@ def process_file(options: AttrOptions, filename: str) -> dict:
     if options.tagged or options.lemmatized or options.parsed:
         tagged = [
-            [(word.text, word.upos) for word in sentence.words]
+            [
+                (word.text, upos_to_suc.get(word.upos, word.upos))
+                for word in sentence.words
+            ]
             for sentence in doc.sentences
         ]
         lemmas = [[word.lemma for word in sentence.words] for sentence in doc.sentences]
@@ -211,13 +280,14 @@ def process_file(options: AttrOptions, filename: str) -> dict:
         parsed = [
             [
                 (
-                    word.id,
+                    str(word.id),
                     word.text,
                     word.lemma,
-                    word.upos,
-                    word.xpos,
-                    word.head,
-                    word.deprel,
+                    upos_to_suc.get(word.upos, word.upos),
+                    upos_to_suc.get(word.upos, word.upos),
+                    "|".join(upos_to_suc.get(word.xpos, word.xpos).split("|")[1:]),
+                    str(word.head) if word.head != 0 else "0",
+                    dep_rel_mapping.get(word.deprel, word.deprel),
                 )
                 for word in sentence.words
             ]
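To make the conversion concrete, here is a hedged sketch of the row this comprehension builds for one Stanza word. The SimpleNamespace stands in for stanza's Word object and all field values (including the xpos string) are invented for illustration; the sketch assumes upos_to_suc and dep_rel_mapping from the earlier hunk are in scope. Note that the same SUC tag is written into two adjacent fields, apparently filling both the coarse and fine PoS columns of the efselab-style row.

from types import SimpleNamespace

# Invented example word, shaped like a Stanza Word with the attributes used above.
word = SimpleNamespace(
    id=3,
    text="äpple",
    lemma="äpple",
    upos="NOUN",
    xpos="NN|NEU|SIN|IND|NOM",
    head=2,
    deprel="obj",
)

row = (
    str(word.id),
    word.text,
    word.lemma,
    upos_to_suc.get(word.upos, word.upos),                            # "NN"
    upos_to_suc.get(word.upos, word.upos),                            # "NN"
    "|".join(upos_to_suc.get(word.xpos, word.xpos).split("|")[1:]),   # "NEU|SIN|IND|NOM" (features after the first field)
    str(word.head) if word.head != 0 else "0",                        # "2"
    dep_rel_mapping.get(word.deprel, word.deprel),                    # "OO"
)
print("\t".join(row))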
@@ -233,7 +303,16 @@ def process_file(options: AttrOptions, filename: str) -> dict:
     }


-def split_sentences_from_tagger(resp) -> list:
+def split_sentences_from_tagger(resp: str) -> list:
+    """
+    Split the sentences from the tagger output.
+
+    Args:
+        resp (str): The tagger output.
+
+    Returns:
+        list: The sentences split from the tagger output.
+    """
     sentences = []
     sentence = []
     for line in resp.split("\n"):
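The loop body is collapsed in the diff. Given that sentences and sentence are both accumulated and the input is the newline-joined block from prep_parsed_for_build_doc, a plausible reading is the usual "group lines until a blank line" pattern. The sketch below is an assumption about that pattern, not the commit's actual body, and uses a different name to make that clear.

def split_sentences_sketch(resp: str) -> list:
    """Group tab-separated word lines into sentences, starting a new sentence at blank lines."""
    sentences, sentence = [], []
    for line in resp.split("\n"):
        if not line.strip():          # blank line = sentence boundary
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue
        sentence.append(line.split("\t"))
    if sentence:                      # flush the final sentence
        sentences.append(sentence)
    return sentences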
@@ -251,7 +330,14 @@ def split_sentences_from_tagger(resp) -> list:
 def split_measures_to_scream_headings(structural_instance) -> dict:
-    from math import isfinite
+    """
+    Split the measures to SCREAM headings.
+
+    Args:
+        structural_instance: The structural instance.
+
+    Returns:
+        dict: The measures split to SCREAM headings.
+    """
     calculated_metrics = vars(structural_instance)
     structural_vars = [
@@ -340,5 +426,21 @@ def split_measures_to_scream_headings(structural_instance) -> dict:
     return fixed_dict, extra_dict


+def time_checker(start_time: float, method: str) -> None:
+    """
+    Check the time elapsed since the start time.
+
+    Args:
+        start_time (float): The start time.
+        method (str): The method being run.
+    """
+    elapsed_time = time.time() - start_time
+    print(f"{method}, elapsed time: {elapsed_time}")
+
+
 if __name__ == "__main__":
     main("I skolan äter jag ett rött äpple.".encode("utf-8"))
+    # main(
+    #     "Det finns ett stort antal meningar som är onödigt långa, och vi behöver se till att dessa kan taggas godtyckligt. Denna, till synes, enkla uppgift är inte alltid så enkel.".encode(
+    #         "utf-8"
+    #     )
+    # )
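run_pipeline above shows the intended usage of the new helper: reset start_time before each stage and report it afterwards. A minimal hedged sketch of that pattern follows, with the actual Stanza/SCREAM work replaced by sleeps; time_checker is repeated here only so the snippet runs on its own.

import time


def time_checker(start_time: float, method: str) -> None:
    # Same helper as added above: prints the elapsed wall-clock time for a stage.
    elapsed_time = time.time() - start_time
    print(f"{method}, elapsed time: {elapsed_time}")


start_time = time.time()
time.sleep(0.2)                     # placeholder for process_file(options, filename)
time_checker(start_time, "Stanza")

start_time = time.time()            # the clock is reset per stage, as in run_pipeline
time.sleep(0.1)                     # placeholder for the SCREAM metrics
time_checker(start_time, "SCREAM")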
scream/document_parts.py: +64 −37
@@ -3,6 +3,7 @@ import os
 from scream import conf
 from scream import helper_methods


 class FinalizeError(Exception):
     def __init__(self, *args):
         """
@@ -11,6 +12,7 @@ class FinalizeError(Exception):
         """
         super().__init__(*args)


+# Document parts
 class Sentence:
     def __init__(self):
@@ -51,7 +53,9 @@ class Sentence:
         :return: maximum tree depth
         """
         if not self.finalized:
-            raise FinalizeError("The sentence is not finalized, please finalize sentence before calculating the depth.")
+            raise FinalizeError(
+                "The sentence is not finalized, please finalize sentence before calculating the depth."
+            )
         return self.root.get_depth()

     def assign_root(self) -> None:
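These checks enforce a finalize-before-measure contract. A small hedged illustration of the failure path, assuming the method shown above is Sentence.get_depth (its return of self.root.get_depth() and the call in add_sentence_statistics further down suggest so) and that a freshly constructed Sentence starts with finalized set to False:

from scream.document_parts import FinalizeError, Sentence

sentence = Sentence()
try:
    sentence.get_depth()            # not finalized yet, so this should raise
except FinalizeError as err:
    print(f"Expected error: {err}")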
@@ -109,7 +113,8 @@ class Sentence:
         if len(self.verb_arities) == 0:
             if not self.finalized:
                 raise FinalizeError(
-                    "The sentence is not finalized, please finalize sentence before calculating the total verb arity.")
+                    "The sentence is not finalized, please finalize sentence before calculating the total verb arity."
+                )
             return self.root.calculate_verb_arities(self)

     def get_tokens(self):
@@ -117,12 +122,17 @@ class Sentence:
             return self.unigram_representation + [self.root]
         return self.unigram_representation


 class SwevocManager:
     def __init__(self):
         """
         A manager for SweVoc.
         """
         self._swe_voc = dict()
         self._categories = set()
-        self._load_swe_voc(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file))
+        self._load_swe_voc(
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file)
+        )

     def _load_swe_voc(self, path):
         """
@@ -139,7 +149,9 @@ class SwevocManager:
             tag = conf.parole_to_suc_conversion[split_line[1]]
             word = split_line[2]
-            categories = {category.strip() for category in split_line[3].split(",")}
+            categories = {
+                category.strip() for category in split_line[3].split(",")
+            }
             if word not in self._swe_voc.keys():
                 self._swe_voc[word] = dict()
@@ -176,8 +188,11 @@ class SwevocManager:
         except KeyError:
             return set()


 class Token:
-    def __init__(self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None):
+    def __init__(
+        self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None
+    ):
         """
         A class representing a token. Mainly used for the tree based syntactic
         representation of sentences used in the Sentence class.
@@ -200,10 +215,10 @@ class Token:
     def __str__(self) -> str:
         print_list = [
-            "Lemma: " + self.lemma,
-            "PoS tag: " + self.pos_tag,
-            "Dependency relation: " + self.dep_rel,
-            "Ref: " + str(self.ref)
+            f"Lemma: {self.lemma}",
+            f"PoS tag: {self.pos_tag}",
+            f"Dependency relation: {self.dep_rel}",
+            f"Ref: {self.ref}",
         ]
         return "\n".join(print_list)
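One practical effect of the switch to f-strings: concatenation raises a TypeError when lemma or dep_rel is None (the fallback values set in build_document's IndexError branch below), while f-strings simply render "None". A quick hedged illustration, assuming __init__ stores the pos argument under pos_tag as __str__ implies; the example values are invented.

from scream.document_parts import Token

token = Token("äpple", "NN", lemma="äpple", dep_rel="OO", ref=3, dep_head_ref=2)
print(token)
# Lemma: äpple
# PoS tag: NN
# Dependency relation: OO
# Ref: 3

print(Token("hej", "IN"))  # lemma/dep_rel/ref default to None; f-strings print "None" instead of raising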
@@ -259,6 +274,7 @@ class Token:
             return 0
         return tmp


+# Documents
 class PosTaggedDocument:
     def __init__(self):
@@ -320,25 +336,29 @@ class PosTaggedDocument:
         """
         for sentence in pos_tagged_sentences:
             sentence_object = Sentence()
-            #print("BUILD_DOC", sentence)
+            # print("BUILD_DOC", sentence)
             for token in sentence:
-                #print(token)
+                # print(token)
                 token_string = token[1]
-                pos_tag = token[4].split('|')[0]
+                pos_tag = token[4].split("|")[0]
                 try:
                     lemma = token[2]
                     dep_rel = token[7]
                     ref = int(token[0])
                     dep_head_ref = int(token[6])
                 except IndexError:
-                    lemma = token[2] #None #TODO implement lemmatization or remove the lemma statistics from postaggeddocument
+                    lemma = token[2]  # None #TODO implement lemmatization or remove the lemma statistics from postaggeddocument
                     dep_rel = None
                     ref = None
                     dep_head_ref = None
                 except ValueError:
-                    print(token[6], '\n', sentence)
+                    print(token[6], "\n", sentence)
-                token_object = self.create_token(token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref)
+                token_object = self.create_token(
+                    token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref
+                )
                 sentence_object.add_token(token_object)
                 self.add_token_statistics(token_object)
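The indices used here line up with the rows pipeline.py now emits via prep_parsed_for_build_doc: token[0] is the word id, token[1] the surface form, token[2] the lemma, token[4] the PoS tag (split on "|" in case features are attached), token[6] the dependency head and token[7] the relation. A short sketch with an invented row; only the index layout comes from the two files, the values are made up.

# Invented row, in the field order that process_file in pipeline.py produces.
token = ["3", "äpple", "äpple", "NN", "NN", "NEU|SIN|IND|NOM", "2", "OO"]

token_string = token[1]              # "äpple"
pos_tag = token[4].split("|")[0]     # "NN"
lemma = token[2]                     # "äpple"
dep_rel = token[7]                   # "OO"
ref = int(token[0])                  # 3
dep_head_ref = int(token[6])         # 2
print(token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref)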
@@ -383,15 +403,14 @@ class PosTaggedDocument:
         if token.token not in self.word_dict.keys():
             self.word_dict[token.token] = 0
             self.n_unique_words += 1
         if token.token.lower() not in self.lower_token_dict.keys():
             self.lower_token_dict[token.token.lower()] = 0
         if token.lemma not in self.lemma_dict.keys():
             self.lemma_dict[token.lemma] = 0
             self.n_unique_lemma += 1
         self.word_dict[token.token] += 1
         self.lemma_dict[token.lemma] += 1
         self.n_words += 1
@@ -399,7 +418,7 @@ class PosTaggedDocument:
         self.total_word_length += len(token.lemma)
         self.n_syllables += token.syllables
-        #if len(token.lemma) > conf.lix_limit: #changed this to token.token
+        # if len(token.lemma) > conf.lix_limit: #changed this to token.token
         if len(token.token) > conf.lix_limit:
             self.n_lix_long_words += 1
@@ -450,6 +469,7 @@ class PosTaggedDocument:
             self.n_content_words -= 1
             self.n_verbs -= 1


 class DependencyParsedDocument(PosTaggedDocument):
     def __init__(self):
         PosTaggedDocument.__init__(self)
@@ -521,30 +541,32 @@ class DependencyParsedDocument(PosTaggedDocument):
         """
         PosTaggedDocument.add_sentence_statistics(self, sentence)
-        #print('================')
-        #print(sentence)
-        #print('================')
+        # print('================')
+        # print(sentence)
+        # print('================')
         sentence.finalize()
         self.total_sentence_depth += sentence.get_depth()
         self.total_verb_arity += sentence.get_total_verb_arity()
-        #print(self.total_verb_arity)
-        #print(f"sentence verb arities{sentence.verb_arities}")
-        for arity, number in zip(sentence.verb_arities.keys(), sentence.verb_arities.values()):
+        # print(self.total_verb_arity)
+        # print(f"sentence verb arities{sentence.verb_arities}")
+        for arity, number in zip(
+            sentence.verb_arities.keys(), sentence.verb_arities.values()
+        ):
             # HAVE COMMENTED OUT VERB ARITIES FOR NOW BECAUSE IT DOES NOT WORK, BUT I DON'T KNOW WHY
-            #Works now?
+            # Works now?
             if int(arity) >= len(conf.verb_arities):
                 self.verb_arities_dict[9] += 1
             else:
                 self.verb_arities_dict[arity] += 1
         if sentence.has_verbial_root():
             self.n_verbal_roots += 1

-    def add_dep_statistics(self, token) -> None:
+    def add_dep_statistics(self, token: Token) -> None:
         """
-        Incremebts the following attributes:
+        Increments the following attributes:
         The amount of dependency tags
         The amount of dependencies
         The amount of right dependencies (given positive relation distance)
@@ -555,16 +577,21 @@ class DependencyParsedDocument(PosTaggedDocument):
         The amount of post modifiers (if the relation equals the predefined relation)
         The amount of pre modifiers (if the relation equals the predefined relation)
         The amount of preposition compositions (if the relation equals the predefined relation)
-        :param token: Token
+
+        Args:
+            token: Token
         """
-        dep_distance = token.ref - token.dep_head_ref  # TODO: be careful here, this can go wrong. Check extractor: handleDependency
+        print(token)
+        dep_distance = (
+            token.ref - token.dep_head_ref
+        )  # TODO: be careful here, this can go wrong. Check extractor: handleDependency
         self.n_dep_tags += 1
         self.n_dependencies += 1
-        #print("HITME!")
-        #print("t d l", token.dep_rel)
+        # print("HITME!")
+        # print("t d l", token.dep_rel)
         if token.dep_rel in conf.dep_types:
             self.dep_type_dict[token.dep_rel] += 1
-            #print("HITME!")
+            # print("HITME!")
         if token.dep_rel == conf.subclause_dep:
             self.n_sub_clauses += 1
         if dep_distance > 0:
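dep_distance is the signed offset between a token and its head, so a positive value marks a right dependency, which is what the docstring's "given positive relation distance" refers to. A small hedged illustration using the Token constructor shown earlier; the field values are invented.

from scream.document_parts import Token

# "äpple" at position 3 attaches to a head at position 2: positive distance, i.e. a right dependency.
token = Token("äpple", "NN", lemma="äpple", dep_rel="OO", ref=3, dep_head_ref=2)

dep_distance = token.ref - token.dep_head_ref
print(dep_distance, "right dependency" if dep_distance > 0 else "left dependency")  # 1 right dependency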