From f12f55e6cd8b14f58287316bbf94ca77e2239030 Mon Sep 17 00:00:00 2001
From: Love Arreborn <love.arreborn@liu.se>
Date: Thu, 18 Jul 2024 11:30:06 +0200
Subject: [PATCH] adding finished cohesion, needs verification

---
 .DS_Store                        | Bin 8196 -> 8196 bytes
 .gitignore                       |   1 +
 coh_metrix_new/coh_metrix.py     |  73 ++++-----
 coh_metrix_new/cohesion.py       | 260 +++++++++++++++++++++++++++++++
 coh_metrix_new/configuration.py  |   8 +
 coh_metrix_new/helpers.py        |   5 +-
 coh_metrix_new/lsa.py            | 170 +++++++++-----
 coh_metrix_new/run_coh_metrix.py |  17 --
 latt.txt                         |   9 ++
 lsa.txt                          |   8 +-
 pipeline.py                      | 111 +++---------
 scream2/scream2.py               |  37 ++++-
 testset_results.json             |   2 -
 13 files changed, 440 insertions(+), 261 deletions(-)
 create mode 100644 coh_metrix_new/cohesion.py
 delete mode 100644 coh_metrix_new/run_coh_metrix.py
 create mode 100644 latt.txt
 delete mode 100644 testset_results.json

diff --git a/.DS_Store b/.DS_Store
index c356708b1daaf1932a9d38108d01ed0ec9751a85..7dee366b0623845d59fa3942e69f1c878cff591d 100644
GIT binary patch
delta 32
ocmZp1XmOa}&nUGqU^hRb)Mg%mO>7&>JQ+8$OKfM`SmwqE0I&oKUH||9

delta 150
zcmZp1XmOa}&nUeyU^hRb^kyD`O>CAT3?&Sy48;s348;to3?&Tl3`JmmDMJoeOphUp
op_n0`A<r`>KRGEUKZ${XL4bjQ*?IGNF#*QS>=N54GlQ2A0J5$k>;M1&

diff --git a/.gitignore b/.gitignore
index c3771b20..0ee8ecb2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ scream/__pycache__
 stilett/__pycache__
 synonyms/__pycache__
 */__pycache__
+testset_results.json
\ No newline at end of file
diff --git a/coh_metrix_new/coh_metrix.py b/coh_metrix_new/coh_metrix.py
index 58c556dc..ae17fe49 100644
--- a/coh_metrix_new/coh_metrix.py
+++ b/coh_metrix_new/coh_metrix.py
@@ -1,60 +1,41 @@
-import spacy
+"""
+This module adds Coh-Metrix to the spaCy pipeline.
+The individual metrics can be found in their own files:
+- cohesion.py
+- lsa.py
+- the data for givenness is collected while SCREAM runs, so this module
+  only performs the final division.
+"""
+
+from spacy.tokens import Doc
 from spacy.language import Language
 
-import coh_metrix_new.configuration as config
-import coh_metrix_new.helpers as helpers
+from coh_metrix_new.cohesion import run_cohesion
+from coh_metrix_new.lsa import run_lsa
+from coh_metrix_new.helpers import calculate_safe_division
 
-import numpy as np
 
 @Language.component("coh_metrix")
 def run_coh_metrix(docs):
-    global_lsa = None
-    adjacent_lsa = None
-
-    paragraphs = []
-
-    start = 0
-
-    # create spans for each paragraph
-    for i, token in enumerate(docs):
-        if token.pos_ == "SPACE":
-            paragraphs.append(docs[start:i])
-            start = i + 1
-
-    # only add a span if we've encountered a whitespace
-    if start != 0:
-        paragraphs.append(docs[start:])
-
-    coh_metrix = {
-        "givenness": (
-            docs.user_data["coh_metrix_helpers"]["repeated_lemmas"] +
-            docs.user_data["coh_metrix_helpers"]["pronouns"]
-        ) / docs.user_data["coh_metrix_helpers"]["total_lemmas"],
-        "global_lsa": global_lsa,
-        "adjacent_lsa": adjacent_lsa
-    }
-    docs.user_data["coh_metrix"] = coh_metrix
+    # Guard the registration: set_extension raises if the extension already
+    # exists, so this keeps the component safe to run on more than one doc.
+    if not Doc.has_extension("coh_metrix"):
+        Doc.set_extension("coh_metrix", default=None)
+    docs._.coh_metrix = {
+        "cohesion": run_cohesion(docs),
+        "lsa": run_lsa(docs),
+        "givenness": calculate_safe_division(
+            (
+                docs._.coh_metrix_helpers["repeated_lemmas"]
+                + docs._.coh_metrix_helpers["pronouns"]
+            ),
+            docs._.coh_metrix_helpers["total_lemmas"],
+        ),
+    }
     return docs
 
+
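+# Illustrative only -- the rough shape of the dict the component above
+# stores on the doc (the keys are real, the values here are made up):
+#
+#   doc._.coh_metrix == {
+#       "cohesion": {"adjacent": {...}, "global": {...}},
+#       "lsa": {"global_lsa": {...}, "adjacent_lsa": {...}},
+#       "givenness": 0.45,
+#   }
+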
""" - - # Construction via add_pipe with custom config - - config = { - "model": { - "@architectures": "spacy-transformers.TransformerModel.v3", - "name": "KBLab/sentence-bert-swedish-cased", - "tokenizer_config": {"use_fast": True}, - "transformer_config": {"output_attentions": True}, - "mixed_precision": True, - "grad_scaler_config": {"init_scale": 32768} - } - } - - nlp.add_pipe("transformer", config=config, last=True) - nlp.add_pipe("coh_metrix", last=True) \ No newline at end of file + nlp.add_pipe("coh_metrix", last=True) diff --git a/coh_metrix_new/cohesion.py b/coh_metrix_new/cohesion.py new file mode 100644 index 00000000..b974b8c6 --- /dev/null +++ b/coh_metrix_new/cohesion.py @@ -0,0 +1,260 @@ +""" +This module calculates cohesion metrics for a given text. +The metrics are divided into adjacent and global cohesion, where global +cohesion is only calculated if the amount of sentences is below the threshold +MAX_SENTENCES, as defined in coh_metrics/configuration.py. + +These metrics need to be verified in full, especially anaphor overlap. +Furthermore, training a new model to include transformer embeddings as well as +coreference resolution would be beneficial. +""" + +import numpy as np +from spacy.tokens import Token, Doc +from coh_metrix_new import configuration as config + + +def calculate_overlaps(token1: Token, token2: Token, main: dict, other: dict): + """ + A helper function to minimize code duplication in the cohesion calculation. + Can be used for both adjacent and global cohesion. + :param token1: the first token. + :param token2: the second token. + :param main: the main dictionary to update. + :param other: the other dictionary to update. + """ + + # Noun overlap + if ( + token1.text.lower() == token2.text.lower() + and token1.pos_ == token2.pos_ + and token1.pos_ in ["NOUN", "PROPN"] + ): + main["nouns"] += 1 + + # Argument overlap + if ( + token1.text == token2.text and token1.pos_ == "PRON" and token2.pos_ == "PRON" + ) or ( + token1.lemma_ == token2.lemma_ + and token1.pos_ == "NOUN" + and token2.pos_ == "NOUN" + ): + main["arguments"] += 1 + + # Stem overlap + if ( + token1.lemma_ == token2.lemma_ + and token1.pos_ in ["NOUN", "VERB", "ADJ", "ADV"] + and token2.pos_ == "NOUN" + ): + main["stems"] += 1 + + # Content word overlap + + if token1.lemma_ == token2.lemma_: + if token1.pos_ == "PRON" and token2.pos_ == "PRON": + other["pronouns"]["bin"] += 1 + elif token1.pos_ == "NOUN" and token2.pos_ == "NOUN": + other["nouns"]["bin"] += 1 + elif token1.pos_ == "ADV" and token2.pos_ == "ADV": + other["adverbs"]["bin"] += 1 + elif token1.pos_ == "ADJ" and token2.pos_ == "ADJ": + other["adjectives"]["bin"] += 1 + elif token1.pos_ == "VERB" and token2.pos_ == "VERB": + other["verbs"]["bin"] += 1 + + +def gen_num(np_morph, pron_morph): + """ + Helper function to determine gender and number agreement between a noun phrase + and a pronoun. + :param np_morph: the morphological features of the noun phrase. + :param pron_morph: the morphological features of the pronoun. + :return: True if the noun phrase and pronoun agree. + """ + try: + genus_np = next(iter(np_morph.get("Gender"))) + numerus_np = next(iter(np_morph.get("Number"))) + genus_pron = next(iter(pron_morph.get("Gender"))) + numerus_pron = next(iter(pron_morph.get("Number"))) + except StopIteration: + return False + + if genus_np == genus_pron and numerus_np == numerus_pron: + return True + return False + + +def run_cohesion(doc: Doc) -> dict: + """ + Runs the cohesion metrics. 
+
+
+def run_cohesion(doc: Doc) -> dict:
+    """
+    Runs the cohesion metrics. Adjacent cohesion is always calculated, whereas
+    global cohesion is only calculated if the number of sentences is below the
+    threshold MAX_SENTENCES, as defined in coh_metrix_new/configuration.py.
+    :param doc: the spaCy doc object containing the parsed text.
+    :return: the calculated cohesion metrics.
+    """
+    sentences = list(doc.sents)
+
+    # Single-sentence texts have no sentence pairs to compare, so bail out
+    # early instead of dividing by zero in the normalization below.
+    if len(sentences) < 2:
+        return {"adjacent": None, "global": None}
+
+    content_tags = {"NOUN", "VERB", "ADJ", "ADV"}
+    noun_tags = {"NOUN", "PROPN"}
+
+    _adjacent = {
+        "nouns": 0,
+        "arguments": 0,
+        "stems": 0,
+        "content_words": {"ratio": 0, "std": 0},
+        "anaphors": 0,
+    }
+
+    _other_adjacent = {
+        "nouns": {"bin": 0, "avg_ratio": 0},
+        "adjectives": {"bin": 0, "avg_ratio": 0},
+        "adverbs": {"bin": 0, "avg_ratio": 0},
+        "verbs": {"bin": 0, "avg_ratio": 0},
+        "pronouns": {"bin": 0, "avg_ratio": 0},
+    }
+
+    _global = {
+        "nouns": 0,
+        "arguments": 0,
+        "stems": 0,
+        "content_words": {"ratio": 0, "std": 0},
+        "anaphors": 0,
+    }
+
+    _other_global = {
+        "nouns": {"bin": 0, "avg_ratio": 0},
+        "adjectives": {"bin": 0, "avg_ratio": 0},
+        "adverbs": {"bin": 0, "avg_ratio": 0},
+        "verbs": {"bin": 0, "avg_ratio": 0},
+        "pronouns": {"bin": 0, "avg_ratio": 0},
+    }
+
+    adjacent_total_pairs = 0
+    global_total_pairs = 0
+
+    content_word_overlap_adjacent = []
+    content_word_overlap_global = []
+
+    doc_length = len(sentences)
+
+    max_sentences_reached = False
+
+    for i in range(doc_length - 1):
+        sentence1 = sentences[i]
+        sentence2 = sentences[i + 1]
+
+        adjacent_pairs = len(sentence1) * len(sentence2)
+
+        content_word_pairs = 0
+        content_word_overlaps = 0
+
+        # Calculate adjacent metrics
+        for token1 in sentence1._.cohesion_tokens:
+            for token2 in sentence2._.cohesion_tokens:
+                calculate_overlaps(token1, token2, _adjacent, _other_adjacent)
+
+                # Content word overlap
+                if token1.pos_ in content_tags and token2.pos_ in content_tags:
+                    content_word_pairs += 1
+                    if token1.lemma_ == token2.lemma_:
+                        _adjacent["content_words"]["ratio"] += 1
+                        content_word_overlaps += 1
+
+        # Anaphor overlap
+        for token in sentence2:
+            if token.pos_ == "PRON":
+                for np_ in sentence1._.np_chunks:
+                    if gen_num(np_["morph"], token.morph):
+                        _adjacent["anaphors"] += 1
+
+        # Normalize the other adjacent categories
+        for cat in _other_adjacent.keys():
+            if _other_adjacent[cat]["bin"] > 0:
+                _other_adjacent[cat]["avg_ratio"] += (
+                    _other_adjacent[cat]["bin"] / adjacent_pairs
+                )
+
+        adjacent_total_pairs += 1
+
+        # Guard against sentence pairs without any content-word pairs,
+        # which would otherwise raise a ZeroDivisionError.
+        content_word_overlap_adjacent.append(
+            content_word_overlaps / content_word_pairs if content_word_pairs else 0.0
+        )
+
+        # only allow global metrics if the document is not too long
+        if doc_length >= config.MAX_SENTENCES:
+            max_sentences_reached = True
+            continue
+
+        # Calculate global metrics
+        for j in range(i + 1, len(sentences)):
+            sentence2_global = sentences[j]
+
+            # Note: the pair count must use sentence2_global here, not the
+            # adjacent sentence2 from the outer loop.
+            global_pairs = len(sentence1) * len(sentence2_global)
+
+            content_word_pairs = 0
+            content_word_overlaps = 0
+
+            for token1 in sentence1._.cohesion_tokens:
+                for token2 in sentence2_global._.cohesion_tokens:
+                    calculate_overlaps(token1, token2, _global, _other_global)
+
+                    # Content word overlap
+                    if token1.pos_ in content_tags and token2.pos_ in content_tags:
+                        content_word_pairs += 1
+                        if token1.lemma_ == token2.lemma_:
+                            _global["content_words"]["ratio"] += 1
+                            content_word_overlaps += 1
+
+            # Anaphor overlap
+            # TODO: Global anaphor overlap considers past X sentences
+            for token in sentence2_global:
+                if token.pos_ == "PRON":
+                    for np_ in sentence1._.np_chunks:
+                        if gen_num(np_["morph"], token.morph):
+                            _global["anaphors"] += 1
+
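+            # NOTE (needs verification): "bin" accumulates across *all*
+            # sentence pairs -- it is never reset per pair -- so the
+            # normalization below adds a running total divided by the current
+            # pair count. The adjacent normalization above behaves the same
+            # way; verify this matches the intended average-ratio definition.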
_other_global[cat]["bin"] > 0: + _other_global[cat]["avg_ratio"] += ( + _other_global[cat]["bin"] / global_pairs + ) + + global_total_pairs += 1 + + content_word_overlap_global.append( + content_word_overlaps / content_word_pairs + ) + + for cat in _adjacent.keys(): + if cat == "content_words": + _adjacent["content_words"]["std"] = ( + np.std(np.array(content_word_overlap_adjacent)) + if adjacent_total_pairs > 0 + else "NaN" + ) + _adjacent["content_words"]["ratio"] /= adjacent_total_pairs + else: + _adjacent[cat] /= adjacent_total_pairs + + _adjacent["other"] = _other_adjacent + + if max_sentences_reached: + return {"adjacent": _adjacent, "global": None} + + for cat in _global.keys(): + if cat == "content_words": + _global["content_words"]["std"] = ( + np.std(np.array(content_word_overlap_global)) + if global_total_pairs > 0 + else "NaN" + ) + _global["content_words"]["ratio"] /= global_total_pairs + else: + _global[cat] /= global_total_pairs + + _global["other"] = _other_global + + return { + "adjacent": _adjacent, + "global": _global, + } diff --git a/coh_metrix_new/configuration.py b/coh_metrix_new/configuration.py index 0df4ec87..23f0bda9 100644 --- a/coh_metrix_new/configuration.py +++ b/coh_metrix_new/configuration.py @@ -1,2 +1,10 @@ CONTENT_TAGS = ["NN", "VB", "PN", "NOUN", "VERB", "PRON"] PUNCTUATION_MARKS = ["MAD", "MID", "PAD", "PUNCT"] + +# Maxmimum number of paragraphs to accept when running LSA +# TODO: Find a reasonable number of paragraphs to use for LSA +MAX_PARAGRAPHS = 10 + +# Maximum number of sentences to accept when running Cohesion +# TODO: Find a reasonable number of sentences to use for Cohesion +MAX_SENTENCES = 100 diff --git a/coh_metrix_new/helpers.py b/coh_metrix_new/helpers.py index e3641635..289a8108 100644 --- a/coh_metrix_new/helpers.py +++ b/coh_metrix_new/helpers.py @@ -1,5 +1,6 @@ """ -Computes the cosine-similarities for LSA. +Provides helper methods for the coh_metrix_new package. +Primarily used for LSA. """ import numpy as np @@ -48,7 +49,6 @@ def project(vector_1: np.ndarray, vector_2: np.ndarray) -> np.ndarray: :return: the resulting projected vector. """ - # proj = (v1·v2 / (|v2|^2)) * v2 dot = np.dot(vector_1, vector_2) len2 = np.dot(vector_2, vector_2) return vector_2.copy() * (dot / len2) @@ -103,6 +103,7 @@ def project_onto_subspace( return sub_space_projection + def calculate_safe_division(x, y) -> float: """ A helper method to assure no zero division exceptions are raised. diff --git a/coh_metrix_new/lsa.py b/coh_metrix_new/lsa.py index 167764de..9ca1d321 100644 --- a/coh_metrix_new/lsa.py +++ b/coh_metrix_new/lsa.py @@ -1,23 +1,14 @@ -from .lsa_helpers import ( - cos_sim, - norm_avg_cos_sims, - norm_std_cos_sims, -) +import time from sentence_transformers import SentenceTransformer +import coh_metrix_new.configuration as config +import coh_metrix_new.helpers as helpers +import warnings -_cache = {} +warnings.filterwarnings("ignore") _model = SentenceTransformer("KBLab/sentence-bert-swedish-cased") - -def process_tokenized_sentences(sentences: list) -> list: - """ - Process the tokenized sentences into a list of strings. - - :param sentences: a list of tokenized sentences. - :return: a list of strings. - """ - return [" ".join(sentence) for sentence in sentences] +_cache = {} def clear_cache() -> None: @@ -25,75 +16,103 @@ def clear_cache() -> None: _cache.clear() -def run_lsa(sentences: list, parsed: list) -> tuple[float, float]: +def run_lsa(docs) -> dict: """ Computes the LSASSp and LSASSpd scores for the given text. 
-    :param sentences: the CoNLL tree for the text.
-    :param connll: the CoNLL tree for the text.
-    :return: a 2-tuple where (LSASSp, LSASSpd) or None if there's
-            only one sentence.
+    :param docs: the spaCy doc object containing the parsed text.
+    :return: the computed LSA metrics.
     """
+    _n_paragraphs = 0
+    paragraphs = []
+    start = 0
+
+    # Create spans for each paragraph
+    for i, token in enumerate(docs):
+        if token.pos_ == "SPACE":
+            paragraphs.append(docs[start:i])
+            start = i + 1
+            _n_paragraphs += 1
+
+            # Check if max number of paragraphs has been reached
+            if _n_paragraphs == config.MAX_PARAGRAPHS:
+                break
+
+    # add the last span after exiting the loop
+    if start != 0:
+        _n_paragraphs += 1
+        paragraphs.append(docs[start:])
+
+    processed_sentences = [str(para) for para in paragraphs]
+
+    # Don't run LSA if we only have one paragraph
+    if len(paragraphs) <= 1:
+        return {
+            "global_lsa": None,
+            "adjacent_lsa": None,
+        }
 
-    if len(sentences) <= 1:
-        return None
+    # Precompute embeddings for all paragraphs for efficiency
+    # TODO: Still not the quickest, might be alleviated with GPU support.
+    # If we keep getting bottlenecked by the embeddings, we may need to re-think
+    # this approach, potentially waiting until a new spaCy model with transformers
+    # can be trained.
+    start_time = time.time()
 
-    processed_sentences = process_tokenized_sentences(sentences)
+    for item in paragraphs:
+        item_str = str(item)
+        _cache[item_str] = _model.encode(item_str, convert_to_numpy=True)
 
-    # Precompute all sentences for efficiency
-    for sentence in processed_sentences:
-        _cache[sentence] = _model.encode(sentence, convert_to_numpy=True)
+    print(f"Precomputation time: {time.time() - start_time:.2f}s")
 
     # Run each LSA computation (fairly quickly since the metrics are precomputed)
-    lsa_all = lsa_all_sentences(processed_sentences)
+    # However, if the text is too large, we don't run LSA globally
+    # TODO: Configure config.MAX_PARAGRAPHS when a reasonable number of paragraphs
+    # has been identified.
+    if len(paragraphs) > config.MAX_PARAGRAPHS:
+        lsa_all = (None, None)
+    else:
+        start_time = time.time()
+        lsa_all = lsa_all_sentences(processed_sentences)
+        print(f"Global LSA time: {time.time() - start_time:.2f}s")
+
+    start_time = time.time()
     lsa_adjacent = lsa_adjacent_sentences(processed_sentences)
-    lsa_givenness = lsa_compute_givenness(parsed)
+    print(f"Adjacent LSA time: {time.time() - start_time:.2f}s")
 
     clear_cache()
 
     return {
         "global_lsa": {"avg": lsa_all[0], "std": lsa_all[1]},
         "adjacent_lsa": {"avg": lsa_adjacent[0], "std": lsa_adjacent[1]},
-        "givenness": lsa_givenness,
     }
 
 
-# Taken from coh_metrix_kvtill/lsa_all_sentences.py, converted to a function from a class since we're not reusing the data and only computing once
-# Not sure if this represents the global_lsa from the original SAPIS, hard to verify as the API-call gives np.nan for both metrics
-# Check with Daniel
 def lsa_all_sentences(processed_sentences: list) -> tuple[float, float]:
-    return ("None", "None")
     """
     Computes the LSASSp and LSASSpd scores for the given text.
 
     :param sentences: A list of pre-processed sentences.
     :return: a 2-tuple where (LSASSp, LSASSpd) or None if there's
-            only one sentence.
+            only one sentence. (None, None) if the text is too large to
+            compute LSA.
     """
-
-    # Calculate the cos_sum between one vector of the sentence "index" and the sentence "query"
-    # Index here is the current sentence, and query is the sentence which is compared against
-    # Do this for each index
-    # NOTE: Might benefit from a length check?
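+    # Cost sketch, assuming n cached paragraph embeddings: the comprehension
+    # below evaluates n * (n - 1) ordered pairs, which is why run_lsa only
+    # calls this function when the paragraph count is within MAX_PARAGRAPHS.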
+    # Calculate the cos_sum between one vector of the sentence "index"
+    # and the sentence "query" -- do this for each index
+    # NOTE: This is computationally expensive!
     cos_sims = [
-        cos_sim(_cache[index], _cache[query])
+        helpers.cos_sim(_cache[index], _cache[query])
         for i, index in enumerate(processed_sentences)
         for j, query in enumerate(processed_sentences)
        if i != j
     ]
 
     return (
-        norm_avg_cos_sims(cos_sims),
-        norm_std_cos_sims(cos_sims),
+        helpers.norm_avg_cos_sims(cos_sims),
+        helpers.norm_std_cos_sims(cos_sims),
     )
 
 
-# Taken from coh_metrix_kvtill/lsa_adjacent_sentences.py, converted to a function from a class since we're not reusing the data and only computing once
-# Implementation gives way different results than the LSA from the API-call when run with long.txt
-# "avg": 0.5413950681686401, "std": 0.19490689039230347 (API)
-# "avg": 0.6881772577762604, "std": 0.5822818204760551 (this function)
-# Differs slightly from the original code as well, unsure if this is due to the new implementation from KvTill23 or if there might be some issues here
-# Check with Daniel if this seems right
 def lsa_adjacent_sentences(processed_sentences: list) -> tuple[float, float]:
     """
     Compute the average and standard deviation cosine similarity
@@ -103,7 +122,10 @@ def lsa_adjacent_sentences(processed_sentences: list) -> tuple[float, float]:
     :return: a 2-tuple where (avg_cos_sims, std_cos_sims).
     """
 
-    # NOTE: The original code seems to check if the vectorized indexes (embeddings here) have a value before continuing -- is there a point for us to do that here as well? (e.g. if not _cache[index] or not _cache[query]: continue)
+    # NOTE: The original code seems to check if the vectorized indexes
+    # (embeddings here) have a value before continuing -- is there a
+    # point for us to do that here as well? (e.g. if not _cache[index]
+    # or not _cache[query]: continue)
 
     cos_sims = []
 
@@ -111,54 +133,10 @@
         next_sentence = processed_sentences[i + 1]
         embedding = _cache[sentence]
         next_embedding = _cache[next_sentence]
-        cossim = cos_sim(embedding, next_embedding)
+        cossim = helpers.cos_sim(embedding, next_embedding)
         cos_sims.append(cossim)
 
-    avg_cos_sims = norm_avg_cos_sims(cos_sims)
-    std_cos_sims = norm_std_cos_sims(cos_sims)
+    avg_cos_sims = helpers.norm_avg_cos_sims(cos_sims)
+    std_cos_sims = helpers.norm_std_cos_sims(cos_sims)
 
     return avg_cos_sims, std_cos_sims
-
-
-# Taken from coh_metrix_kvtill/taaco_givenness.py, I assumed we're interested in parsing the entire text and thus only took the relevant parts for the pipeline
-# Furthermore I assumed we're only interested in the global givenness average for the entire text, and that the taaco givenness is preferable to the giveness in coh_metrix_kvtill/lsa_givenness.py
-# Results seem to be roughly in range though
-# 0.4509948415622697 (this function) vs 0.4802052785923754 (API)
-# Looks very similar to the original giveness-code as well
-# Check with Daniel though
-def lsa_compute_givenness(parsed: list) -> tuple[float, float]:
-    """Calculate the global givenness average for the entire text.
-    :param parsed: a list with the parsed text.
-    :return: the global givenness average."""
-
-    current_lemmas = set()
-    repeated_lemmas, pronouns = 0, 0
-
-    for sentence in parsed:
-        for word in sentence:
-            try:
-                lemma = word[2]  # Lemma
-                upos = word[3]  # UpoS (converted to SUC)
-                current_lemmas.add(lemma)  # save number of unique lemmas
-
-                # if the word is a noun, verb or pronoun and lemma has been repeated once
-                if lemma in current_lemmas and upos in content_tags:
-                    repeated_lemmas += 1
-
-                # if the word is a pronoun and exists in the defined list of swedish pronouns
-                if lemma in swedish_pronouns and upos == "PRON":
-                    pronouns += 1
-
-                # if word is not a punctuation mark
-                if upos not in punctuation_marks:
-                    total_lemmas += 1
-
-            except:
-                continue
-
-    # Calculate global givenness according to the formula
-    return (repeated_lemmas + pronouns) / total_lemmas
-
-
-if __name__ == "__main__":
-    pass
diff --git a/coh_metrix_new/run_coh_metrix.py b/coh_metrix_new/run_coh_metrix.py
deleted file mode 100644
index 6246c732..00000000
--- a/coh_metrix_new/run_coh_metrix.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from lsa import run_lsa
-
-# NOTE: WIP
-
-
-def run_coh_metrix(sentences: list, parsed: list) -> dict:
-    """
-    Runs all the Coh-Metrix metrics on the given text.
-
-    :param sentences: the CoNLL tree for the text.
-    :param connll: the CoNLL tree for the text.
-    :return: a 2-tuple where (LSASSp, LSASSpd) or None if there's
-            only one sentence.
-    """
-    return {
-        "lsa": run_lsa(sentences, parsed),
-    }
diff --git a/latt.txt b/latt.txt
new file mode 100644
index 00000000..114452f2
--- /dev/null
+++ b/latt.txt
@@ -0,0 +1,9 @@
+I onsdags kom det mycket regn på flera platser i Sverige. I Jönköping blev det översvämningar på vägar och i hus.
+
+Det kommer att kosta många miljoner kronor att laga allt som gick sönder. Det säger myndigheterna.
+
+Nu varnar myndigheten SMHI för mer problem med regn och åska på lördag. Det finns risk för nya översvämningar.
+
+Värst kan det bli i Götaland och på Gotland. Kanske kan ovädret nå ända upp till Svealand.
+
+Experterna på väder säger att det kan bli mycket regn, mellan 30 och 60 millimeter. Det kommer också att blåsa mycket.
\ No newline at end of file
diff --git a/lsa.txt b/lsa.txt
index 1f94a14e..6844d0ab 100644
--- a/lsa.txt
+++ b/lsa.txt
@@ -1,2 +1,8 @@
 När Sverige kristnades och i vilken omfattning är oklart. Spår finns redan från omkring 900 men den förste kung som tog ställning för kristendomen var Olof Skötkonung omkring år 1000. Kristnandet skedde genom mission och genom att kungar och hövdingafamiljer ställde sig bakom den nya guden.
-Olika historiker har satt olika datum för "enandet" av Sverige, beroende på vad man anser vara ett enat rike och tillräckligt pålitliga källor. Den första historiskt säkert belagde svenske kungen är Erik Segersäll, men Sverige bestod då bara av Götalandskapen, Närke, Södermanland, Västmanland och de uppländska folklanden. Konsolideringen av Sverige fortgick under medeltiden och nya områden lades till. Vid denna tid etablerades även det svenska styret över Finland. De finska landskapen var redan vid slutet av 1200-talet fullt jämställda med övriga delar av det svenska riket.
\ No newline at end of file
+Olika historiker har satt olika datum för "enandet" av Sverige, beroende på vad man anser vara ett enat rike och tillräckligt pålitliga källor. Den första historiskt säkert belagde svenske kungen är Erik Segersäll, men Sverige bestod då bara av Götalandskapen, Närke, Södermanland, Västmanland och de uppländska folklanden. Konsolideringen av Sverige fortgick under medeltiden och nya områden lades till. Vid denna tid etablerades även det svenska styret över Finland. De finska landskapen var redan vid slutet av 1200-talet fullt jämställda med övriga delar av det svenska riket.
+Under medeltiden var det landskapen och lagsagorna, inte riket, som var i centrum. Först 1336 ersattes landskapslagarna av en enhetlig lag för hela riket av kung Magnus Eriksson.
+Norden förenades i Kalmarunionen år 1397 varpå de tre rikena gav upp delar av sin självständighet.
+Vasatiden började med att Gustav Vasa efter Stockholms blodbad ledde Sverige ur Kalmarunionen genom krig med Danmark, en konflikt som kallas befrielsekriget. Kriget slutade 1523, och samma år valdes Gustav Vasa till kung av Sverige. Under Gustav Vasa tog reformationen fart i Sverige samtidigt som reduktioner av kyrkojord bidrog till att statens finanser stärktes. Gustav Vasa var även den som 1544 införde arvkungadöme i Sverige.
+Under Vasatiden kallade ståndsriksdagen löpande till riksdagar och omfattade då permanent bondeståndet.
+Stormaktstiden inleddes med att kung Gustav II Adolf ledde in Sverige i det trettioåriga kriget. Kungen dödades redan 1632 under Slaget vid Lützen, men Sveriges deltagande under kriget fram till fredsslutet 1648 gjorde att Sverige genom den Westfaliska freden blev en europeisk stormakt, med besittningar både i Baltikum och norra Tyskland. Det var under stormaktstiden som Skåne, Halland, Blekinge, Bohuslän, Gotland, Härjedalen och Jämtland blev en del av Sverige.
+Fram till 1680 delades den utövande makten mellan kungen och högadeln, vilket ledde till mäktigare adelsmän. I samband med reduktionen infördes 1680 enväldig monarki, vilken varade fram till Karl XII:s död 1718, vilket markerar slutet på Stora nordiska kriget.
\ No newline at end of file
diff --git a/pipeline.py b/pipeline.py
index 1782740b..2e89be74 100755
--- a/pipeline.py
+++ b/pipeline.py
@@ -14,49 +14,39 @@ Filip Salomonsson <filip.salomonsson@gmail.com>
 Robert Östling <robert.ostling@helsinki.fi>
 Aaron Smith <aaron.smith@lingfil.uu.se>
 """
-# added comment
 
 import pprint # for debugging
 import time # for debugging
+import warnings
 
 import spacy
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from math import isfinite
 import numpy as np
 import spacy
 from spacy import displacy
+from spacy_transformers import Transformer, TransformerModel
 
-
-# Import Scream
-# from scream.metrics import LexicalMetrics
-# from scream.metrics import SurfaceMetrics
-# from scream.metrics import StructuralMetrics
-# from scream.document_parts import DependencyParsedDocument
+# spacy-transformers spits out a bunch of deprecation warnings,
+# so like any responsible dev would, we'll ignore them
+warnings.filterwarnings("ignore")
 
 from scream2.scream2 import add_scream_metrics
 from coh_metrix_new.coh_metrix import add_coh_metrix
 
 # Import stillett3
-# from stilett import ud_text_simplifier
-from stilett import ud_tree
-
-# from coh_metrix.coh_metrix_pipeline import run_coh_metrix
-
-# Import Coh-metrix
-# from coh_metrix.cohesion import run_cohesion
-# from coh_metrix_new.lsa import run_lsa
-
-# from .coh_metrix import givenness
-# from coh_metrix.connectives import run_connectives
-
-# Import Synonyms
-# sys.path.insert(0,'synonyms/')
-# from .synonyms import synonyms
+# from stilett import ud_tree
 
 MAX_TOKEN = 256
 
 nlp = spacy.load("sv_core_news_lg")
 
+# We're using some extra extensions on spaCy's Doc and Span objects,
+# all defined here
+Doc.set_extension("scream_metrics", default=None)
+Doc.set_extension("coh_metrix_helpers", default=None)
+Span.set_extension("cohesion_tokens", default=[])
+Span.set_extension("np_chunks", default=[])
+
 add_scream_metrics(nlp)
 add_coh_metrix(nlp)
@@ -117,14 +107,12 @@
 def main(data: bytes, test: bool = False, parser: str = "stanza") -> list:
     """Main function for the pipeline.
 
-    Args:
-        data (bytes): The input data to process.
-
-    Returns:
-        list: A list of dictionaries containing the processed data."""
+    :param data: The input data (bytes) to process.
+    :return: A list of dictionaries containing the processed data."""
 
     # TODO: ta in argument här på något sätt?
     args = data.decode("utf-8")
-
+
     return run_pipeline({args}, test)
@@ -144,12 +132,13 @@
 
     # ================== PARSING ==================
     start_time = time.time()
-    #doc, node_tree = process_file(filename)
+    # doc, node_tree = process_file(filename)
     doc = process_file(filename)
 
     if test:
         time_checker(start_time, "spaCy", timestamps)
-        pprint.pp(doc.user_data["coh_metrix"])
+        pprint.pp(doc._.scream_metrics)
+        pprint.pp(doc._.coh_metrix)
 
         return doc
@@ -166,7 +155,6 @@
     # will be removed shortly.
 
     # coh_metrix_results = run_coh_metrix(processed['parsed'], timestamps)
-
     # coh_metrix_start_time = time.time()
 
     start_time = time.time()
@@ -251,7 +239,7 @@
     # for sentence in doc.sents
     # ]
 
-    return doc#, node_tree
+    return doc  # , node_tree
 
 
 def time_checker(start_time: float, method: str, timestamps: dict) -> None:
@@ -266,68 +254,9 @@
     print(f"{method} took {elapsed_time:.2f} seconds.")
 
 
-def combine_metrics(docs) -> dict:
-    """
-    Combines metrics for all paragraphs over the entire document into
-    a single dict usable in SCREAM2 and Coh-Metrix.
-    :param docs: A generator of spaCy documents.
-    :return: A dict with all extracted metrics for the entire text.
-    """
-
-    # Initialize the combined metrics dictionary
-    combined_metrics = {
-        "sentence_lengths": 0.0,
-        "n_sentences": 0.0,
-        "total_word_length": 0.0,
-        "n_words": 0.0,
-        "n_syllables": 0.0,
-        "total_token_length": 0.0,
-        "n_unique_tokens": 0.0,
-        "n_unique_words": 0.0,
-        "sentence_length_list": [],
-        "n_tokens": 0.0,
-        "n_lix_long_words": 0.0,
-        "swevoc_dict": {
-            "total": 0.0,
-            "C": 0.0,
-            "D": 0.0,
-            "H": 0.0,
-            "S": 0.0,
-            "K": 0.0,
-        },
-        "total_dep_distance": 0.0,
-        "n_nominal_postmodifiers": 0.0,
-        "n_nominal_premodifiers": 0.0,
-        "n_right_dependencies": 0.0,
-        "n_prep_comp": 0.0,
-        "verb_arities": [],
-        "n_verbs": 0.0,
-        "verb_arities_dict": {
-            1: 0.0,
-            2: 0.0,
-            3: 0.0,
-            4: 0.0,
-            5: 0.0,
-            6: 0.0,
-            7: 0.0,
-        },
-        "n_sub_clauses": 0.0,
-        "dep_ud_probs": dict(config.DEP_UD_TYPES),
-        "pos_ud_probs": dict(config.UPOS_POS_TYPES),
-        "n_content_words": 0.0,
-        "n_verbal_roots": 0.0,
-        "sentence_depth": 0.0,
-    }
-
-
 if __name__ == "__main__":
     main(
         "I skolan äter jag ett rött äpple. Det brukar ganska ofta vara ruttet.".encode(
             "utf-8"
         )
     )
-    # main(
-    #     "Det finns ett stort antal meningar som är onödigt långa, och vi behöver se till att dessa kan taggas godtyckligt. Denna, till synes, enkla uppgift är inte alltid så enkel.".encode(
-    #         "utf-8"
-    #     )
-    # )
diff --git a/scream2/scream2.py b/scream2/scream2.py
index 367765c8..d45d754f 100644
--- a/scream2/scream2.py
+++ b/scream2/scream2.py
@@ -6,10 +6,13 @@ add_scream_metrics(nlp) to add this component to your pipeline.
 See README.md for an explanation of all SCREAM2 metrics calculated in this
 code. Variable declaration as well as calculations are all separated into
 their own sections for readability and maintainability.
+
+Please note that SCREAM2 also extracts some data used in Coh-Metrix in order
+to reduce the number of times we need to iterate over the entire dataset.
 """
 
 from spacy.language import Language
-from spacy.tokens import Doc, Token
+from spacy.tokens import Doc, Token, Span
 
 from scream2.SweVoc.swevoc import load_swe_voc
@@ -20,6 +23,7 @@ from math import log
 
 import time
 
+
 @Language.component("scream_metrics")
 def scream_metrics(doc: Doc) -> Doc:
     """
@@ -94,7 +98,6 @@ def scream_metrics(doc: Doc) -> Doc:
 
     # -- Structural vars --
     n_content_words = 0
-    n_pos_tags = 0
     n_verbs = 0
     n_verbal_roots = 0
     n_syllables = 0
@@ -129,9 +132,9 @@ def scream_metrics(doc: Doc) -> Doc:
         return max_depth + 1.0
 
     # ----- Coh-Metrix helpers -----
-    # Some data for Coh-Metrix can be extracted in this loop, to avoid running unneccessary
-    # loops in coh_metrix.py. This will simply be added to the document and used in later
-    # modules.
+    # Some data for Coh-Metrix can be extracted in this loop, to avoid running unnecessary
+    # loops in coh_metrix.py. This will simply be added to the document and used in later
+    # modules.
 
     current_lemmas = set()
     repeated_lemmas, pronouns = 0, 0
@@ -206,7 +209,7 @@ def scream_metrics(doc: Doc) -> Doc:
                 current_lemmas.add(token.lemma)
             else:
                 repeated_lemmas += 1
-
+
             pronouns += token.lemma in config.SWEDISH_PRONOUNS and token.pos_ == "PRON"
 
         total_dep_distance += abs(dep_distance)
@@ -252,6 +255,9 @@ def scream_metrics(doc: Doc) -> Doc:
         current_sentence_length = 0
         current_verb_arities = 0
 
+        # Extracting coherence metrics
+        sent_cohesion_tokens = []
+
         # unfortunate double looping, unavoidable if we need to skip punctuation
         for token in sent:
            current_sentence_length += not token.is_punct
@@ -263,12 +269,27 @@ def scream_metrics(doc: Doc) -> Doc:
             if current_verb_arities in verb_arities_dict:
                 verb_arities_dict[current_verb_arities] += 1
 
+            # Extract tokens in the sentence relevant for cohesion.
+            if token.pos_ in {"NOUN", "VERB", "PRON", "PROPN", "ADJ", "ADV"}:
+                sent_cohesion_tokens.append(token)
+
         sentence_lengths += current_sentence_length
         sentence_length_list_np[i] = current_sentence_length
         verb_arities.append(current_verb_arities)
 
         total_verb_arity = np.average(verb_arities)
 
+        sent._.cohesion_tokens = sent_cohesion_tokens
+
+        # Build a local list and assign it in one go: appending to the
+        # extension's mutable default ([]) would share a single list across
+        # every span in every doc.
+        np_chunks = []
+        for np_ in sent.noun_chunks:
+            np_chunks.append(
+                {
+                    "start": np_.start,
+                    "end": np_.end - 1,
+                    "morph": np_.root.morph,
+                }
+            )
+        sent._.np_chunks = np_chunks
+
     for key in verb_arities_dict:
         verb_arity_unigram_probs[key] = calculate_safe_division(
             verb_arities_dict[key], n_words
@@ -309,6 +330,7 @@ def scream_metrics(doc: Doc) -> Doc:
         "avg_word_length": calculate_safe_division(total_word_length, n_words),
         "avg_n_syllables": calculate_safe_division(n_syllables, n_words),
         "n_words": n_words,
+        "n_sentences": n_sentences,
     }
 
     additional_surface_metrics = {
@@ -407,6 +429,9 @@ def scream_metrics(doc: Doc) -> Doc:
         "pronouns": pronouns,
     }
 
+    doc._.scream_metrics = doc.user_data["scream_metrics"]
+    doc._.coh_metrix_helpers = doc.user_data["coh_metrix_helpers"]
+
     return doc
diff --git a/testset_results.json b/testset_results.json
deleted file mode 100644
index e27cf080..00000000
--- a/testset_results.json
+++ /dev/null
@@ -1,2 +0,0 @@
-{
-    "0":
\ No newline at end of file
-- 
GitLab