Commit 372161d9 authored by Love Arreborn

bugfix on cohesion

parent 53fdd7b0
"""
This module calculates cohesion metrics for a given text.
The metrics are divided into adjacent and global cohesion, where global
cohesion is only calculated if the amount of sentences is below the threshold
MAX_SENTENCES, as defined in coh_metrics/configuration.py.
These metrics need to be verified in full, especially anaphor overlap.
Furthermore, training a new model to include transformer embeddings as well as
coreference resolution would be beneficial.
"""
import numpy as np
from spacy.tokens import Token, Doc

from coh_metrix import configuration as config
def calculate_overlaps(token1: Token, token2: Token, main: dict, other: dict):
    """
    A helper function to minimize code duplication in the cohesion calculation.
    Can be used for both adjacent and global cohesion.

    :param token1: the first token.
    :param token2: the second token.
    :param main: the main dictionary to update.
    :param other: the other dictionary to update.
    """
    # Noun overlap
    if (
        token1.text.lower() == token2.text.lower()
        and token1.pos_ == token2.pos_
        and token1.pos_ in ["NOUN", "PROPN"]
    ):
        main["nouns"] += 1

    # Argument overlap
    if (
        token1.text == token2.text and token1.pos_ == "PRON" and token2.pos_ == "PRON"
    ) or (
        token1.lemma_ == token2.lemma_
        and token1.pos_ == "NOUN"
        and token2.pos_ == "NOUN"
    ):
        main["arguments"] += 1

    # Stem overlap
    if (
        token1.lemma_ == token2.lemma_
        and token1.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]
        and token2.pos_ == "NOUN"
    ):
        main["stems"] += 1
    # Content word overlap (for other bins)
    if token1.lemma_ == token2.lemma_:
        if token1.pos_ == "PRON" and token2.pos_ == "PRON":
            other["pronouns"]["bin"] += 1
        elif token1.pos_ == "NOUN" and token2.pos_ == "NOUN":
            other["nouns"]["bin"] += 1
        elif token1.pos_ == "ADV" and token2.pos_ == "ADV":
            other["adverbs"]["bin"] += 1
        elif token1.pos_ == "ADJ" and token2.pos_ == "ADJ":
            other["adjectives"]["bin"] += 1
        elif token1.pos_ == "VERB" and token2.pos_ == "VERB":
            other["verbs"]["bin"] += 1
def gen_num(np_morph, pron_morph):
    """
    Helper function to determine gender and number agreement between a noun phrase
    and a pronoun.

    :param np_morph: the morphological features of the noun phrase.
    :param pron_morph: the morphological features of the pronoun.
    :return: True if the noun phrase and pronoun agree in gender and number.
    """
    try:
        genus_np = next(iter(np_morph.get("Gender")))
        numerus_np = next(iter(np_morph.get("Number")))
        genus_pron = next(iter(pron_morph.get("Gender")))
        numerus_pron = next(iter(pron_morph.get("Number")))
    except StopIteration:
        return False

    return genus_np == genus_pron and numerus_np == numerus_pron
def run_cohesion(doc: Doc) -> dict:
    """
    Runs the cohesion metrics. Adjacent cohesion is always calculated, whereas global
    cohesion is only calculated if the number of sentences is below the threshold
    MAX_SENTENCES, as defined in coh_metrix/configuration.py.

    :param doc: the spaCy doc object containing the parsed text.
    :return: the calculated cohesion metrics.
    """
    sentences = list(doc.sents)
    content_tags = {"NOUN", "VERB", "ADJ", "ADV"}

    _adjacent = {
        "nouns": 0,
        "arguments": 0,
        "stems": 0,
        "content_words": {"ratio": 0, "std": 0},
        "anaphors": 0,
    }
    _other_adjacent = {
        "nouns": {"bin": 0, "avg_ratio": 0},
        "adjectives": {"bin": 0, "avg_ratio": 0},
        "adverbs": {"bin": 0, "avg_ratio": 0},
        "verbs": {"bin": 0, "avg_ratio": 0},
        "pronouns": {"bin": 0, "avg_ratio": 0},
    }

    _global = {
        "nouns": 0,
        "arguments": 0,
        "stems": 0,
        "content_words": {"ratio": 0, "std": 0},
        "anaphors": 0,
    }
    _other_global = {
        "nouns": {"bin": 0, "avg_ratio": 0},
        "adjectives": {"bin": 0, "avg_ratio": 0},
        "adverbs": {"bin": 0, "avg_ratio": 0},
        "verbs": {"bin": 0, "avg_ratio": 0},
        "pronouns": {"bin": 0, "avg_ratio": 0},
    }

    adjacent_total_pairs = 0
    global_total_pairs = 0
    content_word_overlap_adjacent = []
    content_word_overlap_global = []

    doc_length = len(sentences)
    max_sentences_reached = False
    for i in range(doc_length - 1):
        sentence1 = sentences[i]
        sentence2 = sentences[i + 1]
        adjacent_pairs = len(sentence1) * len(sentence2)
        # capture starting bin counts for per-pair calculation
        start_bins = {cat: _other_adjacent[cat]["bin"] for cat in _other_adjacent}

        content_word_pairs = 0
        content_word_overlaps = 0

        # Calculate overlaps for this adjacent pair
        for token1 in sentence1._.cohesion_tokens:
            for token2 in sentence2._.cohesion_tokens:
                calculate_overlaps(token1, token2, _adjacent, _other_adjacent)
                if token1.pos_ in content_tags and token2.pos_ in content_tags:
                    content_word_pairs += 1
                    if token1.lemma_ == token2.lemma_:
                        _adjacent["content_words"]["ratio"] += 1
                        content_word_overlaps += 1

        # Anaphor overlap
        for token in sentence2:
            if token.pos_ == "PRON":
                for np_ in sentence1._.np_chunks:
                    if gen_num(np_["morph"], token.morph):
                        _adjacent["anaphors"] += 1

        # update per-pair avg_ratio using local bin increments
        for cat in _other_adjacent:
            local_count = _other_adjacent[cat]["bin"] - start_bins[cat]
            if local_count > 0:
                _other_adjacent[cat]["avg_ratio"] += local_count / adjacent_pairs

        adjacent_total_pairs += 1
        content_word_overlap_adjacent.append(
            content_word_overlaps / content_word_pairs
            if content_word_pairs > 0 and content_word_overlaps > 0
            else 0
        )

        # only allow global metrics if the document is not too long
        if doc_length >= config.MAX_SENTENCES:
            max_sentences_reached = True
            continue

        # Calculate global metrics
        for j in range(i + 1, doc_length):
            sentence2_global = sentences[j]
            global_pairs = len(sentence1) * len(sentence2_global)

            start_bins_g = {cat: _other_global[cat]["bin"] for cat in _other_global}
            cw_pairs_g = 0
            cw_overlaps_g = 0

            for token1 in sentence1._.cohesion_tokens:
                for token2 in sentence2_global._.cohesion_tokens:
                    calculate_overlaps(token1, token2, _global, _other_global)
                    if token1.pos_ in content_tags and token2.pos_ in content_tags:
                        cw_pairs_g += 1
                        if token1.lemma_ == token2.lemma_:
                            _global["content_words"]["ratio"] += 1
                            cw_overlaps_g += 1

            # Anaphor overlap
            for token in sentence2_global:
                if token.pos_ == "PRON":
                    for np_ in sentence1._.np_chunks:
                        if gen_num(np_["morph"], token.morph):
                            _global["anaphors"] += 1

            # update per-pair avg_ratio using local bin increments
            for cat in _other_global:
                local_count_g = _other_global[cat]["bin"] - start_bins_g[cat]
                if local_count_g > 0:
                    _other_global[cat]["avg_ratio"] += local_count_g / global_pairs

            global_total_pairs += 1
            content_word_overlap_global.append(
                cw_overlaps_g / cw_pairs_g if cw_pairs_g > 0 and cw_overlaps_g > 0 else 0
            )
    # finalize adjacent metrics
    if adjacent_total_pairs > 0:
        _adjacent["content_words"]["ratio"] /= adjacent_total_pairs
        _adjacent["content_words"]["std"] = np.std(
            np.array(content_word_overlap_adjacent)
        )
        for cat in _other_adjacent:
            _other_adjacent[cat]["avg_ratio"] /= adjacent_total_pairs
        for cat in [k for k in _adjacent if k != "content_words"]:
            _adjacent[cat] /= adjacent_total_pairs
    _adjacent["other"] = _other_adjacent

    if max_sentences_reached:
        return {"adjacent": _adjacent, "global": None}

    # finalize global metrics
    if global_total_pairs > 0:
        _global["content_words"]["ratio"] /= global_total_pairs
        _global["content_words"]["std"] = np.std(
            np.array(content_word_overlap_global)
        )
        for cat in _other_global:
            _other_global[cat]["avg_ratio"] /= global_total_pairs
        for cat in [k for k in _global if k != "content_words"]:
            _global[cat] /= global_total_pairs
    _global["other"] = _other_global

    return {"adjacent": _adjacent, "global": _global}
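
# ---------------------------------------------------------------------------
# Minimal usage sketch: shows how calculate_overlaps updates its counter
# dictionaries for one pair of sentences. The two single-sentence Docs below
# are built by hand with explicit POS tags and lemmas, so no trained spaCy
# pipeline is needed; the example only assumes that the coh_metrix package
# imported at the top of this module is installed. Note that run_cohesion
# additionally relies on the custom extensions `cohesion_tokens` and
# `np_chunks`, which are registered elsewhere in the project and are not
# exercised here.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("sv")
    sent1 = Doc(
        nlp.vocab,
        words=["Hunden", "jagade", "katten"],
        pos=["NOUN", "VERB", "NOUN"],
        lemmas=["hund", "jaga", "katt"],
    )
    sent2 = Doc(
        nlp.vocab,
        words=["Katten", "sprang", "hem"],
        pos=["NOUN", "VERB", "ADV"],
        lemmas=["katt", "springa", "hem"],
    )

    main = {"nouns": 0, "arguments": 0, "stems": 0}
    other = {
        cat: {"bin": 0, "avg_ratio": 0}
        for cat in ("nouns", "adjectives", "adverbs", "verbs", "pronouns")
    }
    for token1 in sent1:
        for token2 in sent2:
            calculate_overlaps(token1, token2, main, other)

    # Expected: "katten"/"Katten" count as noun, argument and stem overlap,
    # and add one content-word hit to the "nouns" bin.
    print(main)
    print(other)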