Commit 372161d9 authored by Love Arreborn

bugfix on cohesion

parent 53fdd7b0
"""
This module calculates cohesion metrics for a given text.
The metrics are divided into adjacent and global cohesion, where global
cohesion is only calculated if the amount of sentences is below the threshold
MAX_SENTENCES, as defined in coh_metrics/configuration.py.
These metrics need to be verified in full, especially anaphor overlap.
Furthermore, training a new model to include transformer embeddings as well as
coreference resolution would be beneficial.
"""
import numpy as np
from spacy.tokens import Token, Doc

from coh_metrix import configuration as config
def calculate_overlaps(token1: Token, token2: Token, main: dict, other: dict):
    """
    A helper function to minimize code duplication in the cohesion calculation.
    Can be used for both adjacent and global cohesion.

    :param token1: the first token.
    :param token2: the second token.
    :param main: the main dictionary to update.
    :param other: the other dictionary to update.
    """
    # Noun overlap
    if (
        token1.text.lower() == token2.text.lower()
        and token1.pos_ == token2.pos_
        and token1.pos_ in ["NOUN", "PROPN"]
    ):
        main["nouns"] += 1

    # Argument overlap
    if (
        token1.text == token2.text and token1.pos_ == "PRON" and token2.pos_ == "PRON"
    ) or (
        token1.lemma_ == token2.lemma_
        and token1.pos_ == "NOUN"
        and token2.pos_ == "NOUN"
    ):
        main["arguments"] += 1

    # Stem overlap
    if (
        token1.lemma_ == token2.lemma_
        and token1.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]
        and token2.pos_ == "NOUN"
    ):
        main["stems"] += 1
    # Content word overlap (for other bins)
    if token1.lemma_ == token2.lemma_:
        if token1.pos_ == "PRON" and token2.pos_ == "PRON":
            other["pronouns"]["bin"] += 1
        elif token1.pos_ == "NOUN" and token2.pos_ == "NOUN":
            other["nouns"]["bin"] += 1
        elif token1.pos_ == "ADV" and token2.pos_ == "ADV":
            other["adverbs"]["bin"] += 1
        elif token1.pos_ == "ADJ" and token2.pos_ == "ADJ":
            other["adjectives"]["bin"] += 1
        elif token1.pos_ == "VERB" and token2.pos_ == "VERB":
            other["verbs"]["bin"] += 1
def gen_num(np_morph, pron_morph):
    """
    Helper function to determine gender and number agreement between a noun phrase
    and a pronoun.

    :param np_morph: the morphological features of the noun phrase.
    :param pron_morph: the morphological features of the pronoun.
    :return: True if the noun phrase and pronoun agree in gender and number.
    """
    try:
        genus_np = next(iter(np_morph.get("Gender")))
        numerus_np = next(iter(np_morph.get("Number")))
        genus_pron = next(iter(pron_morph.get("Gender")))
        numerus_pron = next(iter(pron_morph.get("Number")))
    except StopIteration:
        return False

    return genus_np == genus_pron and numerus_np == numerus_pron
def run_cohesion(doc: Doc) -> dict:
    """
    Runs the cohesion metrics. Adjacent cohesion is always calculated, whereas global
    cohesion is only calculated if the number of sentences is below the threshold
    MAX_SENTENCES, as defined in coh_metrix/configuration.py.

    :param doc: the spaCy doc object containing the parsed text.
    :return: the calculated cohesion metrics.
    """
    sentences = list(doc.sents)
    content_tags = {"NOUN", "VERB", "ADJ", "ADV"}

    _adjacent = {
        "nouns": 0,
        "arguments": 0,
        "stems": 0,
        "content_words": {"ratio": 0, "std": 0},
        "anaphors": 0,
    }
    _other_adjacent = {
        "nouns": {"bin": 0, "avg_ratio": 0},
        "adjectives": {"bin": 0, "avg_ratio": 0},
        "adverbs": {"bin": 0, "avg_ratio": 0},
        "verbs": {"bin": 0, "avg_ratio": 0},
        "pronouns": {"bin": 0, "avg_ratio": 0},
    }

    _global = {
        "nouns": 0,
        "arguments": 0,
        "stems": 0,
        "content_words": {"ratio": 0, "std": 0},
        "anaphors": 0,
    }
    _other_global = {
        "nouns": {"bin": 0, "avg_ratio": 0},
        "adjectives": {"bin": 0, "avg_ratio": 0},
        "adverbs": {"bin": 0, "avg_ratio": 0},
        "verbs": {"bin": 0, "avg_ratio": 0},
        "pronouns": {"bin": 0, "avg_ratio": 0},
    }

    adjacent_total_pairs = 0
    global_total_pairs = 0
    content_word_overlap_adjacent = []
    content_word_overlap_global = []

    doc_length = len(sentences)
    max_sentences_reached = False
    for i in range(doc_length - 1):
        sentence1 = sentences[i]
        sentence2 = sentences[i + 1]
        adjacent_pairs = len(sentence1) * len(sentence2)
        # capture starting bin counts for per-pair calculation
        start_bins = {cat: _other_adjacent[cat]["bin"] for cat in _other_adjacent}

        content_word_pairs = 0
        content_word_overlaps = 0

        # Calculate overlaps for this adjacent pair
        for token1 in sentence1._.cohesion_tokens:
            for token2 in sentence2._.cohesion_tokens:
                calculate_overlaps(token1, token2, _adjacent, _other_adjacent)
                if token1.pos_ in content_tags and token2.pos_ in content_tags:
                    content_word_pairs += 1
                    if token1.lemma_ == token2.lemma_:
                        _adjacent["content_words"]["ratio"] += 1
                        content_word_overlaps += 1

        # Anaphor overlap
        for token in sentence2:
            if token.pos_ == "PRON":
                for np_ in sentence1._.np_chunks:
                    if gen_num(np_["morph"], token.morph):
                        _adjacent["anaphors"] += 1

        # update per-pair avg_ratio using local bin increments
        for cat in _other_adjacent:
            local_count = _other_adjacent[cat]["bin"] - start_bins[cat]
            if local_count > 0:
                _other_adjacent[cat]["avg_ratio"] += local_count / adjacent_pairs

        adjacent_total_pairs += 1
        content_word_overlap_adjacent.append(
            content_word_overlaps / content_word_pairs
            if content_word_pairs > 0 and content_word_overlaps > 0
            else 0
        )

        # only allow global metrics if the document is not too long
        if doc_length >= config.MAX_SENTENCES:
            max_sentences_reached = True
            continue

        # Calculate global metrics
        for j in range(i + 1, doc_length):
            sentence2_global = sentences[j]
            global_pairs = len(sentence1) * len(sentence2_global)

            start_bins_g = {cat: _other_global[cat]["bin"] for cat in _other_global}
            cw_pairs_g = 0
            cw_overlaps_g = 0

            for token1 in sentence1._.cohesion_tokens:
                for token2 in sentence2_global._.cohesion_tokens:
                    calculate_overlaps(token1, token2, _global, _other_global)
                    if token1.pos_ in content_tags and token2.pos_ in content_tags:
                        cw_pairs_g += 1
                        if token1.lemma_ == token2.lemma_:
                            _global["content_words"]["ratio"] += 1
                            cw_overlaps_g += 1

            # Anaphor overlap
            for token in sentence2_global:
                if token.pos_ == "PRON":
                    for np_ in sentence1._.np_chunks:
                        if gen_num(np_["morph"], token.morph):
                            _global["anaphors"] += 1

            # update per-pair avg_ratio using local bin increments
            for cat in _other_global:
                local_count_g = _other_global[cat]["bin"] - start_bins_g[cat]
                if local_count_g > 0:
                    _other_global[cat]["avg_ratio"] += local_count_g / global_pairs

            global_total_pairs += 1
            content_word_overlap_global.append(
                cw_overlaps_g / cw_pairs_g if cw_pairs_g > 0 and cw_overlaps_g > 0 else 0
            )
    # finalize adjacent metrics
    if adjacent_total_pairs > 0:
        _adjacent["content_words"]["ratio"] /= adjacent_total_pairs
        _adjacent["content_words"]["std"] = np.std(
            np.array(content_word_overlap_adjacent)
        )
        for cat in _other_adjacent:
            _other_adjacent[cat]["avg_ratio"] /= adjacent_total_pairs
        for cat in [k for k in _adjacent if k != "content_words"]:
            _adjacent[cat] /= adjacent_total_pairs
    _adjacent["other"] = _other_adjacent

    if max_sentences_reached:
        return {"adjacent": _adjacent, "global": None}

    # finalize global metrics
    if global_total_pairs > 0:
        _global["content_words"]["ratio"] /= global_total_pairs
        _global["content_words"]["std"] = np.std(
            np.array(content_word_overlap_global)
        )
        for cat in _other_global:
            _other_global[cat]["avg_ratio"] /= global_total_pairs
        for cat in [k for k in _global if k != "content_words"]:
            _global[cat] /= global_total_pairs
    _global["other"] = _other_global

    return {"adjacent": _adjacent, "global": _global}
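
# ---------------------------------------------------------------------------
# Minimal usage sketch: shows how calculate_overlaps updates its counter
# dictionaries for one pair of sentences. The two single-sentence Docs below
# are built by hand with explicit POS tags and lemmas, so no trained spaCy
# pipeline is needed; the example only assumes that the coh_metrix package
# imported at the top of this module is installed. Note that run_cohesion
# additionally relies on the custom extensions `cohesion_tokens` and
# `np_chunks`, which are registered elsewhere in the project and are not
# exercised here.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("sv")
    sent1 = Doc(
        nlp.vocab,
        words=["Hunden", "jagade", "katten"],
        pos=["NOUN", "VERB", "NOUN"],
        lemmas=["hund", "jaga", "katt"],
    )
    sent2 = Doc(
        nlp.vocab,
        words=["Katten", "sprang", "hem"],
        pos=["NOUN", "VERB", "ADV"],
        lemmas=["katt", "springa", "hem"],
    )

    main = {"nouns": 0, "arguments": 0, "stems": 0}
    other = {
        cat: {"bin": 0, "avg_ratio": 0}
        for cat in ("nouns", "adjectives", "adverbs", "verbs", "pronouns")
    }
    for token1 in sent1:
        for token2 in sent2:
            calculate_overlaps(token1, token2, main, other)

    # Expected: "katten"/"Katten" count as noun, argument and stem overlap,
    # and add one content-word hit to the "nouns" bin.
    print(main)
    print(other)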