def calculate_overlaps(token1: Token, token2: Token, main: dict, other: dict):
    """
    Compare one token pair and bump the matching overlap counters.
    Shared by both the adjacent and the global cohesion passes.

    :param token1: the first token.
    :param token2: the second token.
    :param main: the main dictionary to update.
    :param other: the other dictionary to update.
    """
    pos1, pos2 = token1.pos_, token2.pos_
    same_lemma = token1.lemma_ == token2.lemma_

    # Noun overlap: both tokens share the same nominal POS and the same
    # surface form (case-insensitive).
    if (
        pos1 == pos2
        and pos1 in ("NOUN", "PROPN")
        and token1.text.lower() == token2.text.lower()
    ):
        main["nouns"] += 1

    # Argument overlap: matching pronouns (exact text) or nouns sharing a lemma.
    pronoun_match = pos1 == "PRON" and pos2 == "PRON" and token1.text == token2.text
    noun_match = pos1 == "NOUN" and pos2 == "NOUN" and same_lemma
    if pronoun_match or noun_match:
        main["arguments"] += 1

    # Stem overlap: a content word whose lemma reappears as a noun in the
    # later sentence.
    if same_lemma and pos2 == "NOUN" and pos1 in ("NOUN", "VERB", "ADJ", "ADV"):
        main["stems"] += 1

    # Content word overlap: same lemma and same POS feeds the per-POS bins.
    if same_lemma and pos1 == pos2:
        bin_by_pos = {
            "PRON": "pronouns",
            "NOUN": "nouns",
            "ADV": "adverbs",
            "ADJ": "adjectives",
            "VERB": "verbs",
        }
        category = bin_by_pos.get(pos1)
        if category is not None:
            other[category]["bin"] += 1
def gen_num(np_morph, pron_morph):
    """
    Check gender and number agreement between a noun phrase and a pronoun.

    :param np_morph: the morphological features of the noun phrase.
    :param pron_morph: the morphological features of the pronoun.
    :return: True if the noun phrase and pronoun agree; False when they
        disagree or when either Gender/Number feature is missing.
    """
    values = []
    for morph in (np_morph, pron_morph):
        for feature in ("Gender", "Number"):
            found = list(morph.get(feature))
            if not found:
                # Missing feature: no agreement can be established.
                return False
            values.append(found[0])

    np_gender, np_number, pron_gender, pron_number = values
    return np_gender == pron_gender and np_number == pron_number
def _compare_sentences(sentence1, sentence2, totals: dict, other: dict) -> float:
    """
    Accumulate every overlap metric for one ordered sentence pair.

    Updates *totals* (noun/argument/stem/content-word/anaphor counters) and
    *other* (per-POS lemma-overlap bins plus running average ratios) in place.

    :param sentence1: the earlier sentence (a spaCy Span).
    :param sentence2: the later sentence (a spaCy Span).
    :param totals: the main counter dictionary to update.
    :param other: the per-POS bin dictionary to update.
    :return: this pair's content-word overlap ratio (0 when the pair has no
        content-word token pairs).
    """
    content_tags = {"NOUN", "VERB", "ADJ", "ADV"}
    pair_count = len(sentence1) * len(sentence2)

    # calculate_overlaps updates the bins cumulatively, so snapshot them here
    # to recover this pair's local increment afterwards.
    start_bins = {cat: other[cat]["bin"] for cat in other}

    content_word_pairs = 0
    content_word_overlaps = 0

    for token1 in sentence1._.cohesion_tokens:
        for token2 in sentence2._.cohesion_tokens:
            calculate_overlaps(token1, token2, totals, other)
            if token1.pos_ in content_tags and token2.pos_ in content_tags:
                content_word_pairs += 1
                if token1.lemma_ == token2.lemma_:
                    totals["content_words"]["ratio"] += 1
                    content_word_overlaps += 1

    # Anaphor overlap: a pronoun in the later sentence agreeing in gender and
    # number with a noun phrase of the earlier sentence.
    for token in sentence2:
        if token.pos_ == "PRON":
            for np_ in sentence1._.np_chunks:
                if gen_num(np_["morph"], token.morph):
                    totals["anaphors"] += 1

    # Fold this pair's bin increments into the running average ratios.
    for cat in other:
        local_count = other[cat]["bin"] - start_bins[cat]
        if local_count > 0:
            other[cat]["avg_ratio"] += local_count / pair_count

    return content_word_overlaps / content_word_pairs if content_word_pairs else 0


def _finalize(totals: dict, other: dict, total_pairs: int, overlap_ratios: list) -> None:
    """
    Turn the accumulated counts into per-pair averages, in place.

    Leaves the zero-initialized values untouched when no sentence pair was
    processed (e.g. a document with fewer than two sentences), avoiding a
    division by zero.
    """
    if total_pairs > 0:
        totals["content_words"]["ratio"] /= total_pairs
        totals["content_words"]["std"] = np.std(np.array(overlap_ratios))
        for cat in other:
            other[cat]["avg_ratio"] /= total_pairs
        for cat in [k for k in totals if k != "content_words"]:
            totals[cat] /= total_pairs
    totals["other"] = other


def _empty_counters() -> tuple:
    """Return fresh (totals, other) counter dictionaries for one pass."""
    totals = {
        "nouns": 0,
        "arguments": 0,
        "stems": 0,
        "content_words": {"ratio": 0, "std": 0},
        "anaphors": 0,
    }
    other = {
        "nouns": {"bin": 0, "avg_ratio": 0},
        "adjectives": {"bin": 0, "avg_ratio": 0},
        "adverbs": {"bin": 0, "avg_ratio": 0},
        "verbs": {"bin": 0, "avg_ratio": 0},
        "pronouns": {"bin": 0, "avg_ratio": 0},
    }
    return totals, other


def run_cohesion(doc: Doc) -> dict:
    """
    Run the cohesion metrics. Adjacent cohesion is always calculated, whereas
    global cohesion is only calculated if the number of sentences is below the
    threshold MAX_SENTENCES, as defined in coh_metrix/configuration.py.

    :param doc: the spaCy doc object containing the parsed text.
    :return: the calculated cohesion metrics; "global" is None when the
        document is too long for global metrics.
    """
    sentences = list(doc.sents)
    doc_length = len(sentences)

    _adjacent, _other_adjacent = _empty_counters()
    _global, _other_global = _empty_counters()

    # Hoisted loop invariant: the document length is fixed, so whether global
    # metrics get computed never changes between iterations.
    compute_global = doc_length < config.MAX_SENTENCES

    adjacent_total_pairs = 0
    global_total_pairs = 0
    content_word_overlap_adjacent = []
    content_word_overlap_global = []

    for i in range(doc_length - 1):
        sentence1 = sentences[i]

        content_word_overlap_adjacent.append(
            _compare_sentences(sentence1, sentences[i + 1], _adjacent, _other_adjacent)
        )
        adjacent_total_pairs += 1

        if not compute_global:
            continue

        # Global cohesion compares sentence i against every later sentence
        # (the adjacent pair i/i+1 is deliberately counted here as well).
        for j in range(i + 1, doc_length):
            content_word_overlap_global.append(
                _compare_sentences(sentence1, sentences[j], _global, _other_global)
            )
            global_total_pairs += 1

    _finalize(_adjacent, _other_adjacent, adjacent_total_pairs, content_word_overlap_adjacent)

    # Preserve the previous flag semantics: the "too long" early return only
    # triggers when at least one sentence pair was actually processed.
    if not compute_global and doc_length > 1:
        return {"adjacent": _adjacent, "global": None}

    _finalize(_global, _other_global, global_total_pairs, content_word_overlap_global)
    return {"adjacent": _adjacent, "global": _global}