Commit 7527c542 authored by Daniel Holmer

answers to some questions

parent a6112c35
@@ -61,6 +61,8 @@ def run_lsa(sentences: list, parsed: list) -> tuple[float, float]:
# Taken from coh_metrix_kvtill/lsa_all_sentences.py, converted to a function from a class since we're not reusing the data and only computing once
# Not sure if this represents the global_lsa from the original SAPIS, hard to verify as the API-call gives np.nan for both metrics
# Check with Daniel
# D: The computations for "LSA all" were too expensive for the live server - long texts froze the whole computation.
# Check whether it is runnable on long texts. The algorithm looks correct, so we can use it (if it can run without freezing the whole pipeline on long texts).
def lsa_all_sentences(processed_sentences: list) -> tuple[float, float]:
"""
Computes the LSASSp and LSASSpd scores for the given text.
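A minimal sketch of what the all-sentences variant typically computes (mean and standard deviation of cosine similarity over every sentence pair), assuming sentence embeddings have already been produced; the function name and shapes below are illustrative, not the project's actual implementation. It also shows why long texts get expensive: the number of pairs grows quadratically with the number of sentences.

import numpy as np

def all_sentence_pair_similarity(embeddings: np.ndarray) -> tuple[float, float]:
    # Hypothetical helper: mean/std of cosine similarity over all sentence pairs.
    # With n sentences there are n * (n - 1) / 2 pairs, hence the cost on long texts.
    if len(embeddings) < 2:
        return float("nan"), float("nan")
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = normed @ normed.T                              # full cosine-similarity matrix
    upper = sims[np.triu_indices(len(embeddings), k=1)]   # each pair counted once
    return float(upper.mean()), float(upper.std())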
@@ -93,6 +95,8 @@ def lsa_all_sentences(processed_sentences: list) -> tuple[float, float]:
# "avg": 0.6881772577762604, "std": 0.5822818204760551 (this function)
# Differs slightly from the original code as well, unsure if this is due to the new implementation from KvTill23 or if there might be some issues here
# Check with Daniel if this seems right
# D: This is expected. The models used for the calculations are completely different, although they serve the same purpose. The LSA model is trained
# only on SUC, while the sentence transformer model is trained on vastly more data. LSA is ancient and should be replaced.
def lsa_adjacent_sentences(processed_sentences: list) -> tuple[float, float]:
"""
Compute the average and standard deviation cosine similarity
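For contrast with the performance note above: the adjacent-sentences variant only compares consecutive sentences, so it scales linearly in the number of sentences. A minimal sketch under the same assumptions (illustrative names, embeddings precomputed):

import numpy as np

def adjacent_sentence_similarity(embeddings: np.ndarray) -> tuple[float, float]:
    # Hypothetical helper: mean/std of cosine similarity between consecutive sentences.
    if len(embeddings) < 2:
        return float("nan"), float("nan")
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = np.sum(normed[:-1] * normed[1:], axis=1)  # cosine of sentence i with sentence i+1
    return float(sims.mean()), float(sims.std())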
@@ -125,13 +129,15 @@ def lsa_adjacent_sentences(processed_sentences: list) -> tuple[float, float]:
# 0.4509948415622697 (this function) vs 0.4802052785923754 (API)
# Looks very similar to the original givenness-code as well
# Check with Daniel though
# D: Try changing the pronoun-tag. It should have been converted to SUC-tags by then?
# D: I think we should use the Taaco-version, yes
def lsa_compute_givenness(parsed: list) -> tuple[float, float]:
"""Calculate the global givenness average for the entire text."""
current_lemmas = set()
total_lemmas, repeated_lemmas, pronouns = 0, 0, 0
content_tags = ["NN", "VB", "PN", "NOUN", "VERB", "PRON"]  # D: Either the tagset is ["NN", "VB", "PN"] or ["NOUN", "VERB", "PRON"]? Probably the first, since that is the tagset used by SCREAM
punctuation_marks = ["MAD", "MID", "PAD", "PUNCT"]  # D: ["MAD", "MID", "PAD"] are SUC tags, ["PUNCT"] is UD. SCREAM uses the first set (SUC).
swedish_pronouns = "han hon hans hennes de dem deras mig dig vi ni dess du jag den det vi ni".split(
" "
)
@@ -148,7 +154,7 @@ def lsa_compute_givenness(parsed: list) -> tuple[float, float]:
repeated_lemmas += 1
# if the word is a pronoun and exists in the defined list of swedish pronouns
if lemma in swedish_pronouns and upos == "PRON":  # D: Will this tag ever be used? Should probably be PN instead
pronouns += 1
# if word is not a punctuation mark
...
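One way to settle the SUC-versus-UD questions in the comments above is to normalize tags to a single tagset before counting. The mapping below is an assumption that only covers the tags discussed here (targeting SUC, as used by SCREAM); it is not the project's actual tag handling.

# Hypothetical sketch: map the UD tags mentioned above to their SUC counterparts
# so the givenness counts work with one tagset regardless of what the parser emits.
UD_TO_SUC = {"NOUN": "NN", "VERB": "VB", "PRON": "PN", "PUNCT": "MAD"}  # MAD is a placeholder; UD PUNCT has no single SUC equivalent

def normalize_tag(tag: str) -> str:
    """Return the SUC tag for a known UD tag, otherwise the tag unchanged."""
    return UD_TO_SUC.get(tag, tag)

content_tags = {"NN", "VB", "PN"}          # SUC content tags, per the comment above
punctuation_marks = {"MAD", "MID", "PAD"}  # SUC punctuation tags

# The pronoun check then stays in a single tagset:
# if lemma in swedish_pronouns and normalize_tag(upos) == "PN":
#     pronouns += 1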