Commit 7527c542 authored by Daniel Holmer

answers to some questions

parent a6112c35
@@ -61,6 +61,8 @@ def run_lsa(sentences: list, parsed: list) -> tuple[float, float]:
# Taken from coh_metrix_kvtill/lsa_all_sentences.py, converted to a function from a class since we're not reusing the data and only computing once
# Not sure if this represents the global_lsa from the original SAPIS, hard to verify as the API-call gives np.nan for both metrics
# Check with Daniel
# D: The computations for "LSA all" were too expensive for the live server - long texts froze the whole computation.
# Check whether it is runnable on long texts. The algorithm looks correct, so we can use it (if it can run without freezing the whole pipeline on long texts).
def lsa_all_sentences(processed_sentences: list) -> tuple[float, float]:
"""
Computes the LSASSp and LSASSpd scores for the given text.
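A minimal sketch of what the all-sentences variant typically computes (mean and standard deviation of cosine similarity over every sentence pair), assuming sentence embeddings have already been produced; the function name and shapes below are illustrative, not the project's actual implementation. It also shows why long texts get expensive: the number of pairs grows quadratically with the number of sentences.

import numpy as np

def all_sentence_pair_similarity(embeddings: np.ndarray) -> tuple[float, float]:
    # Hypothetical helper: mean/std of cosine similarity over all sentence pairs.
    # With n sentences there are n * (n - 1) / 2 pairs, hence the cost on long texts.
    if len(embeddings) < 2:
        return float("nan"), float("nan")
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = normed @ normed.T                              # full cosine-similarity matrix
    upper = sims[np.triu_indices(len(embeddings), k=1)]   # each pair counted once
    return float(upper.mean()), float(upper.std())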
@@ -93,6 +95,8 @@ def lsa_all_sentences(processed_sentences: list) -> tuple[float, float]:
# "avg": 0.6881772577762604, "std": 0.5822818204760551 (this function)
# Differs slightly from the original code as well, unsure if this is due to the new implementation from KvTill23 or if there might be some issues here
# Check with Daniel if this seems right
# D: This is expected. The models used for the calculations are completely different, although they serve the same purpose. The LSA model is trained
# only on SUC, while the sentence transformer model is trained on vastly more data. LSA is ancient and should be replaced.
def lsa_adjacent_sentences(processed_sentences: list) -> tuple[float, float]:
"""
Compute the average and standard deviation cosine similarity
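For contrast with the performance note above: the adjacent-sentences variant only compares consecutive sentences, so it scales linearly in the number of sentences. A minimal sketch under the same assumptions (illustrative names, embeddings precomputed):

import numpy as np

def adjacent_sentence_similarity(embeddings: np.ndarray) -> tuple[float, float]:
    # Hypothetical helper: mean/std of cosine similarity between consecutive sentences.
    if len(embeddings) < 2:
        return float("nan"), float("nan")
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = np.sum(normed[:-1] * normed[1:], axis=1)  # cosine of sentence i with sentence i+1
    return float(sims.mean()), float(sims.std())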
@@ -125,13 +129,15 @@ def lsa_adjacent_sentences(processed_sentences: list) -> tuple[float, float]:
# 0.4509948415622697 (this function) vs 0.4802052785923754 (API)
# Looks very similar to the original givenness-code as well
# Check with Daniel though
# D: Try changing the pronoun-tag. It should have been converted to SUC-tags by then?
# D: I think we should use the Taaco-version, yes
def lsa_compute_givenness(parsed: list) -> tuple[float, float]:
"""Calculate the global givenness average for the entire text."""
current_lemmas = set()
total_lemmas, repeated_lemmas, pronouns = 0, 0, 0
content_tags = ["NN", "VB", "PN", "NOUN", "VERB", "PRON"]  # D: Either the tagset is ["NN", "VB", "PN"] or ["NOUN", "VERB", "PRON"]? Probably the first, since that is the tagset used by SCREAM
punctuation_marks = ["MAD", "MID", "PAD", "PUNCT"]  # D: ["MAD", "MID", "PAD"] are SUC tags, ["PUNCT"] is UD. SCREAM uses the first set (SUC).
swedish_pronouns = "han hon hans hennes de dem deras mig dig vi ni dess du jag den det vi ni".split(
" "
)
@@ -148,7 +154,7 @@ def lsa_compute_givenness(parsed: list) -> tuple[float, float]:
repeated_lemmas += 1
# if the word is a pronoun and exists in the defined list of swedish pronouns
if lemma in swedish_pronouns and upos == "PRON":  # D: Will this tag ever be used? Should probably be PN instead
pronouns += 1
# if word is not a punctuation mark
...
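One way to settle the SUC-versus-UD questions in the comments above is to normalize tags to a single tagset before counting. The mapping below is an assumption that only covers the tags discussed here (targeting SUC, as used by SCREAM); it is not the project's actual tag handling.

# Hypothetical sketch: map the UD tags mentioned above to their SUC counterparts
# so the givenness counts work with one tagset regardless of what the parser emits.
UD_TO_SUC = {"NOUN": "NN", "VERB": "VB", "PRON": "PN", "PUNCT": "MAD"}  # MAD is a placeholder; UD PUNCT has no single SUC equivalent

def normalize_tag(tag: str) -> str:
    """Return the SUC tag for a known UD tag, otherwise the tag unchanged."""
    return UD_TO_SUC.get(tag, tag)

content_tags = {"NN", "VB", "PN"}          # SUC content tags, per the comment above
punctuation_marks = {"MAD", "MID", "PAD"}  # SUC punctuation tags

# The pronoun check then stays in a single tagset:
# if lemma in swedish_pronouns and normalize_tag(upos) == "PN":
#     pronouns += 1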