# Taken from coh_metrix_kvtill/lsa_all_sentences.py; converted from a class to a function since we're not reusing the data and only computing once.
# Not sure if this matches global_lsa from the original SAPIS; hard to verify, as the API call returns np.nan for both metrics.
# Check with Daniel.
# D: The LSA-all computations were too expensive for the live server — long texts froze the whole computation.
# Check whether it is runnable on long texts. The algorithm looks correct, so we can use it if it can run without freezing the whole pipeline on long texts.
# POS tags counted as content words.
# D: Is the intended tagset ["NN", "VB", "PN"] (SUC) or ["NOUN", "VERB", "PRON"] (UD)?
# Probably the first, since that is the tagset used by SCREAM; both sets are kept for now.
content_tags = ["NN", "VB", "PN", "NOUN", "VERB", "PRON"]
# POS tags treated as punctuation.
# NOTE(review): "MAD", "MID", "PAD" are SUC tags; "PUNCT" is UD. The original
# inline note claimed SCREAM uses "the first set (UD)", which is
# self-contradictory (the first set is SUC) — confirm which tagset SCREAM uses.
# The duplicate assignment of this constant has been consolidated into one line.
punctuation_marks = ["MAD", "MID", "PAD", "PUNCT"]
swedish_pronouns="han hon hans hennes de dem deras mig dig vi ni dess du jag den det vi ni".split(
swedish_pronouns="han hon hans hennes de dem deras mig dig vi ni dess du jag den det vi ni".split(