Commit 30fcf06f authored by Love Arreborn

stashing all changes

parent 3628a5b0
@@ -3,7 +3,7 @@ StilLett API Service
 The original `SAPIS` (Fahlborg and Rennes, 2016) was a RESTful web service based on Java Spring, implementing an API with
 the ability to interpret options and input data as variables in an input JSON
-object, passed to the `SAPIS` service in a HTTP request . The serivce has since then been rewritten in Python3, with similar functionality.
+object, passed to the `SAPIS` service in an HTTP request. The service has since been rewritten in Python twice; the current version parses text with `spaCy` rather than the original `efselab`, for speed.
 The API service can be reached at https://sapis.it.liu.se/api, accepting POST requests.
...
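For context, this is roughly how the service described above would be called: a minimal sketch assuming the payload carries the input text plus an options object (the key names here are illustrative, not taken from this diff).

```python
# Minimal sketch of a SAPIS request. The endpoint comes from the README
# above; the payload keys ("text", "options") are assumptions.
import requests

payload = {
    "text": "Detta är en mening som ska analyseras.",  # input document
    "options": {},  # hypothetical per-request options
}
response = requests.post("https://sapis.it.liu.se/api", json=payload, timeout=30)
response.raise_for_status()
print(response.json())
```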
@@ -321,9 +321,9 @@ def split_measures_to_scream_headings(structural_instance) -> dict:
     "verbial_root_ratio",  # VerbalRoots OK?
     "avg_verbal_arity",  # AVA OK?
     "verb_arity_unigram_probs",  # UVA OK?
-    "calculate_avg_word_per_clause",  # TPC
-    "dep_unigram_probs",  # UnigramDep
-    "pos_unigram_probs",  # UnigramPOS
+    "calculate_avg_word_per_clause",  # TPC OK
+    "dep_unigram_probs",  # UnigramDep OK
+    "pos_unigram_probs",  # UnigramPOS OK
     "lexical_density",  # RatCont OK?
 ]
 surface_vars = ["avg_sentence_length", "avg_word_length", "avg_n_syllables"]  # ok
@@ -334,8 +334,8 @@ def split_measures_to_scream_headings(structural_instance) -> dict:
     "swevoc_h_ratio",  # ok?
 ]
 additional_structural = [
-    "n_verbal_roots",
-    "n_prep_comp",
+    "n_verbal_roots",  # ok
+    "n_prep_comp",  # ok
 ]
 additional_lexical = [  # Raw freqs
     "n_swevoc_s",  # ok
...
@@ -27,8 +27,45 @@ Below follows a brief explanation of all metrics calculated by `SCREAM2`. These
 ### Lexical metrics
-#### Additional surface metrics
+- `swevoc_total_ratio`: placeholder
+- `swevoc_c_ratio`: placeholder
+- `swevoc_d_ratio`: placeholder
+- `swevoc_h_ratio`: placeholder
+
+#### Additional lexical metrics
+- `n_swevoc_s`: placeholder
+- `n_swevoc_k`: placeholder
+- `n_swevoc_total`: placeholder
+- `n_swevoc_c`: placeholder
+- `n_swevoc_d`: placeholder
+- `n_swevoc_h`: placeholder
+- `swevoc_s_ratio`: placeholder
+- `swevoc_k_ratio`: placeholder
+- `swevoc_dict`: The dictionary containing all occurrences of SweVoc words in the provided document. All lexical metrics are then calculated from these values. Primarily useful for debugging; it can safely be commented out of a production build.
+
 ### Structural vars
-#### Additional structural vars
\ No newline at end of file
+- `avg_dep_distance_dependent`: placeholder
+- `avg_dep_distance_sentence`: placeholder
+- `avg_right_dep_ratio`: placeholder
+- `avg_nominal_premodifiers`: placeholder
+- `avg_nominal_postmodifiers`: placeholder
+- `avg_prep_comp`: placeholder
+- `avg_sentence_depth`: placeholder
+- `verbial_root_ratio`: placeholder
+- `avg_verb_arity`: placeholder
+- `verb_arities_unigram_probs`: placeholder (name change? not unigram now, I think)
+- `avg_words_per_clause`: placeholder
+- `dep_ud_probs`: placeholder
+- `pos_ud_probs`: placeholder
+- `lexical_density`: placeholder
+
+#### Additional structural vars
+- `n_verbs`: placeholder (?)
+- `total_verb_arity`: placeholder (?)
+- `total_dep_distance`: placeholder (?)
+- `n_verbial_roots`: placeholder
+- `verb_arities_dict`: placeholder (?)
+- `n_prep_comp`: placeholder
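The `swevoc_dict` entry above is easiest to read with a concrete shape in mind. Judging by the `n_swevoc_*` metrics, it holds one counter per SweVoc category plus a total, something like this invented example:

```python
# Invented numbers, purely illustrative; the keys follow the n_swevoc_*
# metrics listed above ("total" plus the SweVoc categories S, K, C, D, H).
swevoc_dict = {
    "total": 12,
    "S": 5,
    "K": 3,
    "C": 2,
    "D": 1,
    "H": 1,
}
```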
@@ -37,6 +37,34 @@ PAROLE_TO_SUC_CONVERTION = {
     "NCN": "NN",
 }

+# Simple table for conversion from SUC tags to UPOS tags
+SUC_TO_UPOS = {
+    "JJ": "ADJ",
+    "PP": "ADP",
+    "AB": "ADV",
+    "VB": "VERB",
+    "KN": "CCONJ",
+    "DT": "DET",
+    "IE": "PART",  # IE is the infinitive marker ("att"), not an interjection
+    "NN": "NOUN",
+    "RG": "NUM",
+    "PC": "VERB",  # PC is a participle (not a particle); VERB is the closest UPOS
+    "PN": "PRON",
+    "PM": "PROPN",
+    "MAD": "PUNCT",  # MAD is sentence-final punctuation, not a symbol
+    "SN": "SCONJ",
+    "XX": "X",
+    # Some tags are missing above; assumed mappings:
+    "HS": "PRON",  # Assuming HS (possessive relative pronoun, "vars") maps to PRON
+    "HP": "PRON",  # Assuming HP (interrogative/relative pronoun) maps to PRON
+    "PL": "ADP",  # PL is a verb particle (not a pronoun); UD Swedish tags these ADP
+    "HD": "DET",  # HD is an interrogative/relative determiner (not an adverb)
+    "IN": "INTJ",  # Assuming IN (interjection) maps to INTJ
+    "HA": "ADV",  # Assuming HA (interrogative/relative adverb) maps to ADV
+    "PS": "PRON",  # Assuming PS (possessive pronoun) maps to PRON
+    "RO": "NUM",  # Assuming RO (ordinal numeral) maps to NUM
+}

 def parse_swe_voc():
     """
@@ -58,7 +86,9 @@ def parse_swe_voc():
     with open(path, "r", encoding=ENCODING) as f:
         for line in f.readlines():
             split_line = line.strip().split("\t")
-            tag = PAROLE_TO_SUC_CONVERTION[split_line[1]]
+            tag = SUC_TO_UPOS[
+                PAROLE_TO_SUC_CONVERTION[split_line[1]]
+            ]  # two-step conversion: PAROLE -> SUC -> UPOS
             word = split_line[2]
             categories = [category.strip() for category in split_line[3].split(",")]
...
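In isolation, the two-step lookup added above does the following; both single-entry mappings in this sketch are excerpts of the tables shown in the diff.

```python
# PAROLE -> SUC -> UPOS, so that lexicon entries can later be matched
# against spaCy's token.pos_. Single-entry excerpts of the diff's tables:
PAROLE_TO_SUC_CONVERTION = {"NCN": "NN"}
SUC_TO_UPOS = {"NN": "NOUN"}

parole_tag = "NCN"
upos_tag = SUC_TO_UPOS[PAROLE_TO_SUC_CONVERTION[parole_tag]]
print(upos_tag)  # NOUN
```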
Source diff could not be displayed: it is too large.
+approved_data_model_types = {"float", "int", "list", "dict"}
+
 # Metrics
+QUOTATION_MARKS = {"'", '"', "“", "”", "‘", "’"}
 VOWELS = set("aeiouyåäöAEIOUYÅÄÖ")
 LIX_LIMIT = 6
 SPECIAL_CHARS = [",", ":", ".", "!", '"', "'"]
+rules = ["svo", "p2a", "prox", "qi", "split_a", "split_r", "split_k"]
-# Simple table for conversion from UPOS tags to SUC tags
-UPOS_TO_SUC = {
-    "ADJ": "JJ",  # adjective
-    "ADP": "PP",  # adposition
-    "ADV": "AB",  # adverb
-    "AUX": "VB",  # auxiliary
-    "CCONJ": "KN",  # coordinating conjunction
-    "DET": "DT",  # determiner
-    "INTJ": "IE",  # interjection
-    "NOUN": "NN",  # noun
-    "NUM": "RG",  # numeral
-    "PART": "PC",  # particle
-    "PRON": "PN",  # pronoun
-    "PROPN": "PM",  # proper noun
-    "PUNCT": "MAD",  # punctuation
-    "SCONJ": "SN",  # subordinating conjunction
-    "SYM": "MAD",  # symbol
-    "VERB": "VB",  # verb
-    "X": "XX",  # other
-}
-
-# Part-of-speech related settings
-POS_TAGS = [
-    "AB",
-    "CITE",
-    "DT",
-    "HA",
-    "HD",
-    "HP",
-    "HS",
-    "IE",
-    "IN",
-    "JJ",
-    "KN",
-    "MAD",
-    "MID",
-    "NN",
-    "PAD",
-    "PC",
-    "PL",
-    "PLQS",
-    "PM",
-    "PN",
-    "PP",
-    "PS",
-    "RG",
-    "RO",
-    "SN",
-    "UO",
-    "VB",
-]
-
-VERB_TAG = "VERB"
-CONTENT_WORD_TAGS = {"NOUN", "ADJ", "ADV"}
-nominal_components = {"NOUN", "ADP", "PART"}
-verb_components = {"PRON", "ADV", "VERB"}
-# punctuation_marks = {"MAD", "MID", "PAD"} Managed by token.is_punct
-# word_pos_tags = {tag for tag in POS_TAGS if tag not in punctuation_marks}
-
-# Dependency related settings
-dep_types = [
-    "++",
-    "+A",
-    "+F",
-    "AA",
-    "AG",
-    "AN",
-    "AT",
-    "CA",
-    "CJ",
-    "DB",
-    "DT",
-    "EF",
-    "EO",
-    "ES",
-    "ET",
-    "FO",
-    "FP",
-    "FS",
-    "FV",
-    "HD",
-    "I?",
-    "IC",
-    "IF",
-    "IG",
-    "IK",
-    "IO",
-    "IP",
-    "IQ",
-    "IR",
-    "IS",
-    "IT",
-    "IU",
-    "IV",
-    "JC",
-    "JG",
-    "JR",
-    "JT",
-    "KA",
-    "MA",
-    "MS",
-    "NA",
-    "OA",
-    "OO",
-    "OP",
-    "PA",
-    "PL",
-    "PT",
-    "RA",
-    "ROOT",
-    "SP",
-    "SS",
-    "TA",
-    "UA",
-    "VA",
-    "VG",
-    "VO",
-    "VS",
-    "XA",
-    "XF",
-    "XT",
-    "XX",
-    "YY",
-]
-DEP_REL_TYPES = [
-    "nsubj",  # Subject
-    "obj",  # Object
-    "iobj",  # Indirect Object
-    "obl",  # Oblique nominal
-    "amod",  # Nominal (adjectival) pre-modifier
-    "det",  # Determiner
-    "nmod",  # Nominal modifier
-    "cc",  # Coordinating conjunction
-    "conj",  # Conjunct (in coordinate structure)
-    "advmod",  # Adverbial modifier
-    "appos",  # Apposition
-    "case",  # Preposition
-    "ccomp",  # Clausal complement
-    "obj",  # Logical object (same as obj?)
-    "nsubj:pass",  # Passive subject
-    "parataxis",  # Parataxis
-    "punct",  # Punctuation
-    "root",  # Root
-    "mark",  # Marker
-    "advcl",  # Adverbial clause modifier
-    "cop",  # Copula
-    "csubj",  # Clausal subject
-    "expl",  # Expletive (Logical subject?)
-    "list",  # List
-    "compound",  # Compound
-    "dislocated",  # Dislocated
-    "vocative",  # Vocative
-    "aux",  # Auxiliary
-    "discourse",  # Discourse
-    "punct",  # Punctuation
-    "reparandum",  # Overridden disfluency
-    "flat",  # Flat multiword expression
-    "cop",  # Copula
-    "xcomp",  # Open clausal complement
-    "iobj",  # Indirect object
-    "fixed",  # Fixed multiword expression
-    "orphan",  # Orphan
-    "dep",  # Unspecified dependency
-    "advmod",  # Conjunctional adverbial
-    "cc",  # Coordination at main clause level
-    "advmod",  # Other adverbial
-    "agent",  # Agent
-    "advmod",  # Contrastive adverbial
-    "dep",  # Doubled function (unspecified)
-    "acl:relcl",  # Relative clause in cleft
-    "obj",  # Logical object
-    "obj",  # Dummy object
-    "xcomp",  # Free subjective predicative complement
-    "expl",  # Dummy subject
-    "root",  # Finite predicate verb
-    "punct",  # Question mark
-    "punct",  # Quotation mark
-    "intj",  # Interjection phrase
-    "root",  # Head
-    "xcomp",  # Infinitive verb phrase minus infinitive marker
-    "acl",  # Subordinate clause minus subordinating conjunction
-    "compound:prt",  # Verb group
-]
+# Counters for UPOS tag occurrences, used for unigram probabilities
+UPOS_POS_TYPES = {
+    "ADJ": 0,  # Adjective
+    "ADP": 0,  # Adposition
+    "ADV": 0,  # Adverb
+    "AUX": 0,  # Auxiliary verb
+    "CCONJ": 0,  # Coordinating conjunction
+    "DET": 0,  # Determiner
+    "INTJ": 0,  # Interjection
+    "NOUN": 0,  # Noun
+    "NUM": 0,  # Numeral
+    "PART": 0,  # Particle
+    "PRON": 0,  # Pronoun
+    "PROPN": 0,  # Proper noun
+    "PUNCT": 0,  # Punctuation
+    "SCONJ": 0,  # Subordinating conjunction
+    "SYM": 0,  # Symbol
+    "VERB": 0,  # Verb
+    "X": 0,  # Other
+}
+
+# Counters for UD dependency relation occurrences
+DEP_UD_TYPES = {
+    "nsubj": 0,
+    "obj": 0,
+    "iobj": 0,
+    "obl": 0,
+    "amod": 0,
+    "det": 0,
+    "nmod": 0,
+    "cc": 0,
+    "conj": 0,
+    "advmod": 0,
+    "appos": 0,
+    "case": 0,
+    "ccomp": 0,
+    "nsubj:pass": 0,
+    "parataxis": 0,
+    "punct": 0,
+    "root": 0,
+    "mark": 0,
+    "advcl": 0,
+    "cop": 0,
+    "csubj": 0,
+    "expl": 0,
+    "list": 0,
+    "compound": 0,
+    "dislocated": 0,
+    "vocative": 0,
+    "aux": 0,
+    "discourse": 0,
+    "reparandum": 0,
+    "flat": 0,
+    "xcomp": 0,
+    "fixed": 0,
+    "orphan": 0,
+    "dep": 0,
+    "agent": 0,
+    "acl:relcl": 0,
+    "intj": 0,
+    "acl": 0,
+    "compound:prt": 0,
+}
 SUBCLAUSE_DEP = "acl"  # UA
 POSTMODIFIER_DEP = "conj"  # ET
@@ -198,5 +71,5 @@ PREMODIFIER_DEP = "amod"  # AT
 PRECOMP_DEP = "obl"  # PA
 ROOT = "ROOT"
-# Verb arity related settings
-verb_arities = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+VERB_TAG = "VERB"
+CONTENT_WORD_TAGS = {"NOUN", "ADJ", "ADV"}
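One thing worth flagging about the new counter dictionaries: they live at module level in the config, so counts carry over from one document to the next unless they are zeroed between runs. A reset helper along these lines (not present in the diff, purely a sketch) would keep the later probabilities per-document:

```python
# Hypothetical helper, not in this diff: zero the module-level counters
# before each document so the derived probabilities stay per-document.
def reset_type_counts() -> None:
    for key in UPOS_POS_TYPES:
        UPOS_POS_TYPES[key] = 0
    for key in DEP_UD_TYPES:
        DEP_UD_TYPES[key] = 0
```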
@@ -8,10 +8,8 @@ in this code. Variable declaration as well as calculations are all separated
 into their own sections for readability and maintainability.
 """
-import spacy
-import spacy.attrs
 from spacy.language import Language
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
 from scream2.SweVoc.swevoc import load_swe_voc
@@ -49,7 +47,20 @@ def scream_metrics(doc: Doc) -> Doc:
     n_unique_tokens = 0
     n_unique_words = 0
-    n_punctuations = 0
+
+    def add_probs(token: Token) -> None:
+        """
+        Helper that tallies the token's UPOS tag and dependency label in the
+        UPOS_POS_TYPES and DEP_UD_TYPES dictionaries; the raw counts are
+        normalised into probabilities further down.
+        :param token: The token to count.
+        """
+        try:
+            config.UPOS_POS_TYPES[token.pos_] += 1
+        except KeyError:
+            pass
+        try:
+            config.DEP_UD_TYPES[token.dep_] += 1
+        except KeyError:
+            pass

     # -- Lexical metrics --
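Note that despite its name, `add_probs` only tallies raw counts; the division into probabilities happens further down in this diff, where each count is divided by `n_words`. The pattern, with a stand-in for the project's `calculate_safe_division` helper (its exact definition is an assumption):

```python
# Count first, normalise afterwards. calculate_safe_division is assumed
# to return 0.0 when the denominator is 0 (e.g. an empty document).
def calculate_safe_division(numerator: float, denominator: float) -> float:
    return numerator / denominator if denominator else 0.0

counts = {"NOUN": 3, "VERB": 2, "ADJ": 0}
n_words = 5
probs = {pos: calculate_safe_division(count, n_words) for pos, count in counts.items()}
print(probs)  # {'NOUN': 0.6, 'VERB': 0.4, 'ADJ': 0.0}
```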
@@ -84,7 +95,7 @@ def scream_metrics(doc: Doc) -> Doc:
     n_content_words = 0
     n_pos_tags = 0
     n_verbs = 0
-    n_verbial_root = 0
+    n_verbal_roots = 0
     n_syllables = 0
     n_sub_clauses = 0
@@ -95,10 +106,13 @@ def scream_metrics(doc: Doc) -> Doc:
     n_prep_comp = 0
     n_right_dependencies = 0
     total_dep_distance = 0
-    # n_dependencies = 0 NOTE: Seems to equal the amount of tokens in the document
+    # n_dependencies = 0 NOTE: Seems to equal the amount of tokens in the document -- minus the root
     # n_dep_tags = 0 NOTE: Seems to equal the amount of tokens in the document
     sentence_depth = 0
+    dep_ud_probs = {}
+    pos_ud_probs = {}
+
     def get_sentence_depth(token) -> float:
         """
         Helper function to calculate the depth of a sentence.
@@ -126,7 +140,14 @@ def scream_metrics(doc: Doc) -> Doc:
             n_unique_tokens += 1
             unique_tokens.add(token)
-        if token.is_alpha:
+        add_probs(token)
+
+        if token.dep_ != config.ROOT:
+            dep_distance = token.head.i - token.i
+        else:
+            dep_distance = 0  # the root is its own head, so the distance is zero
+
+        if not token.is_punct:
             n_words += 1
             total_word_length += len(token.text)
             n_syllables += sum(1 for char in token.text if char in config.VOWELS)
@@ -138,6 +159,7 @@ def scream_metrics(doc: Doc) -> Doc:
             swevoc_entry = lookup(token.text.lower(), token.pos_)
             if swevoc_entry:
                 swevoc_dict["total"] += 1
+
                 for swevoc_category in swevoc_entry:
                     swevoc_dict[swevoc_category] += 1
@@ -149,8 +171,6 @@ def scream_metrics(doc: Doc) -> Doc:
             n_content_words += token.pos_ in config.CONTENT_WORD_TAGS
             n_verbs += token.pos_ == config.VERB_TAG
-            dep_distance = token.head.i - token.i
-
             match token.dep_:
                 case config.SUBCLAUSE_DEP:
                     n_sub_clauses += 1
@@ -169,18 +189,24 @@ def scream_metrics(doc: Doc) -> Doc:
                 case config.ROOT:
                     sentence_depth = get_sentence_depth(token)
                     if token.pos_ == config.VERB_TAG:
-                        n_verbial_root += 1
+                        n_verbal_roots += 1
             if dep_distance > 0:
                 n_right_dependencies += 1
             total_dep_distance += abs(dep_distance)
+
+    for key in config.UPOS_POS_TYPES:
+        pos_ud_probs[key] = calculate_safe_division(config.UPOS_POS_TYPES[key], n_words)
+
+    for key in config.DEP_UD_TYPES:
+        dep_ud_probs[key] = calculate_safe_division(config.DEP_UD_TYPES[key], n_words)
     # ----- Sentence metrics -----
     n_sentences = 0
-    sentence_lengths = 0  # don't count punctuations
-    sentence_length_list_np = np.zeros(len(doc_sents))  # preallocate array
+    sentence_lengths = 0
+    sentence_length_list_np = np.zeros(len(doc_sents))  # preallocate np array
     verb_arities = []
     verb_arities_dict = {
@@ -266,6 +292,7 @@ def scream_metrics(doc: Doc) -> Doc:
         "avg_sentence_length": calculate_safe_division(sentence_lengths, n_sentences),
         "avg_word_length": calculate_safe_division(total_word_length, n_words),
         "avg_n_syllables": calculate_safe_division(n_syllables, n_words),
+        "n_words": n_words,
     }
     additional_surface_metrics = {
@@ -302,6 +329,7 @@ def scream_metrics(doc: Doc) -> Doc:
         "n_swevoc_h": swevoc_dict["H"],
         "swevoc_s_ratio": calculate_safe_division(swevoc_dict["S"], n_words),
         "swevoc_k_ratio": calculate_safe_division(swevoc_dict["K"], n_words),
+        "swevoc_dict": swevoc_dict,
     }
     # ===== Structural vars =====
@@ -320,11 +348,16 @@ def scream_metrics(doc: Doc) -> Doc:
         "avg_nominal_postmodifiers": calculate_safe_division(
             n_nominal_postmodifiers, n_sentences
         ),
-        "avg_nominal_prep_comp": calculate_safe_division(n_prep_comp, n_sentences),
+        "avg_prep_comp": calculate_safe_division(n_prep_comp, n_sentences),
         "avg_sentence_depth": calculate_safe_division(sentence_depth, n_sentences),
-        "verbial_root_ratio": calculate_safe_division(n_verbial_root, n_verbs),
+        "verbial_root_ratio": calculate_safe_division(n_verbal_roots, n_verbs),
         "avg_verb_arity": calculate_safe_division(total_verb_arity, n_verbs),
         "verb_arities_unigram_probs": verb_arity_unigram_probs,
+        "avg_words_per_clause": calculate_safe_division(
+            n_words, n_sentences + n_sub_clauses
+        ),
+        "dep_ud_probs": dep_ud_probs,
+        "pos_ud_probs": pos_ud_probs,
         "lexical_density": calculate_safe_division(n_content_words, n_words),
     }
@@ -332,14 +365,11 @@ def scream_metrics(doc: Doc) -> Doc:
         "n_verbs": n_verbs,
         "total_verb_arity": total_verb_arity,
         "total_dep_distance": total_dep_distance,
-        "n_verbial_root": n_verbial_root,
+        "n_verbal_roots": n_verbal_roots,
         "verb_arities_dict": verb_arities_dict,
+        "n_prep_comp": n_prep_comp,
     }

-    pos_counts = doc.count_by(spacy.attrs.POS)
-    dep_counts = sum(values for _, values in doc.count_by(spacy.attrs.DEP).items())
-
     # ===== Add metrics to doc object =====
     additional_metrics = {
...
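Since `scream_metrics` takes a `Doc` and returns a `Doc`, it is shaped like a spaCy pipeline component. For reference, this is how such a component is typically registered and run in spaCy 3; the component name and the Swedish model used here are illustrative, not confirmed by this diff.

```python
# Standard spaCy 3 component registration; names are illustrative.
import spacy
from spacy.language import Language
from spacy.tokens import Doc

@Language.component("scream_metrics_demo")
def scream_metrics_demo(doc: Doc) -> Doc:
    # The real component computes the SCREAM2 metrics shown in this diff.
    return doc

nlp = spacy.load("sv_core_news_sm")  # assumed Swedish pipeline
nlp.add_pipe("scream_metrics_demo", last=True)
doc = nlp("Detta är en mening.")
```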