Commit 7fe740cf authored by Love Arreborn

SCREAM operating with Stanza-output

parent 1a530f5a
Pipeline #131688 skipped
@@ -19,6 +19,8 @@ Aaron Smith <aaron.smith@lingfil.uu.se>
import pprint
import time
import stanza
from math import isfinite

# Import Scream
from scream.metrics import LexicalMetrics
@@ -47,14 +49,51 @@ from scream.document_parts import DependencyParsedDocument
MAX_TOKEN = 256

nlp = stanza.Pipeline(
    lang="sv",
    processors="tokenize,pos,lemma,depparse,ner",
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)
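Reviewer sketch (not part of the commit): process_file below walks the Stanza document through its documented Word attributes, roughly like this.

doc = nlp("I skolan äter jag ett rött äpple.")
for sentence in doc.sentences:
    for word in sentence.words:
        # id, text, lemma, upos, head and deprel are standard Stanza Word attributes
        print(word.id, word.text, word.lemma, word.upos, word.head, word.deprel)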
# Simple table for conversion from Stanza's UPOS tags to SUC tags
upos_to_suc = {
    "NOUN": "NN",
    "VERB": "VB",
    "ADJ": "JJ",
    "ADV": "AB",
    "PRON": "PN",
    "DET": "DT",
    "ADP": "PP",
    "CONJ": "KN",
    "NUM": "RG",
    "PART": "PC",
    "INTJ": "IE",
    "PUNCT": "MAD",
    "X": "XX",
    "SYM": "MAD",
    "SCONJ": "SN",
}
# Likewise, the dependency relation tags need to be mapped to match efselab's output
dep_rel_mapping = {
    "case": "RA",
    "obl": "PA",
    "root": "ROOT",
    "nsubj": "SS",
    "det": "DT",
    "amod": "AT",
    "obj": "OO",
    "punct": "IP",
}
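Reviewer note (not part of the commit): both tables are applied with dict.get and the original tag as the fallback, so anything without a mapping passes through unchanged.

print(upos_to_suc.get("NOUN", "NOUN"))          # NN
print(dep_rel_mapping.get("nsubj", "nsubj"))    # SS
print(dep_rel_mapping.get("advmod", "advmod"))  # advmod (no mapping defined, UD label kept)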
class AttrOptions:
    """A class to hold the options for the pipeline. Used in favor of a dict
    to convert the keys to attributes. NOTE: Might be unnecessary, could likely
    just be a dict.
    """

    def __init__(self, d: dict):
        self.__dict__ = d
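A minimal usage sketch (not part of the commit; the option keys are an assumption based on the options.tagged / options.lemmatized / options.parsed checks in process_file):

opts = AttrOptions({"tagged": True, "lemmatized": True, "parsed": True})
if opts.parsed:  # attribute access instead of opts["parsed"]
    print("dependency parsing requested")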
@@ -99,47 +138,64 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
        list: A list of dictionaries containing the processed data.
    """
    pipeline_start_time = time.time()
    result = []

    for filename in args:
        # ================== STANZA-TAGGING ==================
        start_time = time.time()
        proc = process_file(options, filename)
        time_checker(start_time, "Stanza")

        # ================== SCREAM ==================
        start_time = time.time()
        depdoc = DependencyParsedDocument()
        depdoc.build_document(
            split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
        )  # proc["parsed"] mimics the efselab output format
        scream, additional_metrics = split_measures_to_scream_headings(
            StructuralMetrics(depdoc)
        )
        time_checker(start_time, "SCREAM")

        print(f"FILENAME: {filename}")

        # ================== STILETT ==================
        # start_time = time.time()
        # simplified_text, sentence_transformations = ud_text_simplifier.sapis_wrapper(
        #     filename
        # )
        # time_checker(start_time, "Stilett")
        # print(simplified_text, sentence_transformations)

        # print(f"proc {proc}")
        # print(f"PARAGRAPHS {proc['paragraphs']}")
        # ================== COH-METRIX ==================
        # start_time = time.time()
        # coh_metrix_cohesion = cohesion.run_cohesion(proc['parsed'])
        # coh_metrix_lsa = lsa.run_LSA(proc['parsed'], proc['paragraphs'])
        # coh_metrix_connectives = connectives.run_connectives(proc['parsed'])
        # coh_metrix_cohesion = {"cohesion": coh_metrix_cohesion}
        # coh_metrix_connectives = {"connectives": coh_metrix_connectives}
        # coh_metrix_lsa = {"LSA": coh_metrix_lsa}
        # coh_metrix = {**coh_metrix_cohesion, **coh_metrix_connectives, **coh_metrix_lsa}
        # coh_metrix = {}
        # time_checker(start_time, "Coh-Metrix")

        # ================== SYNONYMS ==================
        # start_time = time.time()
        # synonym_dict = synonyms.run_synonyms(proc['parsed'])
        # time_checker(start_time, "Synonyms")
        result.append(
            {
                "input": filename,
                "efselab": proc,
                "scream": scream,
                "additional_metrics": additional_metrics,
                # "stillett": {
                #     "simplified_text": simplified_text,
                #     "sentence_transformations": sentence_transformations,
@@ -157,11 +213,21 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
    # print(d['coh-metrix'])
    # print(l[-2])

    time_checker(pipeline_start_time, "Pipeline")
    pprint.pp(result)
    return result
def prep_parsed_for_build_doc(parsed: list) -> str:
    """
    Prepare parsed data for building a DependencyParsedDocument. Required for SCREAM.

    Args:
        parsed (list): The parsed data.

    Returns:
        str: The parsed data joined into tab-separated lines, ready for building
            a DependencyParsedDocument.
    """
    return "\n".join(
        [
            "\t".join(map(str, word_info))
@@ -196,7 +262,10 @@ def process_file(options: AttrOptions, filename: str) -> dict:
    if options.tagged or options.lemmatized or options.parsed:
        tagged = [
            [
                (word.text, upos_to_suc.get(word.upos, word.upos))
                for word in sentence.words
            ]
            for sentence in doc.sentences
        ]
        lemmas = [[word.lemma for word in sentence.words] for sentence in doc.sentences]
@@ -211,13 +280,14 @@ def process_file(options: AttrOptions, filename: str) -> dict:
        parsed = [
            [
                (
                    str(word.id),
                    word.text,
                    word.lemma,
                    upos_to_suc.get(word.upos, word.upos),
                    upos_to_suc.get(word.upos, word.upos),
                    "|".join(upos_to_suc.get(word.xpos, word.xpos).split("|")[1:]),
                    str(word.head) if word.head != 0 else "0",
                    dep_rel_mapping.get(word.deprel, word.deprel),
                )
                for word in sentence.words
            ]
@@ -233,7 +303,16 @@ def process_file(options: AttrOptions, filename: str) -> dict:
    }
def split_sentences_from_tagger(resp: str) -> list:
    """Split the sentences from the tagger output.

    Args:
        resp (str): The tagger output.

    Returns:
        list: The sentences split from the tagger output.
    """
    sentences = []
    sentence = []
    for line in resp.split("\n"):
@@ -251,7 +330,14 @@ def split_sentences_from_tagger(resp) -> list:
def split_measures_to_scream_headings(structural_instance) -> tuple:
    """Split the measures into SCREAM headings.

    Args:
        structural_instance: The structural instance.

    Returns:
        tuple: The measures split into a fixed SCREAM dict and a dict of
            additional metrics.
    """
    calculated_metrics = vars(structural_instance)
    structural_vars = [
@@ -340,5 +426,21 @@ def split_measures_to_scream_headings(structural_instance) -> dict:
    return fixed_dict, extra_dict
def time_checker(start_time: float, method: str) -> None:
    """Print the time elapsed since the start time.

    Args:
        start_time (float): The start time.
        method (str): The method being timed.
    """
    elapsed_time = time.time() - start_time
    print(f"{method}, elapsed time: {elapsed_time}")
if __name__ == "__main__":
    main("I skolan äter jag ett rött äpple.".encode("utf-8"))
    # main(
    #     "Det finns ett stort antal meningar som är onödigt långa, och vi behöver se till att dessa kan taggas godtyckligt. Denna, till synes, enkla uppgift är inte alltid så enkel.".encode(
    #         "utf-8"
    #     )
    # )
@@ -3,6 +3,7 @@ import os
from scream import conf
from scream import helper_methods

class FinalizeError(Exception):
    def __init__(self, *args):
        """
@@ -11,6 +12,7 @@ class FinalizeError(Exception):
        """
        super().__init__(*args)

# Document parts
class Sentence:
    def __init__(self):
@@ -51,7 +53,9 @@ class Sentence:
        :return: maximum tree depth
        """
        if not self.finalized:
            raise FinalizeError(
                "The sentence is not finalized, please finalize sentence before calculating the depth."
            )
        return self.root.get_depth()

    def assign_root(self) -> None:
@@ -109,7 +113,8 @@ class Sentence:
        if len(self.verb_arities) == 0:
            if not self.finalized:
                raise FinalizeError(
                    "The sentence is not finalized, please finalize sentence before calculating the total verb arity."
                )
            return self.root.calculate_verb_arities(self)

    def get_tokens(self):
@@ -117,12 +122,17 @@ class Sentence:
            return self.unigram_representation + [self.root]
        return self.unigram_representation

class SwevocManager:
    def __init__(self):
        """A manager for SweVoc."""
        self._swe_voc = dict()
        self._categories = set()
        self._load_swe_voc(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file
            )
        )

    def _load_swe_voc(self, path):
        """
@@ -139,7 +149,9 @@ class SwevocManager:
                tag = conf.parole_to_suc_conversion[split_line[1]]
                word = split_line[2]
                categories = {
                    category.strip() for category in split_line[3].split(",")
                }
                if word not in self._swe_voc.keys():
                    self._swe_voc[word] = dict()
@@ -176,8 +188,11 @@ class SwevocManager:
        except KeyError:
            return set()

class Token:
    def __init__(
        self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None
    ):
        """
        A class representing a token. Mainly used for the tree based syntactic
        representation of sentences used in the Sentence class.
@@ -200,10 +215,10 @@ class Token:
    def __str__(self) -> str:
        print_list = [
            f"Lemma: {self.lemma}",
            f"PoS tag: {self.pos_tag}",
            f"Dependency relation: {self.dep_rel}",
            f"Ref: {self.ref}",
        ]
        return "\n".join(print_list)
@@ -259,6 +274,7 @@ class Token:
            return 0
        return tmp

# Documents
class PosTaggedDocument:
    def __init__(self):
@@ -320,25 +336,29 @@ class PosTaggedDocument:
        """
        for sentence in pos_tagged_sentences:
            sentence_object = Sentence()
            # print("BUILD_DOC", sentence)
            for token in sentence:
                # print(token)
                token_string = token[1]
                pos_tag = token[4].split("|")[0]
                try:
                    lemma = token[2]
                    dep_rel = token[7]
                    ref = int(token[0])
                    dep_head_ref = int(token[6])
                except IndexError:
                    # TODO: implement lemmatization or remove the lemma statistics from PosTaggedDocument
                    lemma = token[2]  # None
                    dep_rel = None
                    ref = None
                    dep_head_ref = None
                except ValueError:
                    print(token[6], "\n", sentence)
                token_object = self.create_token(
                    token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref
                )
                sentence_object.add_token(token_object)
                self.add_token_statistics(token_object)
@@ -383,15 +403,14 @@ class PosTaggedDocument:
        if token.token not in self.word_dict.keys():
            self.word_dict[token.token] = 0
            self.n_unique_words += 1
        if token.token.lower() not in self.lower_token_dict.keys():
            self.lower_token_dict[token.token.lower()] = 0
        if token.lemma not in self.lemma_dict.keys():
            self.lemma_dict[token.lemma] = 0
            self.n_unique_lemma += 1
        self.word_dict[token.token] += 1
        self.lemma_dict[token.lemma] += 1
        self.n_words += 1
@@ -399,7 +418,7 @@ class PosTaggedDocument:
        self.total_word_length += len(token.lemma)
        self.n_syllables += token.syllables
        # if len(token.lemma) > conf.lix_limit:  # changed this to token.token
        if len(token.token) > conf.lix_limit:
            self.n_lix_long_words += 1
@@ -450,6 +469,7 @@ class PosTaggedDocument:
            self.n_content_words -= 1
            self.n_verbs -= 1

class DependencyParsedDocument(PosTaggedDocument):
    def __init__(self):
        PosTaggedDocument.__init__(self)
@@ -521,30 +541,32 @@ class DependencyParsedDocument(PosTaggedDocument):
        """
        PosTaggedDocument.add_sentence_statistics(self, sentence)
        # print('================')
        # print(sentence)
        # print('================')
        sentence.finalize()
        self.total_sentence_depth += sentence.get_depth()
        self.total_verb_arity += sentence.get_total_verb_arity()
        # print(self.total_verb_arity)
        # print(f"sentence verb arities{sentence.verb_arities}")
        for arity, number in zip(
            sentence.verb_arities.keys(), sentence.verb_arities.values()
        ):
            # VERB ARITIES WERE COMMENTED OUT FOR NOW BECAUSE THEY DID NOT WORK, REASON UNKNOWN
            # Works now?
            if int(arity) >= len(conf.verb_arities):
                self.verb_arities_dict[9] += 1
            else:
                self.verb_arities_dict[arity] += 1
        if sentence.has_verbial_root():
            self.n_verbal_roots += 1
    def add_dep_statistics(self, token: Token) -> None:
        """
        Increments the following attributes:
        The amount of dependency tags
        The amount of dependencies
        The amount of right dependencies (given positive relation distance)
@@ -555,16 +577,21 @@ class DependencyParsedDocument(PosTaggedDocument):
        The amount of post modifiers (if the relation equals the predefined relation)
        The amount of pre modifiers (if the relation equals the predefined relation)
        The amount of preposition compositions (if the relation equals the predefined relation)

        Args:
            token: Token
        """
        print(token)
        dep_distance = (
            token.ref - token.dep_head_ref
        )  # TODO: be careful here, this can go wrong. Check extractor: handleDependency
        self.n_dep_tags += 1
        self.n_dependencies += 1
        # print("HITME!")
        # print("t d l", token.dep_rel)
        if token.dep_rel in conf.dep_types:
            self.dep_type_dict[token.dep_rel] += 1
            # print("HITME!")
        if token.dep_rel == conf.subclause_dep:
            self.n_sub_clauses += 1
        if dep_distance > 0:
...