Commit 7fe740cf: SCREAM operating with Stanza-output
Authored 1 year ago by Love Arreborn
Parent: 1a530f5a
No related tags found. No related merge requests found.
Pipeline #131688: skipped (stages: push, deploy)

Showing 2 changed files with 189 additions and 60 deletions:
pipeline.py: 125 additions, 23 deletions
scream/document_parts.py: 64 additions, 37 deletions
pipeline.py: +125 −23
@@ -19,6 +19,8 @@ Aaron Smith <aaron.smith@lingfil.uu.se>
 import pprint
 import time
+import stanza
+from math import isfinite

 # Import Scream
 from scream.metrics import LexicalMetrics
@@ -47,14 +49,51 @@ from scream.document_parts import DependencyParsedDocument
 MAX_TOKEN = 256

-nlp = stanza.Pipeline(lang="sv", processors="tokenize,pos,lemma,depparse,ner")
+nlp = stanza.Pipeline(
+    lang="sv",
+    processors="tokenize,pos,lemma,depparse,ner",
+    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
+)
+
+# Simple table for conversion from Stanza's UPOS tags to SUC tags
+upos_to_suc = {
+    "NOUN": "NN",
+    "VERB": "VB",
+    "ADJ": "JJ",
+    "ADV": "AB",
+    "PRON": "PN",
+    "DET": "DT",
+    "ADP": "PP",
+    "CONJ": "KN",
+    "NUM": "RG",
+    "PART": "PC",
+    "INTJ": "IE",
+    "PUNCT": "MAD",
+    "X": "XX",
+    "SYM": "MAD",
+    "SCONJ": "SN",
+}
+
+# Likewise, the dependency relation tags need to be mapped to match efselabs output
+dep_rel_mapping = {
+    "case": "RA",
+    "obl": "PA",
+    "root": "ROOT",
+    "nsubj": "SS",
+    "det": "DT",
+    "amod": "AT",
+    "obj": "OO",
+    "punct": "IP",
+}
+
+
 class AttrOptions:
     """
     A class to hold the options for the pipeline. Used in favor of a dict
-    to convert the keys to attributes. NOTE: Might be unnecessary.
+    to convert the keys to attributes. NOTE: Might be unnecessary, could likely
+    just be a dict.
     """

-    def __init__(self, d):
+    def __init__(self, d: dict):
         self.__dict__ = d
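For reference, a minimal sketch of how the two lookup tables behave; because .get() falls back to the tag itself, anything missing from a table passes through unchanged. The namedtuple below is a stand-in for a Stanza word object and the example values are invented; the sketch assumes upos_to_suc and dep_rel_mapping from the hunk above are in scope.

from collections import namedtuple

# Hypothetical stand-in for a Stanza Word, limited to the fields the mapping uses.
Word = namedtuple("Word", ["text", "upos", "deprel"])

w1 = Word(text="äpple", upos="NOUN", deprel="obj")
w2 = Word(text="hej", upos="INTJ", deprel="discourse")  # "discourse" is not in dep_rel_mapping

print(upos_to_suc.get(w1.upos, w1.upos), dep_rel_mapping.get(w1.deprel, w1.deprel))  # NN OO
print(upos_to_suc.get(w2.upos, w2.upos), dep_rel_mapping.get(w2.deprel, w2.deprel))  # IE discourse (unmapped tag falls through)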
@@ -99,47 +138,64 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
         list: A list of dictionaries containing the processed data.
     """
-    start_time = time.time()
+    pipeline_start_time = time.time()
     result = []
     for filename in args:
+        # ================== STANZA-TAGGING ==================
+        start_time = time.time()
         proc = process_file(options, filename)
+        time_checker(start_time, "Stanza")
+
+        # ================== SCREAM ==================
+        start_time = time.time()
         depdoc = DependencyParsedDocument()
-        # depdoc.build_document(
-        #     split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
-        # )  # proc[parsed] is the output from EFSELAB
-        # scream, additional_metrics = split_measures_to_scream_headings(
-        #     StructuralMetrics(depdoc)
-        # )
+        depdoc.build_document(
+            split_sentences_from_tagger(prep_parsed_for_build_doc(proc["parsed"]))
+        )  # proc[parsed] is the output from EFSELAB
+        scream, additional_metrics = split_measures_to_scream_headings(
+            StructuralMetrics(depdoc)
+        )
+        time_checker(start_time, "SCREAM")
+
+        print(f"FILENAME: {filename}")

         # ================== STILETT ==================
         # start_time = time.time()
         # simplified_text, sentence_transformations = ud_text_simplifier.sapis_wrapper(
         #     filename
         # )
         # time_checker(start_time, "Stilett")
         # print(simplified_text, sentence_transformations)
         # print(f"proc {proc}")
         # print(f"PARAGRAPHS {proc['paragraphs']}" )
-        elapsed_time1 = time.time() - start_time

         # ================== COH-METRIX ==================
         # start_time = time.time()
         # coh_metrix_cohesion = cohesion.run_cohesion(proc['parsed'])
         # coh_metrix_lsa = lsa.run_LSA(proc['parsed'], proc['paragraphs'])
         # coh_metrix_connectives = connectives.run_connectives(proc['parsed'])
         # synonym_dict = synonyms.run_synonyms(proc['parsed'])
         # coh_metrix_cohesion = {"cohesion": coh_metrix_cohesion}
         # coh_metrix_connectives = {"connectives": coh_metrix_connectives}
         # coh_metrix_lsa = {"LSA" : coh_metrix_lsa}
         # coh_metrix = {**coh_metrix_cohesion, **coh_metrix_connectives, **coh_metrix_lsa }
         # coh_metrix = {}
         # time_checker(start_time, "Coh-Metrix")

         # ================== SYNONYMS ==================
         # start_time = time.time()
         # synonym_dict = synonyms.run_synonyms(proc['parsed'])
         # time_checker(start_time, "Synonyms")

         result.append(
             {
                 "input": filename,
                 "efselab": proc,
-                # "scream": scream,
-                # "additional_metrics": additional_metrics,
+                "scream": scream,
+                "additional_metrics": additional_metrics,
                 # "stillett": {
                 #     "simplified_text": simplified_text,
                 #     "sentence_transformations": sentence_transformations,
@@ -157,11 +213,21 @@ def run_pipeline(options: AttrOptions, args: list) -> list:
     #         print(d['coh-metrix'])
     # print(l[-2])

+    time_checker(pipeline_start_time, "Pipeline")
     pprint.pp(result)
     return result


-def prep_parsed_for_build_doc(parsed):
+def prep_parsed_for_build_doc(parsed: list) -> str:
+    """
+    Prepare parsed data for building a DependencyParsedDocument. Required for SCREAM.
+
+    Args:
+        parsed (list): The parsed data.
+
+    Returns:
+        list: Data prepared for building a DependencyParsedDocument.
+    """
     return "\n".join(
         ["\t".join(map(str, word_info))
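The tail of this return expression is collapsed in the diff, but the visible part already shows the idea: each word's fields are tab-joined and the resulting lines newline-joined, producing the CoNLL-like block that split_sentences_from_tagger later re-splits. A minimal hedged illustration for a single sentence; the field values are invented and the exact handling of sentence boundaries is not visible in the hunk.

# Invented word rows, in the field order that process_file (further down) produces.
sentence = [
    ("1", "Jag", "jag", "PN", "PN", "UTR|SIN|DEF|SUB", "2", "SS"),
    ("2", "läser", "läsa", "VB", "VB", "PRS|AKT", "0", "ROOT"),
]

block = "\n".join("\t".join(map(str, word_info)) for word_info in sentence)
print(block)  # prints two tab-separated lines, one per word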
@@ -196,7 +262,10 @@ def process_file(options: AttrOptions, filename: str) -> dict:
     if options.tagged or options.lemmatized or options.parsed:
         tagged = [
-            [(word.text, word.upos) for word in sentence.words]
+            [
+                (word.text, upos_to_suc.get(word.upos, word.upos))
+                for word in sentence.words
+            ]
             for sentence in doc.sentences
         ]
         lemmas = [[word.lemma for word in sentence.words] for sentence in doc.sentences]
@@ -211,13 +280,14 @@ def process_file(options: AttrOptions, filename: str) -> dict:
         parsed = [
             [
                 (
-                    word.id,
+                    str(word.id),
                     word.text,
                     word.lemma,
-                    word.upos,
-                    word.xpos,
-                    word.head,
-                    word.deprel,
+                    upos_to_suc.get(word.upos, word.upos),
+                    upos_to_suc.get(word.upos, word.upos),
+                    "|".join(upos_to_suc.get(word.xpos, word.xpos).split("|")[1:]),
+                    str(word.head) if word.head != 0 else "0",
+                    dep_rel_mapping.get(word.deprel, word.deprel),
                 )
                 for word in sentence.words
             ]
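To make the conversion concrete, here is a hedged sketch of the row this comprehension builds for one Stanza word. The SimpleNamespace stands in for stanza's Word object and all field values (including the xpos string) are invented for illustration; the sketch assumes upos_to_suc and dep_rel_mapping from the earlier hunk are in scope. Note that the same SUC tag is written into two adjacent fields, apparently filling both the coarse and fine PoS columns of the efselab-style row.

from types import SimpleNamespace

# Invented example word, shaped like a Stanza Word with the attributes used above.
word = SimpleNamespace(
    id=3,
    text="äpple",
    lemma="äpple",
    upos="NOUN",
    xpos="NN|NEU|SIN|IND|NOM",
    head=2,
    deprel="obj",
)

row = (
    str(word.id),
    word.text,
    word.lemma,
    upos_to_suc.get(word.upos, word.upos),                            # "NN"
    upos_to_suc.get(word.upos, word.upos),                            # "NN"
    "|".join(upos_to_suc.get(word.xpos, word.xpos).split("|")[1:]),   # "NEU|SIN|IND|NOM" (features after the first field)
    str(word.head) if word.head != 0 else "0",                        # "2"
    dep_rel_mapping.get(word.deprel, word.deprel),                    # "OO"
)
print("\t".join(row))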
@@ -233,7 +303,16 @@ def process_file(options: AttrOptions, filename: str) -> dict:
     }


-def split_sentences_from_tagger(resp) -> list:
+def split_sentences_from_tagger(resp: str) -> list:
+    """
+    Split the sentences from the tagger output.
+
+    Args:
+        resp (str): The tagger output.
+
+    Returns:
+        list: The sentences split from the tagger output.
+    """
     sentences = []
     sentence = []
     for line in resp.split("\n"):
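The loop body is collapsed in the diff. Given that sentences and sentence are both accumulated and the input is the newline-joined block from prep_parsed_for_build_doc, a plausible reading is the usual "group lines until a blank line" pattern. The sketch below is an assumption about that pattern, not the commit's actual body, and uses a different name to make that clear.

def split_sentences_sketch(resp: str) -> list:
    """Group tab-separated word lines into sentences, starting a new sentence at blank lines."""
    sentences, sentence = [], []
    for line in resp.split("\n"):
        if not line.strip():          # blank line = sentence boundary
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue
        sentence.append(line.split("\t"))
    if sentence:                      # flush the final sentence
        sentences.append(sentence)
    return sentences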
@@ -251,7 +330,14 @@ def split_sentences_from_tagger(resp) -> list:
 def split_measures_to_scream_headings(structural_instance) -> dict:
-    from math import isfinite
+    """
+    Split the measures to SCREAM headings.
+
+    Args:
+        structural_instance: The structural instance.
+
+    Returns:
+        dict: The measures split to SCREAM headings.
+    """
     calculated_metrics = vars(structural_instance)
     structural_vars = [
@@ -340,5 +426,21 @@ def split_measures_to_scream_headings(structural_instance) -> dict:
     return fixed_dict, extra_dict


+def time_checker(start_time: float, method: str) -> None:
+    """
+    Check the time elapsed since the start time.
+
+    Args:
+        start_time (float): The start time.
+        method (str): The method being run.
+    """
+    elapsed_time = time.time() - start_time
+    print(f"{method}, elapsed time: {elapsed_time}")
+
+
 if __name__ == "__main__":
     main("I skolan äter jag ett rött äpple.".encode("utf-8"))
+    # main(
+    #     "Det finns ett stort antal meningar som är onödigt långa, och vi behöver se till att dessa kan taggas godtyckligt. Denna, till synes, enkla uppgift är inte alltid så enkel.".encode(
+    #         "utf-8"
+    #     )
+    # )
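run_pipeline above shows the intended usage of the new helper: reset start_time before each stage and report it afterwards. A minimal hedged sketch of that pattern follows, with the actual Stanza/SCREAM work replaced by sleeps; time_checker is repeated here only so the snippet runs on its own.

import time


def time_checker(start_time: float, method: str) -> None:
    # Same helper as added above: prints the elapsed wall-clock time for a stage.
    elapsed_time = time.time() - start_time
    print(f"{method}, elapsed time: {elapsed_time}")


start_time = time.time()
time.sleep(0.2)                     # placeholder for process_file(options, filename)
time_checker(start_time, "Stanza")

start_time = time.time()            # the clock is reset per stage, as in run_pipeline
time.sleep(0.1)                     # placeholder for the SCREAM metrics
time_checker(start_time, "SCREAM")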
scream/document_parts.py: +64 −37
@@ -3,6 +3,7 @@ import os
 from scream import conf
 from scream import helper_methods


 class FinalizeError(Exception):
     def __init__(self, *args):
         """
@@ -11,6 +12,7 @@ class FinalizeError(Exception):
         """
         super().__init__(*args)


+# Document parts
 class Sentence:
     def __init__(self):
@@ -51,7 +53,9 @@ class Sentence:
         :return: maximum tree depth
         """
         if not self.finalized:
-            raise FinalizeError("The sentence is not finalized, please finalize sentence before calculating the depth.")
+            raise FinalizeError(
+                "The sentence is not finalized, please finalize sentence before calculating the depth."
+            )
         return self.root.get_depth()

     def assign_root(self) -> None:
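These checks enforce a finalize-before-measure contract. A small hedged illustration of the failure path, assuming the method shown above is Sentence.get_depth (its return of self.root.get_depth() and the call in add_sentence_statistics further down suggest so) and that a freshly constructed Sentence starts with finalized set to False:

from scream.document_parts import FinalizeError, Sentence

sentence = Sentence()
try:
    sentence.get_depth()            # not finalized yet, so this should raise
except FinalizeError as err:
    print(f"Expected error: {err}")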
@@ -109,7 +113,8 @@ class Sentence:
         if len(self.verb_arities) == 0:
             if not self.finalized:
                 raise FinalizeError(
-                    "The sentence is not finalized, please finalize sentence before calculating the total verb arity.")
+                    "The sentence is not finalized, please finalize sentence before calculating the total verb arity."
+                )
             return self.root.calculate_verb_arities(self)

     def get_tokens(self):
@@ -117,12 +122,17 @@ class Sentence:
             return self.unigram_representation + [self.root]
         return self.unigram_representation


 class SwevocManager:
     def __init__(self):
         """
         A manager for SweVoc.
         """
         self._swe_voc = dict()
         self._categories = set()
-        self._load_swe_voc(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file))
+        self._load_swe_voc(
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), ".", conf.swe_voc_file)
+        )

     def _load_swe_voc(self, path):
         """
@@ -139,7 +149,9 @@ class SwevocManager:
             tag = conf.parole_to_suc_conversion[split_line[1]]
             word = split_line[2]
-            categories = {category.strip() for category in split_line[3].split(",")}
+            categories = {
+                category.strip() for category in split_line[3].split(",")
+            }
             if word not in self._swe_voc.keys():
                 self._swe_voc[word] = dict()
@@ -176,8 +188,11 @@ class SwevocManager:
         except KeyError:
             return set()


 class Token:
-    def __init__(self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None):
+    def __init__(
+        self, token, pos, lemma=None, dep_rel=None, ref=None, dep_head_ref=None
+    ):
         """
         A class representing a token. Mainly used for the tree based syntactic
         representation of sentences used in the Sentence class.
@@ -200,10 +215,10 @@ class Token:
     def __str__(self) -> str:
         print_list = [
-            "Lemma: " + self.lemma,
-            "PoS tag: " + self.pos_tag,
-            "Dependency relation: " + self.dep_rel,
-            "Ref: " + str(self.ref)
+            f"Lemma: {self.lemma}",
+            f"PoS tag: {self.pos_tag}",
+            f"Dependency relation: {self.dep_rel}",
+            f"Ref: {self.ref}",
         ]
         return "\n".join(print_list)
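One practical effect of the switch to f-strings: concatenation raises a TypeError when lemma or dep_rel is None (the fallback values set in build_document's IndexError branch below), while f-strings simply render "None". A quick hedged illustration, assuming __init__ stores the pos argument under pos_tag as __str__ implies; the example values are invented.

from scream.document_parts import Token

token = Token("äpple", "NN", lemma="äpple", dep_rel="OO", ref=3, dep_head_ref=2)
print(token)
# Lemma: äpple
# PoS tag: NN
# Dependency relation: OO
# Ref: 3

print(Token("hej", "IN"))  # lemma/dep_rel/ref default to None; f-strings print "None" instead of raising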
@@ -259,6 +274,7 @@ class Token:
             return 0
         return tmp


+# Documents
 class PosTaggedDocument:
     def __init__(self):
@@ -320,25 +336,29 @@ class PosTaggedDocument:
         """
         for sentence in pos_tagged_sentences:
             sentence_object = Sentence()
-            #print("BUILD_DOC", sentence)
+            # print("BUILD_DOC", sentence)
             for token in sentence:
-                #print(token)
+                # print(token)
                 token_string = token[1]
-                pos_tag = token[4].split('|')[0]
+                pos_tag = token[4].split("|")[0]
                 try:
                     lemma = token[2]
                     dep_rel = token[7]
                     ref = int(token[0])
                     dep_head_ref = int(token[6])
                 except IndexError:
-                    lemma = token[2] #None #TODO implement lemmatization or remove the lemma statistics from postaggeddocument
+                    lemma = token[2]  # None #TODO implement lemmatization or remove the lemma statistics from postaggeddocument
                     dep_rel = None
                     ref = None
                     dep_head_ref = None
                 except ValueError:
-                    print(token[6], '\n', sentence)
+                    print(token[6], "\n", sentence)
-                token_object = self.create_token(token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref)
+                token_object = self.create_token(
+                    token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref
+                )
                 sentence_object.add_token(token_object)
                 self.add_token_statistics(token_object)
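The indices used here line up with the rows pipeline.py now emits via prep_parsed_for_build_doc: token[0] is the word id, token[1] the surface form, token[2] the lemma, token[4] the PoS tag (split on "|" in case features are attached), token[6] the dependency head and token[7] the relation. A short sketch with an invented row; only the index layout comes from the two files, the values are made up.

# Invented row, in the field order that process_file in pipeline.py produces.
token = ["3", "äpple", "äpple", "NN", "NN", "NEU|SIN|IND|NOM", "2", "OO"]

token_string = token[1]              # "äpple"
pos_tag = token[4].split("|")[0]     # "NN"
lemma = token[2]                     # "äpple"
dep_rel = token[7]                   # "OO"
ref = int(token[0])                  # 3
dep_head_ref = int(token[6])         # 2
print(token_string, pos_tag, lemma, dep_rel, ref, dep_head_ref)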
@@ -383,15 +403,14 @@ class PosTaggedDocument:
         if token.token not in self.word_dict.keys():
             self.word_dict[token.token] = 0
             self.n_unique_words += 1
         if token.token.lower() not in self.lower_token_dict.keys():
             self.lower_token_dict[token.token.lower()] = 0
         if token.lemma not in self.lemma_dict.keys():
             self.lemma_dict[token.lemma] = 0
             self.n_unique_lemma += 1
         self.word_dict[token.token] += 1
         self.lemma_dict[token.lemma] += 1
         self.n_words += 1
@@ -399,7 +418,7 @@ class PosTaggedDocument:
         self.total_word_length += len(token.lemma)
         self.n_syllables += token.syllables
-        #if len(token.lemma) > conf.lix_limit: #changed this to token.token
+        # if len(token.lemma) > conf.lix_limit: #changed this to token.token
         if len(token.token) > conf.lix_limit:
             self.n_lix_long_words += 1
@@ -450,6 +469,7 @@ class PosTaggedDocument:
             self.n_content_words -= 1
             self.n_verbs -= 1


 class DependencyParsedDocument(PosTaggedDocument):
     def __init__(self):
         PosTaggedDocument.__init__(self)
@@ -521,30 +541,32 @@ class DependencyParsedDocument(PosTaggedDocument):
         """
         PosTaggedDocument.add_sentence_statistics(self, sentence)
-        #print('================')
-        #print(sentence)
-        #print('================')
+        # print('================')
+        # print(sentence)
+        # print('================')
         sentence.finalize()
         self.total_sentence_depth += sentence.get_depth()
         self.total_verb_arity += sentence.get_total_verb_arity()
-        #print(self.total_verb_arity)
-        #print(f"sentence verb arities{sentence.verb_arities}")
-        for arity, number in zip(sentence.verb_arities.keys(), sentence.verb_arities.values()):
+        # print(self.total_verb_arity)
+        # print(f"sentence verb arities{sentence.verb_arities}")
+        for arity, number in zip(
+            sentence.verb_arities.keys(), sentence.verb_arities.values()
+        ):
             # HAVE COMMENTED OUT VERB ARITIES FOR NOW BECAUSE IT DOES NOT WORK, BUT I DON'T KNOW WHY
-            #Works now?
+            # Works now?
             if int(arity) >= len(conf.verb_arities):
                 self.verb_arities_dict[9] += 1
             else:
                 self.verb_arities_dict[arity] += 1
         if sentence.has_verbial_root():
             self.n_verbal_roots += 1

-    def add_dep_statistics(self, token) -> None:
+    def add_dep_statistics(self, token: Token) -> None:
         """
-        Incremebts the following attributes:
+        Increments the following attributes:
         The amount of dependency tags
         The amount of dependencies
         The amount of right dependencies (given positive relation distance)
@@ -555,16 +577,21 @@ class DependencyParsedDocument(PosTaggedDocument):
         The amount of post modifiers (if the relation equals the predefined relation)
         The amount of pre modifiers (if the relation equals the predefined relation)
         The amount of preposition compositions (if the relation equals the predefined relation)
-        :param token: Token
+
+        Args:
+            token: Token
         """
-        dep_distance = token.ref - token.dep_head_ref  # TODO: be careful here, this can go wrong. Check extractor: handleDependency
+        print(token)
+        dep_distance = (
+            token.ref - token.dep_head_ref
+        )  # TODO: be careful here, this can go wrong. Check extractor: handleDependency
         self.n_dep_tags += 1
         self.n_dependencies += 1
-        #print("HITME!")
-        #print("t d l", token.dep_rel)
+        # print("HITME!")
+        # print("t d l", token.dep_rel)
         if token.dep_rel in conf.dep_types:
             self.dep_type_dict[token.dep_rel] += 1
-            #print("HITME!")
+            # print("HITME!")
         if token.dep_rel == conf.subclause_dep:
             self.n_sub_clauses += 1
         if dep_distance > 0:
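dep_distance is the signed offset between a token and its head, so a positive value marks a right dependency, which is what the docstring's "given positive relation distance" refers to. A small hedged illustration using the Token constructor shown earlier; the field values are invented.

from scream.document_parts import Token

# "äpple" at position 3 attaches to a head at position 2: positive distance, i.e. a right dependency.
token = Token("äpple", "NN", lemma="äpple", dep_rel="OO", ref=3, dep_head_ref=2)

dep_distance = token.ref - token.dep_head_ref
print(dep_distance, "right dependency" if dep_distance > 0 else "left dependency")  # 1 right dependency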