Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
Codebase
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
TDDE19-2022-1
Codebase
Commits
52956a93
Commit
52956a93
authored
2 years ago
by
Albin
Browse files
Options
Downloads
Plain Diff
Merge branch 'main' of gitlab.liu.se:tdde19-2022-1/codebase into main
parents
69239437
bc151a27
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
data/tokenizer.py
+132
-71
132 additions, 71 deletions
data/tokenizer.py
with
132 additions
and
71 deletions
data/tokenizer.py
+
132
−
71
View file @
52956a93
...
...
@@ -2,10 +2,131 @@
# Example usage: py tokenizer.py qald-9-test-linked.json
# Example usage: py tokenizer.py qald-9-test-linked.json REPLACE
# Note: er_link_style is the style of entity linking to use.
# It can be "REPLACE", "APPEND" or blank. Default is no usage of entity links.
# It can be "REPLACE", "APPEND"
, "TOKENIZE", "TOKENIZE_REPLACE"
or blank. Default is no usage of entity links.
import
sys
import
json
# Maps full namespace IRIs to the short prefixes used in the tokenized output.
# Order matters for first-match lookups: a namespace that extends another one
# (".../prop/direct/", ".../prop/qualifier/") is listed before the shorter one
# (".../prop/") so that the more specific prefix wins.
prefixes = {
    "http://dbpedia.org/resource/": "res:",
    "http://dbpedia.org/ontology/": "dbo:",
    "http://dbpedia.org/property/": "dbp:",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
    "http://dbpedia.org/class/yago/": "yago:",
    "http://www.wikidata.org/prop/direct/": "wdt:",
    "http://www.wikidata.org/entity/": "wd:",
    "http://www.wikidata.org/prop/qualifier/": "pq:",
    "http://www.wikidata.org/prop/": "p:",
    "https://w3id.org/payswarm#": "ps:",
    "http://www.bigdata.com/rdf#": "bd:",
    "http://wikiba.se/ontology#": "wikibase:",
    "http://www.w3.org/2004/02/skos/core#": "skos:",
}
def entity_link_replace(entities, relations, question_string, query_string):
    """Replace linked surface forms in the question with their shortened URIs.

    Args:
        entities: Sequence of mappings with string values under the keys
            "URI" and "surface form".
        relations: Same structure as ``entities``; processed after them.
        question_string: The question text to rewrite.
        query_string: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        The question with every replaceable occurrence of a surface form
        substituted by the (prefix-shortened) URI.
    """

    def shorten(uri):
        # Pick the longest known namespace so nested namespaces resolve to the
        # most specific prefix regardless of the order of the prefix table.
        best = max((p for p in prefixes if uri.startswith(p)), key=len, default=None)
        return uri if best is None else prefixes[best] + uri[len(best):]

    # Helper function since entities and relations are replaced in the same way.
    def replace_entities_or_relations(er_source, question_string):
        for er in er_source:
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()
            if not surface_form:
                continue
            uri = shorten(uri)

            # Find the first occurrence of the surface form
            er_index = question_string.find(surface_form)
            while er_index != -1:
                # A colon after the last space means the match lies inside a
                # word that already looks like an inserted "prefix:name" token.
                previous_colon = question_string.rfind(":", 0, er_index)
                previous_space = max(0, question_string.rfind(" ", 0, er_index))
                if previous_colon > previous_space:
                    next_space = question_string.find(" ", er_index)
                    if next_space == -1:
                        # The token runs to the end of the question, so there
                        # is nothing left to scan. Searching from index -1
                        # could match the last character again and never end.
                        break
                    er_index = question_string.find(surface_form, next_space)
                    continue

                # Else replace the surface form with the uri
                question_string = (
                    question_string[:er_index]
                    + uri
                    + question_string[er_index + len(surface_form):]
                )
                # Resume after the inserted uri so it is never re-scanned
                er_index = question_string.find(surface_form, er_index + len(uri))
        return question_string

    question_string = replace_entities_or_relations(entities, question_string)
    question_string = replace_entities_or_relations(relations, question_string)
    return question_string
def entity_link_token(entities, relations, question_string, query_string):
    """Replace linked entities/relations with numbered placeholder tokens.

    A surface form in the question is replaced by ``<eN>`` (entities) or
    ``<rN>`` (relations) only when its shortened URI also occurs in the query
    as a whole word; that URI occurrence is replaced by the same token.

    Args:
        entities: Sequence of mappings with string values under the keys
            "URI" and "surface form".
        relations: Same structure as ``entities``; processed after them.
        question_string: The question text to rewrite.
        query_string: The query text to rewrite.

    Returns:
        A ``(question_string, query_string)`` tuple with the substitutions
        applied.
    """
    n_relations = 0
    n_entities = 0

    def shorten(uri):
        # Pick the longest known namespace so nested namespaces resolve to the
        # most specific prefix regardless of the order of the prefix table.
        best = max((p for p in prefixes if uri.startswith(p)), key=len, default=None)
        return uri if best is None else prefixes[best] + uri[len(best):]

    def find_whole_uri(query, uri):
        # Index of the first occurrence of uri that is followed by a space or
        # by the end of the query, or -1. Occurrences that are only the start
        # of a longer name (e.g. "res:Berlin" in "res:Berlin_Wall") are skipped.
        start = query.find(uri)
        while start != -1:
            end = start + len(uri)
            if end == len(query) or query[end] == " ":
                return start
            start = query.find(uri, start + 1)
        return -1

    def replace_entities_or_relations(er_source, question_string, query_string, n, token_prefix):
        for er in er_source:
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()
            if not surface_form:
                continue
            uri = shorten(uri)
            token = f"<{token_prefix}{n}>"

            # Find the first occurrence of the surface form
            er_index = question_string.find(surface_form)
            while er_index != -1:
                # Make sure the uri exists in the query
                er_query_index = find_whole_uri(query_string, uri)
                if er_query_index == -1:
                    # Neither string changes on this path, so no later
                    # occurrence of the surface form could be replaced either.
                    break

                # Replace the surface form and the uri with the same token
                question_string = (
                    question_string[:er_index]
                    + token
                    + question_string[er_index + len(surface_form):]
                )
                query_string = (
                    query_string[:er_query_index]
                    + token
                    + query_string[er_query_index + len(uri):]
                )
                # Resume right after the inserted token: its length, not the
                # surface form's, is what now occupies the question.
                er_index = question_string.find(surface_form, er_index + len(token))
            # NOTE(review): the counter is advanced once per item (after all of
            # its occurrences), so repeated mentions share one token. The
            # indentation of this statement was not recoverable - confirm.
            n += 1
        return question_string, query_string

    question_string, query_string = replace_entities_or_relations(
        entities, question_string, query_string, n_entities, "e"
    )
    question_string, query_string = replace_entities_or_relations(
        relations, question_string, query_string, n_relations, "r"
    )
    return question_string, query_string
def entity_link_append(entities, relations, question_string):
    """Append every linked entity and relation to the end of the question.

    Args:
        entities: Sequence of mappings with string values under the keys
            "URI" and "surface form".
        relations: Same structure as ``entities``; appended after them.
        question_string: The question text to extend.

    Returns:
        The question followed by one quoted surface form / shortened URI pair
        per item, each introduced by a "|" separator.
    """

    def shorten(uri):
        # Pick the longest known namespace so nested namespaces resolve to the
        # most specific prefix regardless of the order of the prefix table.
        best = max((p for p in prefixes if uri.startswith(p)), key=len, default=None)
        return uri if best is None else prefixes[best] + uri[len(best):]

    def append_entities_or_relations(er_source, question_string):
        # NOTE(review): whitespace around the "|" and "-" separators could not
        # be recovered from the extracted source; confirm the exact literals.
        if len(er_source) > 0:
            question_string += "|"

        # Append all entities/relations to the end of the question
        for k, er in enumerate(er_source):
            if k > 0:
                question_string += "|"

            # Strip the URI like the sibling linking functions do; surrounding
            # whitespace would otherwise prevent any prefix from matching.
            uri = shorten(er["URI"].strip())
            surface_form = er["surface form"].strip()

            question_string += f"'{surface_form}'-{uri}"
        return question_string

    question_string = append_entities_or_relations(entities, question_string)
    question_string = append_entities_or_relations(relations, question_string)
    return question_string
def
main
():
print
(
sys
.
argv
[
0
])
if
len
(
sys
.
argv
)
>
1
:
...
...
@@ -15,7 +136,8 @@ def main():
sys
.
exit
(
1
)
if
len
(
sys
.
argv
)
>
2
:
er_link_style
=
sys
.
argv
[
2
]
if
(
er_link_style
!=
"
REPLACE
"
and
er_link_style
!=
"
APPEND
"
):
er_link_styles
=
[
"
REPLACE
"
,
"
APPEND
"
,
"
TOKENIZE
"
,
"
TOKENIZE_REPLACE
"
]
if
er_link_style
not
in
er_link_styles
:
print
(
"
Please provide a valid entity/relationship linking style
"
)
sys
.
exit
(
1
)
else
:
...
...
@@ -23,22 +145,6 @@ def main():
output_file
=
input_file
.
replace
(
"
.json
"
,
"
-tokenized.csv
"
)
prefixes
=
{
"
http://dbpedia.org/resource/
"
:
"
res:
"
,
"
http://dbpedia.org/ontology/
"
:
"
dbo:
"
,
"
http://dbpedia.org/property/
"
:
"
dbp:
"
,
"
http://www.w3.org/2000/01/rdf-schema#
"
:
"
rdfs:
"
,
"
http://www.w3.org/1999/02/22-rdf-syntax-ns#
"
:
"
rdf:
"
,
"
http://dbpedia.org/class/yago/
"
:
"
yago:
"
,
"
http://www.wikidata.org/prop/direct/
"
:
"
wdt:
"
,
"
http://www.wikidata.org/entity/
"
:
"
wd:
"
,
"
http://www.wikidata.org/prop/
"
:
"
p:
"
,
"
https://w3id.org/payswarm#
"
:
"
ps:
"
,
"
http://www.wikidata.org/prop/qualifier/
"
:
"
pq:
"
,
"
http://www.bigdata.com/rdf#
"
:
"
bd:
"
,
"
http://wikiba.se/ontology#
"
:
"
wikibase:
"
,
"
http://www.w3.org/2004/02/skos/core#
"
:
"
skos:
"
,
}
# Generate a csv file with the tokenized questions from a linked json file
with
open
(
input_file
,
"
r
"
,
encoding
=
"
utf-8
"
)
as
f
:
...
...
@@ -80,64 +186,19 @@ def main():
query_string
=
query_string
.
replace
(
"
COUNT(
"
,
"
COUNT(
"
)
if
er_link_style
==
"
APPEND
"
:
# Append all entities to the end of the question
if
(
len
(
entities
)
>
0
):
question_string
+=
"
|
"
for
k
,
entity
in
enumerate
(
entities
):
if
k
>
0
:
question_string
+=
"
|
"
question_string
=
entity_link_append
(
entities
,
relations
,
question_string
)
uri
=
entity
[
"
URI
"
]
# Hopefully the uri includes a uri which we know how to shorten
if
er_link_style
==
"
REPLACE
"
:
question_string
=
entity_link_replace
(
entities
,
relations
,
question_string
,
query_string
)
for
prefix
in
prefixes
:
if
uri
.
startswith
(
prefix
):
uri
=
prefixes
[
prefix
]
+
uri
[
len
(
prefix
):]
break
question_string
+=
f
"
{
uri
}
"
if
er_link_style
==
"
TOKENIZE
"
:
question_string
,
query_string
=
entity_link_token
(
entities
,
relations
,
question_string
,
query_string
)
if
er_link_style
==
"
REPLACE
"
:
# Helper function since entities and relations are replaced in the same way.
def
replace_entity_or_relation
(
er_source
,
question_string
):
for
er
in
er_source
:
uri
=
er
[
"
URI
"
].
strip
()
surface_form
=
er
[
"
surface form
"
].
strip
()
if
not
surface_form
:
continue
# Hopefully the uri includes a uri which we know how to shorten
for
prefix
in
prefixes
:
if
uri
.
startswith
(
prefix
):
uri
=
prefixes
[
prefix
]
+
uri
[
len
(
prefix
):]
break
# Find the first occurence of the surface form
er_index
=
question_string
.
find
(
surface_form
)
while
er_index
!=
-
1
:
# Check if the surface form is not part of an already replaced entity/relation
previous_colon
=
question_string
.
rfind
(
"
:
"
,
0
,
er_index
)
previous_space
=
question_string
.
rfind
(
"
"
,
0
,
er_index
)
previous_space
=
max
(
0
,
previous_space
)
# If there is a colon to the left and there is a space after it then it is part of an entity/relation
if
previous_colon
>
previous_space
and
previous_colon
!=
-
1
:
next_space
=
question_string
.
find
(
"
"
,
er_index
)
er_index
=
question_string
.
find
(
surface_form
,
next_space
)
continue
# Else replace the surface form with the uri
question_string
=
question_string
[:
er_index
]
+
uri
+
question_string
[
er_index
+
len
(
surface_form
):]
# Find the next occurence of the surface form
er_index
=
question_string
.
find
(
surface_form
,
er_index
+
len
(
uri
))
return
question_string
question_string
=
replace_entity_or_relation
(
entities
,
question_string
)
question_string
=
replace_entity_or_relation
(
relations
,
question_string
)
if
er_link_style
==
"
TOKENIZE_REPLACE
"
:
question_string
,
query_string
=
entity_link_token
(
entities
,
relations
,
question_string
,
query_string
)
question_string
=
entity_link_replace
(
entities
,
relations
,
question_string
,
query_string
)
out
.
write
(
f
'
\"
{
question_string
}
\"
,
\"
{
query_string
}
\"\n
'
)
if
__name__
==
"
__main__
"
:
main
()
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment