Skip to content
Snippets Groups Projects
Commit bc151a27 authored by Albin Henriksson's avatar Albin Henriksson
Browse files

Improve tokenizer script to add more options

parent 1c8ba7a8
Branches
No related tags found
No related merge requests found
......@@ -2,10 +2,131 @@
# Example usage: py tokenizer.py qald-9-test-linked.json
# Example usage: py tokenizer.py qald-9-test-linked.json REPLACE
# Note: er_link_style is the style of entity linking to use.
# It can be "REPLACE", "APPEND" or blank. Default is no usage of entity links.
# It can be "REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE" or blank. Default is no usage of entity links.
import sys
import json
# Maps full namespace URIs to the short prefixes used in the tokenized output.
#
# Ordering matters: the linking helpers shorten a URI with the FIRST key for
# which `uri.startswith(key)` is true, iterating in insertion order. A
# namespace that extends another one (".../prop/direct/" and
# ".../prop/qualifier/" both extend ".../prop/") must therefore be listed
# before the shorter one, otherwise its short prefix can never be selected.
prefixes = {
    "http://dbpedia.org/resource/": "res:",
    "http://dbpedia.org/ontology/": "dbo:",
    "http://dbpedia.org/property/": "dbp:",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
    "http://dbpedia.org/class/yago/": "yago:",
    "http://www.wikidata.org/prop/direct/": "wdt:",
    "http://www.wikidata.org/prop/qualifier/": "pq:",
    "http://www.wikidata.org/prop/": "p:",
    "http://www.wikidata.org/entity/": "wd:",
    "https://w3id.org/payswarm#": "ps:",
    "http://www.bigdata.com/rdf#": "bd:",
    "http://wikiba.se/ontology#": "wikibase:",
    "http://www.w3.org/2004/02/skos/core#": "skos:",
}
def entity_link_replace(entities, relations, question_string, query_string):
    """Replace linked surface forms in the question with their shortened URIs.

    Entities are substituted first, then relations, so a relation surface
    form that only occurs inside an already inserted URI is left alone.

    Args:
        entities: Iterable of dicts with the keys "URI" and "surface form".
        relations: Iterable of dicts with the same two keys.
        question_string: The natural-language question to rewrite.
        query_string: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        The question with every free-standing occurrence of each non-empty
        surface form replaced by the (prefix-shortened) URI.
    """
    # Helper function since entities and relations are replaced in the same way.
    def replace_entities_or_relations(er_source, question_string):
        for er in er_source:
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()
            if not surface_form:
                continue

            # Shorten the URI with the first known namespace it starts with;
            # URIs from unknown namespaces are inserted unshortened.
            for prefix in prefixes:
                if uri.startswith(prefix):
                    uri = prefixes[prefix] + uri[len(prefix):]
                    break

            er_index = question_string.find(surface_form)
            while er_index != -1:
                previous_colon = question_string.rfind(":", 0, er_index)
                # A missing space is treated as position 0 (start of string).
                previous_space = max(0, question_string.rfind(" ", 0, er_index))

                # A colon between the last space and this match means the
                # match sits inside an already inserted prefixed name
                # (e.g. the "Berlin" in "res:Berlin"): skip past that word.
                if previous_colon > previous_space:
                    next_space = question_string.find(" ", er_index)
                    if next_space == -1:
                        # The prefixed name runs to the end of the string, so
                        # nothing is left to scan. Searching from -1 would
                        # restart at the last character and could find the
                        # same single-character match forever.
                        break
                    er_index = question_string.find(surface_form, next_space)
                    continue

                # Free-standing occurrence: substitute the URI and continue
                # searching right after the inserted text.
                question_string = (
                    question_string[:er_index]
                    + uri
                    + question_string[er_index + len(surface_form):]
                )
                er_index = question_string.find(surface_form, er_index + len(uri))
        return question_string

    question_string = replace_entities_or_relations(entities, question_string)
    question_string = replace_entities_or_relations(relations, question_string)
    return question_string
def entity_link_token(entities, relations, question_string, query_string):
    """Replace linked entities/relations with placeholder tokens in both strings.

    For every linked item whose shortened URI occurs in the query as a whole
    word (followed by a space or the end of the string), the surface form in
    the question and the URI in the query are replaced by the same token:
    "<eN>" for entities and "<rN>" for relations. N starts at 0 for each of
    the two groups.

    Args:
        entities: Iterable of dicts with the keys "URI" and "surface form".
        relations: Iterable of dicts with the same two keys.
        question_string: The natural-language question.
        query_string: The query whose URIs are already written with the
            short prefixes from the module-level `prefixes` mapping.

    Returns:
        A `(question_string, query_string)` tuple with the substitutions
        applied.
    """
    n_relations = 0
    n_entities = 0

    def find_whole_uri(query, uri):
        # Index of the first occurrence of `uri` that is not merely the start
        # of a longer name (e.g. "res:Berlin" inside "res:Berlin_Wall"), or -1.
        if not uri:
            return -1
        start = query.find(uri)
        while start != -1:
            end = start + len(uri)
            # Reaching the end of the query counts as a boundary; indexing
            # query[end] there would raise IndexError.
            if end == len(query) or query[end] == " ":
                return start
            start = query.find(uri, start + 1)
        return -1

    def replace_entities_or_relations(er_source, question_string, query_string, n, token_prefix):
        for er in er_source:
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()
            if not surface_form:
                continue

            # Shorten the URI with the first known namespace it starts with.
            for prefix in prefixes:
                if uri.startswith(prefix):
                    uri = prefixes[prefix] + uri[len(prefix):]
                    break

            token = f"<{token_prefix}{n}>"
            er_index = question_string.find(surface_form)
            while er_index != -1:
                er_query_index = find_whole_uri(query_string, uri)
                if er_query_index == -1:
                    # The query has no (further) occurrence to pair with, so
                    # the remaining mentions in the question stay untouched.
                    break
                question_string = (
                    question_string[:er_index]
                    + token
                    + question_string[er_index + len(surface_form):]
                )
                query_string = (
                    query_string[:er_query_index]
                    + token
                    + query_string[er_query_index + len(uri):]
                )
                # Resume right after the inserted token; its length differs
                # from the surface form it replaced.
                er_index = question_string.find(surface_form, er_index + len(token))
            # NOTE(review): the counter advances once per linked item with a
            # non-empty surface form, so repeated mentions of the same item
            # share one token. Confirm this matches the intended numbering.
            n += 1
        return question_string, query_string

    question_string, query_string = replace_entities_or_relations(
        entities, question_string, query_string, n_entities, "e"
    )
    question_string, query_string = replace_entities_or_relations(
        relations, question_string, query_string, n_relations, "r"
    )
    return question_string, query_string
def entity_link_append(entities, relations, question_string):
    """Append every linked entity and relation to the end of the question.

    Each item is added as " | '<surface form>'-<shortened URI>", entities
    first and relations after them. The question text itself is not modified.

    Args:
        entities: Iterable of dicts with the keys "URI" and "surface form".
        relations: Iterable of dicts with the same two keys.
        question_string: The natural-language question.

    Returns:
        The question followed by one " | "-separated entry per linked item.
    """
    def append_entities_or_relations(er_source, question_string):
        entries = []
        for er in er_source:
            # Strip the URI like the surface form so padded input still
            # matches a known namespace and no stray whitespace is emitted.
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()

            # Shorten the URI with the first known namespace it starts with.
            for prefix in prefixes:
                if uri.startswith(prefix):
                    uri = prefixes[prefix] + uri[len(prefix):]
                    break

            entries.append(f" | '{surface_form}'-{uri}")
        return question_string + "".join(entries)

    question_string = append_entities_or_relations(entities, question_string)
    question_string = append_entities_or_relations(relations, question_string)
    return question_string
def main():
print(sys.argv[0])
if len(sys.argv) > 1:
......@@ -15,7 +136,8 @@ def main():
sys.exit(1)
if len(sys.argv) > 2:
er_link_style = sys.argv[2]
if (er_link_style != "REPLACE" and er_link_style != "APPEND"):
er_link_styles = ["REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE"]
if er_link_style not in er_link_styles:
print("Please provide a valid entity/relationship linking style")
sys.exit(1)
else:
......@@ -23,22 +145,6 @@ def main():
output_file = input_file.replace(".json", "-tokenized.csv")
prefixes = {
"http://dbpedia.org/resource/": "res:",
"http://dbpedia.org/ontology/": "dbo:",
"http://dbpedia.org/property/": "dbp:",
"http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
"http://dbpedia.org/class/yago/": "yago:",
"http://www.wikidata.org/prop/direct/": "wdt:",
"http://www.wikidata.org/entity/": "wd:",
"http://www.wikidata.org/prop/": "p:",
"https://w3id.org/payswarm#": "ps:",
"http://www.wikidata.org/prop/qualifier/": "pq:",
"http://www.bigdata.com/rdf#": "bd:",
"http://wikiba.se/ontology#": "wikibase:",
"http://www.w3.org/2004/02/skos/core#": "skos:",
}
# Generate a csv file with the tokenized questions from a linked json file
with open(input_file, "r", encoding = "utf-8") as f:
......@@ -80,64 +186,19 @@ def main():
query_string = query_string.replace("COUNT(", "COUNT( ")
if er_link_style == "APPEND":
# Append all entities to the end of the question
if (len(entities) > 0):
question_string += " | "
for k, entity in enumerate(entities):
if k > 0:
question_string += " | "
question_string = entity_link_append(entities, relations, question_string)
uri = entity["URI"]
# Hopefully the uri includes a uri which we know how to shorten
if er_link_style == "REPLACE":
question_string = entity_link_replace(entities, relations, question_string, query_string)
for prefix in prefixes:
if uri.startswith(prefix):
uri = prefixes[prefix] + uri[len(prefix):]
break
question_string += f"{uri}"
if er_link_style == "TOKENIZE":
question_string, query_string = entity_link_token(entities, relations, question_string, query_string)
if er_link_style == "REPLACE":
# Helper function since entities and relations are replaced in the same way.
def replace_entity_or_relation(er_source, question_string):
for er in er_source:
uri = er["URI"].strip()
surface_form = er["surface form"].strip()
if not surface_form:
continue
# Hopefully the uri includes a uri which we know how to shorten
for prefix in prefixes:
if uri.startswith(prefix):
uri = prefixes[prefix] + uri[len(prefix):]
break
# Find the first occurrence of the surface form
er_index = question_string.find(surface_form)
while er_index != -1:
# Check if the surface form is not part of an already replaced entity/relation
previous_colon = question_string.rfind(":", 0, er_index)
previous_space = question_string.rfind(" ", 0, er_index)
previous_space = max(0, previous_space)
# If the nearest colon to the left comes after the nearest space, the match sits inside an already inserted prefixed name (e.g. res:Berlin), so skip it
if previous_colon > previous_space and previous_colon != -1:
next_space = question_string.find(" ", er_index)
er_index = question_string.find(surface_form, next_space)
continue
# Else replace the surface form with the uri
question_string = question_string[:er_index] + uri + question_string[er_index+len(surface_form):]
# Find the next occurrence of the surface form
er_index = question_string.find(surface_form, er_index + len(uri))
return question_string
question_string = replace_entity_or_relation(entities, question_string)
question_string = replace_entity_or_relation(relations, question_string)
if er_link_style == "TOKENIZE_REPLACE":
question_string, query_string = entity_link_token(entities, relations, question_string, query_string)
question_string = entity_link_replace(entities, relations, question_string, query_string)
out.write(f'\"{question_string}\", \"{query_string}\"\n')
if __name__ == "__main__":
main()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment