Skip to content
Snippets Groups Projects
Commit 52956a93 authored by Albin's avatar Albin
Browse files

Merge branch 'main' of gitlab.liu.se:tdde19-2022-1/codebase into main

parents 69239437 bc151a27
No related branches found
No related tags found
No related merge requests found
......@@ -2,10 +2,131 @@
# Example usage: py tokenizer.py qald-9-test-linked.json
# Example usage: py tokenizer.py qald-9-test-linked.json REPLACE
# Note: er_link_style is the style of entity linking to use.
# It can be "REPLACE", "APPEND" or blank. Default is no usage of entity links.
# It can be "REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE" or blank. Default is no usage of entity links.
import sys
import json
# Maps full namespace URIs to the short prefixes used in the tokenized output.
#
# The lookup loops in this file iterate the dict in insertion order and stop
# at the first namespace the URI starts with. A namespace that extends another
# one (".../prop/direct/", ".../prop/qualifier/") must therefore be listed
# BEFORE the shorter one (".../prop/"), otherwise the shorter namespace
# shadows it and e.g. qualifier URIs come out as "p:qualifier/...".
prefixes = {
    "http://dbpedia.org/resource/": "res:",
    "http://dbpedia.org/ontology/": "dbo:",
    "http://dbpedia.org/property/": "dbp:",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
    "http://dbpedia.org/class/yago/": "yago:",
    "http://www.wikidata.org/prop/direct/": "wdt:",
    "http://www.wikidata.org/prop/qualifier/": "pq:",
    "http://www.wikidata.org/entity/": "wd:",
    "http://www.wikidata.org/prop/": "p:",
    "https://w3id.org/payswarm#": "ps:",
    "http://www.bigdata.com/rdf#": "bd:",
    "http://wikiba.se/ontology#": "wikibase:",
    "http://www.w3.org/2004/02/skos/core#": "skos:",
}
def entity_link_replace(entities, relations, question_string, query_string):
    """Replace linked surface forms in the question with their shortened URIs.

    Entities are processed first, then relations. Each item is a dict with
    the keys "URI" and "surface form"; both values are stripped before use
    and items with an empty surface form are skipped.

    Args:
        entities: Iterable of entity link dicts.
        relations: Iterable of relation link dicts.
        question_string: The question text to rewrite.
        query_string: Unused; kept so the signature matches the other
            linking functions.

    Returns:
        The question with every eligible surface form occurrence replaced.
    """

    # Helper function since entities and relations are replaced in the same way.
    def replace_entities_or_relations(er_source, question_string):
        for er in er_source:
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()
            if not surface_form:
                continue

            # Shorten the URI with the first known namespace it starts with;
            # a URI with an unknown namespace is inserted unchanged.
            for prefix in prefixes:
                if uri.startswith(prefix):
                    uri = prefixes[prefix] + uri[len(prefix):]
                    break

            # Find the first occurrence of the surface form
            er_index = question_string.find(surface_form)
            while er_index != -1:
                # A colon after the last preceding space means this occurrence
                # sits inside an already inserted URI (e.g. "res:Berlin").
                previous_colon = question_string.rfind(":", 0, er_index)
                previous_space = max(0, question_string.rfind(" ", 0, er_index))
                if previous_colon > previous_space:
                    next_space = question_string.find(" ", er_index)
                    if next_space == -1:
                        # The inserted URI runs to the end of the question, so
                        # nothing is left to scan. Searching from index -1
                        # would restart at the last character and could find
                        # the same occurrence again forever.
                        break
                    er_index = question_string.find(surface_form, next_space)
                    continue

                # Else replace the surface form with the uri
                question_string = (
                    question_string[:er_index]
                    + uri
                    + question_string[er_index + len(surface_form):]
                )
                # Find the next occurrence, resuming after the inserted uri
                er_index = question_string.find(surface_form, er_index + len(uri))
        return question_string

    question_string = replace_entities_or_relations(entities, question_string)
    question_string = replace_entities_or_relations(relations, question_string)
    return question_string
def entity_link_token(entities, relations, question_string, query_string):
    """Replace linked surface forms and their URIs with placeholder tokens.

    For every entity (token "<eN>") and relation (token "<rN>") whose
    shortened URI occurs in the query as a whole token, one occurrence of the
    surface form in the question and one occurrence of the URI in the query
    are replaced by the same token, repeating while both can still be found.

    N counts the position of the item within its list, starting at 0, and
    advances for every item with a non-empty surface form and URI, whether or
    not anything was replaced.
    NOTE(review): the source indentation was lost; the counter is assumed to
    advance once per item rather than once per replacement - confirm.

    Args:
        entities: Iterable of dicts with the keys "URI" and "surface form".
        relations: Iterable of dicts with the keys "URI" and "surface form".
        question_string: The question text to rewrite.
        query_string: The query text to rewrite.

    Returns:
        A (question_string, query_string) tuple with the tokens inserted.
    """

    def find_whole_uri(query_string, uri):
        # Index of the first occurrence of uri that is followed by a space or
        # ends the query, or -1. This keeps "res:Berlin" from matching the
        # start of "res:Berlin_Wall", and avoids indexing past the end of the
        # query when the uri is its last token.
        index = query_string.find(uri)
        while index != -1:
            end = index + len(uri)
            if end == len(query_string) or query_string[end] == " ":
                return index
            index = query_string.find(uri, index + 1)
        return -1

    def replace_entities_or_relations(er_source, question_string, query_string, n, token_prefix):
        for er in er_source:
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()
            # Without a surface form or a uri there is nothing to link
            if not surface_form or not uri:
                continue

            # Shorten the URI with the first known namespace it starts with
            for prefix in prefixes:
                if uri.startswith(prefix):
                    uri = prefixes[prefix] + uri[len(prefix):]
                    break

            token = f"<{token_prefix}{n}>"
            # Find the first occurrence of the surface form
            er_index = question_string.find(surface_form)
            while er_index != -1:
                # Make sure the uri (still) exists in the query
                er_query_index = find_whole_uri(query_string, uri)
                if er_query_index == -1:
                    # The query does not change until a replacement happens,
                    # so later occurrences of the surface form cannot match.
                    break
                # Replace the surface form with the token
                question_string = (
                    question_string[:er_index]
                    + token
                    + question_string[er_index + len(surface_form):]
                )
                # Replace the uri with the same token
                query_string = (
                    query_string[:er_query_index]
                    + token
                    + query_string[er_query_index + len(uri):]
                )
                # Resume right after the inserted token; its length differs
                # from the surface form it replaced.
                er_index = question_string.find(surface_form, er_index + len(token))
            n += 1
        return question_string, query_string

    question_string, query_string = replace_entities_or_relations(
        entities, question_string, query_string, 0, "e")
    question_string, query_string = replace_entities_or_relations(
        relations, question_string, query_string, 0, "r")
    return question_string, query_string
def entity_link_append(entities, relations, question_string):
    """Append every entity and relation link to the end of the question.

    Each link is rendered as "'<surface form>'-<shortened URI>" and preceded
    by " | ". Entities are appended first, then relations. URIs whose
    namespace is not in ``prefixes`` are appended unshortened.

    Args:
        entities: Sequence of dicts with the keys "URI" and "surface form".
        relations: Sequence of dicts with the keys "URI" and "surface form".
        question_string: The question text to extend.

    Returns:
        The question followed by the rendered links.
    """

    def append_entities_or_relations(er_source, question_string):
        rendered = []
        for er in er_source:
            # Strip the uri like the other linking functions do, so that a
            # value padded with whitespace still matches its namespace.
            uri = er["URI"].strip()
            surface_form = er["surface form"].strip()
            # Shorten the URI with the first known namespace it starts with
            for prefix in prefixes:
                if uri.startswith(prefix):
                    uri = prefixes[prefix] + uri[len(prefix):]
                    break
            rendered.append(f"'{surface_form}'-{uri}")
        # Every link, including the first one, is preceded by " | "
        return question_string + "".join(f" | {link}" for link in rendered)

    question_string = append_entities_or_relations(entities, question_string)
    question_string = append_entities_or_relations(relations, question_string)
    return question_string
def main():
print(sys.argv[0])
if len(sys.argv) > 1:
......@@ -15,7 +136,8 @@ def main():
sys.exit(1)
if len(sys.argv) > 2:
er_link_style = sys.argv[2]
if (er_link_style != "REPLACE" and er_link_style != "APPEND"):
er_link_styles = ["REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE"]
if er_link_style not in er_link_styles:
print("Please provide a valid entity/relationship linking style")
sys.exit(1)
else:
......@@ -23,22 +145,6 @@ def main():
output_file = input_file.replace(".json", "-tokenized.csv")
prefixes = {
"http://dbpedia.org/resource/": "res:",
"http://dbpedia.org/ontology/": "dbo:",
"http://dbpedia.org/property/": "dbp:",
"http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
"http://dbpedia.org/class/yago/": "yago:",
"http://www.wikidata.org/prop/direct/": "wdt:",
"http://www.wikidata.org/entity/": "wd:",
"http://www.wikidata.org/prop/": "p:",
"https://w3id.org/payswarm#": "ps:",
"http://www.wikidata.org/prop/qualifier/": "pq:",
"http://www.bigdata.com/rdf#": "bd:",
"http://wikiba.se/ontology#": "wikibase:",
"http://www.w3.org/2004/02/skos/core#": "skos:",
}
# Generate a csv file with the tokenized questions from a linked json file
with open(input_file, "r", encoding = "utf-8") as f:
......@@ -80,64 +186,19 @@ def main():
query_string = query_string.replace("COUNT(", "COUNT( ")
if er_link_style == "APPEND":
# Append all entities to the end of the question
if (len(entities) > 0):
question_string += " | "
for k, entity in enumerate(entities):
if k > 0:
question_string += " | "
question_string = entity_link_append(entities, relations, question_string)
uri = entity["URI"]
# Hopefully the uri includes a uri which we know how to shorten
if er_link_style == "REPLACE":
question_string = entity_link_replace(entities, relations, question_string, query_string)
for prefix in prefixes:
if uri.startswith(prefix):
uri = prefixes[prefix] + uri[len(prefix):]
break
question_string += f"{uri}"
if er_link_style == "TOKENIZE":
question_string, query_string = entity_link_token(entities, relations, question_string, query_string)
if er_link_style == "REPLACE":
# Helper function since entities and relations are replaced in the same way.
def replace_entity_or_relation(er_source, question_string):
for er in er_source:
uri = er["URI"].strip()
surface_form = er["surface form"].strip()
if not surface_form:
continue
# Hopefully the uri includes a uri which we know how to shorten
for prefix in prefixes:
if uri.startswith(prefix):
uri = prefixes[prefix] + uri[len(prefix):]
break
# Find the first occurence of the surface form
er_index = question_string.find(surface_form)
while er_index != -1:
# Check if the surface form is not part of an already replaced entity/relation
previous_colon = question_string.rfind(":", 0, er_index)
previous_space = question_string.rfind(" ", 0, er_index)
previous_space = max(0, previous_space)
# If there is a colon to the left and there is a space after it then it is part of an entity/relation
if previous_colon > previous_space and previous_colon != -1:
next_space = question_string.find(" ", er_index)
er_index = question_string.find(surface_form, next_space)
continue
# Else replace the surface form with the uri
question_string = question_string[:er_index] + uri + question_string[er_index+len(surface_form):]
# Find the next occurence of the surface form
er_index = question_string.find(surface_form, er_index + len(uri))
return question_string
question_string = replace_entity_or_relation(entities, question_string)
question_string = replace_entity_or_relation(relations, question_string)
if er_link_style == "TOKENIZE_REPLACE":
question_string, query_string = entity_link_token(entities, relations, question_string, query_string)
question_string = entity_link_replace(entities, relations, question_string, query_string)
out.write(f'\"{question_string}\", \"{query_string}\"\n')
if __name__ == "__main__":
main()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment