From bc151a27b468947a0b87545601f4b03de8cf801b Mon Sep 17 00:00:00 2001
From: Albin Henriksson <albhe428@student.liu.se>
Date: Fri, 18 Nov 2022 13:52:45 +0100
Subject: [PATCH] Improve tokenizer script to add more options

---
 data/tokenizer.py | 203 ++++++++++++++++++++++++++++++----------------
 1 file changed, 132 insertions(+), 71 deletions(-)

diff --git a/data/tokenizer.py b/data/tokenizer.py
index 6195a8f..41d6f35 100644
--- a/data/tokenizer.py
+++ b/data/tokenizer.py
@@ -2,10 +2,131 @@
 # Example usage: py tokenizer.py qald-9-test-linked.json
 # Example usage: py tokenizer.py qald-9-test-linked.json REPLACE
 # Note: er_link_style is the style of entity linking to use.
-# It can be "REPLACE", "APPEND" or blank. Default is no usage of entity links.
+# It can be "REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE" or blank. Default is no usage of entity links.
 import sys
 import json
 
+prefixes = {
+    "http://dbpedia.org/resource/": "res:",
+    "http://dbpedia.org/ontology/": "dbo:",
+    "http://dbpedia.org/property/": "dbp:",
+    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
+    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
+    "http://dbpedia.org/class/yago/": "yago:",
+    "http://www.wikidata.org/prop/direct/": "wdt:",
+    "http://www.wikidata.org/entity/": "wd:",
+    "http://www.wikidata.org/prop/": "p:",
+    "https://w3id.org/payswarm#": "ps:",
+    "http://www.wikidata.org/prop/qualifier/": "pq:",
+    "http://www.bigdata.com/rdf#": "bd:",
+    "http://wikiba.se/ontology#": "wikibase:",
+    "http://www.w3.org/2004/02/skos/core#": "skos:",
+}
+
+def entity_link_replace(entities, relations, question_string, query_string):
+    # Helper function since entities and relations are replaced in the same way.
+    def replace_entities_or_relations(er_source, question_string):
+        for er in er_source:
+            uri = er["URI"].strip()
+            surface_form = er["surface form"].strip()
+            if not surface_form:
+                continue
+            # Hopefully the uri starts with a prefix which we know how to shorten
+
+            for prefix in prefixes:
+                if uri.startswith(prefix):
+                    uri = prefixes[prefix] + uri[len(prefix):]
+                    break
+
+            # Find the first occurrence of the surface form
+            er_index = question_string.find(surface_form)
+            while er_index != -1:
+
+                # Check if the surface form is not part of an already replaced entity/relation
+                previous_colon = question_string.rfind(":", 0, er_index)
+                previous_space = question_string.rfind(" ", 0, er_index)
+                previous_space = max(0, previous_space)
+
+                # If there is a colon directly to the left (no space in between) then it is part of an already replaced entity/relation
+                if previous_colon > previous_space and previous_colon != -1:
+                    next_space = question_string.find(" ", er_index)
+                    er_index = question_string.find(surface_form, next_space)
+                    continue
+
+                # Else replace the surface form with the uri
+                question_string = question_string[:er_index] + uri + question_string[er_index+len(surface_form):]
+                # Find the next occurrence of the surface form
+                er_index = question_string.find(surface_form, er_index + len(uri))
+        return question_string
+
+    question_string = replace_entities_or_relations(entities, question_string)
+    question_string = replace_entities_or_relations(relations, question_string)
+    return question_string
+
+def entity_link_token(entities, relations, question_string, query_string):
+    n_relations = 0
+    n_entities = 0
+    def replace_entities_or_relations(er_source, question_string, query_string, n, token_prefix):
+        for er in er_source:
+            uri = er["URI"].strip()
+            surface_form = er["surface form"].strip()
+            if not surface_form:
+                continue
+            # Hopefully the uri starts with a prefix which we know how to shorten
+
+            for prefix in prefixes:
+                if uri.startswith(prefix):
+                    uri = prefixes[prefix] + uri[len(prefix):]
+                    break
+
+            # Find the first occurrence of the surface form
+            er_index = question_string.find(surface_form)
+            while er_index != -1:
+                # Make sure the uri exists in the query
+                er_query_index = query_string.find(uri)
+                # If it does not exist or there is not a space after it then it is not part of the query
+                if er_query_index == -1 or query_string[er_query_index+len(uri)] != " ":
+                    er_index = question_string.find(surface_form, er_index + len(surface_form))
+                    continue
+                # Replace the surface form with the placeholder token
+                question_string = question_string[:er_index] + f"<{token_prefix}{n}>" + question_string[er_index+len(surface_form):]
+                # Replace the uri with the same placeholder token
+                query_string = query_string[:er_query_index] + f"<{token_prefix}{n}>" + query_string[er_query_index+len(uri):]
+                # Find the next occurrence of the surface form
+                er_index = question_string.find(surface_form, er_index + len(surface_form))
+            n += 1
+        return question_string, query_string
+
+    question_string, query_string = replace_entities_or_relations(entities, question_string, query_string, n_entities, "e")
+    question_string, query_string = replace_entities_or_relations(relations, question_string, query_string, n_relations, "r")
+    return question_string, query_string
+
+def entity_link_append(entities, relations, question_string):
+    def append_entities_or_relations(er_source, question_string):
+        if (len(er_source) > 0):
+            question_string += " | "
+        # Append all entities/relations to the end of the question
+        for k, er in enumerate(er_source):
+            if k > 0:
+                question_string += " | "
+
+            uri = er["URI"]
+            surface_form = er["surface form"].strip()
+            # Hopefully the uri starts with a prefix which we know how to shorten
+
+            for prefix in prefixes:
+                if uri.startswith(prefix):
+                    uri = prefixes[prefix] + uri[len(prefix):]
+                    break
+            question_string += f"'{surface_form}'-{uri}"
+        return question_string
+
+    question_string = append_entities_or_relations(entities, question_string)
+    question_string = append_entities_or_relations(relations, question_string)
+    return question_string
+
+
+
 def main():
     print(sys.argv[0])
     if len(sys.argv) > 1:
@@ -15,7 +136,8 @@ def main():
         sys.exit(1)
     if len(sys.argv) > 2:
         er_link_style = sys.argv[2]
-        if (er_link_style != "REPLACE" and er_link_style != "APPEND"):
+        er_link_styles = ["REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE"]
+        if er_link_style not in er_link_styles:
             print("Please provide a valid entity/relationship linking style")
             sys.exit(1)
     else:
@@ -23,22 +145,6 @@ def main():
 
     output_file = input_file.replace(".json", "-tokenized.csv")
 
-    prefixes = {
-        "http://dbpedia.org/resource/": "res:",
-        "http://dbpedia.org/ontology/": "dbo:",
-        "http://dbpedia.org/property/": "dbp:",
-        "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
-        "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
-        "http://dbpedia.org/class/yago/": "yago:",
-        "http://www.wikidata.org/prop/direct/": "wdt:",
-        "http://www.wikidata.org/entity/": "wd:",
-        "http://www.wikidata.org/prop/": "p:",
-        "https://w3id.org/payswarm#": "ps:",
-        "http://www.wikidata.org/prop/qualifier/": "pq:",
-        "http://www.bigdata.com/rdf#": "bd:",
-        "http://wikiba.se/ontology#": "wikibase:",
-        "http://www.w3.org/2004/02/skos/core#": "skos:",
-    }
 
     # Generate a csv file with the tokenized questions from a linked json file
     with open(input_file, "r", encoding = "utf-8") as f:
@@ -80,64 +186,19 @@ def main():
             query_string = query_string.replace("COUNT(", "COUNT( ")
 
             if er_link_style == "APPEND":
-                # Append all entities to the end of the question
-                if (len(entities) > 0):
-                    question_string += " | "
-
-                for k, entity in enumerate(entities):
-                    if k > 0:
-                        question_string += " | "
+                question_string = entity_link_append(entities, relations, question_string)
 
-                    uri = entity["URI"]
-                    # Hopefully the uri includes a uri which we know how to shorten
+            if er_link_style == "REPLACE":
+                question_string = entity_link_replace(entities, relations, question_string, query_string)
 
-                    for prefix in prefixes:
-                        if uri.startswith(prefix):
-                            uri = prefixes[prefix] + uri[len(prefix):]
-                            break
-                    question_string += f"{uri}"
+            if er_link_style == "TOKENIZE":
+                question_string, query_string = entity_link_token(entities, relations, question_string, query_string)
 
-            if er_link_style == "REPLACE":
-                # Helper function since entities and relations are replaced in the same way.
-                def replace_entity_or_relation(er_source, question_string):
-                    for er in er_source:
-                        uri = er["URI"].strip()
-                        surface_form = er["surface form"].strip()
-                        if not surface_form:
-                            continue
-                        # Hopefully the uri includes a uri which we know how to shorten
-
-                        for prefix in prefixes:
-                            if uri.startswith(prefix):
-                                uri = prefixes[prefix] + uri[len(prefix):]
-                                break
-
-                        # Find the first occurence of the surface form
-                        er_index = question_string.find(surface_form)
-                        while er_index != -1:
-
-                            # Check if the surface form is not part of an already replaced entity/relation
-                            previous_colon = question_string.rfind(":", 0, er_index)
-                            previous_space = question_string.rfind(" ", 0, er_index)
-                            previous_space = max(0, previous_space)
-
-                            # If there is a colon to the left and there is a space after it then it is part of an entity/relation
-                            if previous_colon > previous_space and previous_colon != -1:
-                                next_space = question_string.find(" ", er_index)
-                                er_index = question_string.find(surface_form, next_space)
-                                continue
-
-                            # Else replace the surface form with the uri
-                            question_string = question_string[:er_index] + uri + question_string[er_index+len(surface_form):]
-                            # Find the next occurence of the surface form
-                            er_index = question_string.find(surface_form, er_index + len(uri))
-                    return question_string
-
-                question_string = replace_entity_or_relation(entities, question_string)
-                question_string = replace_entity_or_relation(relations, question_string)
+            if er_link_style == "TOKENIZE_REPLACE":
+                question_string, query_string = entity_link_token(entities, relations, question_string, query_string)
+                question_string = entity_link_replace(entities, relations, question_string, query_string)
 
             out.write(f'\"{question_string}\", \"{query_string}\"\n')
-
 
 if __name__ == "__main__":
     main()
--
GitLab
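
A minimal usage sketch, not part of the patch, showing what the new TOKENIZE style is expected to produce. The entity list, question, and query below are invented for illustration, and importing entity_link_token assumes the snippet is run in the data/ directory next to the patched tokenizer.py.

    # Illustrative sketch only; the sample data is invented.
    from tokenizer import entity_link_token

    entities = [{"URI": "http://dbpedia.org/resource/Berlin", "surface form": "Berlin"}]
    relations = []
    question = "How many people live in Berlin ?"
    query = "SELECT ?n WHERE { res:Berlin dbo:populationTotal ?n }"

    # Both strings should come back sharing the <e0> placeholder:
    #   "How many people live in <e0> ?"
    #   "SELECT ?n WHERE { <e0> dbo:populationTotal ?n }"
    print(entity_link_token(entities, relations, question, query))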