Improve tokenizer script to add more options

bc151a27 · Albin Henriksson · 1c8ba7a8 · bc151a27
Commit bc151a27 authored 2 years ago by Albin Henriksson
--- a/data/tokenizer.py
+++ b/data/tokenizer.py
@@ -2,10 +2,131 @@
 # Example usage: py tokenizer.py qald-9-test-linked.json
 # Example usage: py tokenizer.py qald-9-test-linked.json REPLACE
 # Note: er_link_style is the style of entity linking to use.
-# It can be "REPLACE", "APPEND" or blank. Default is no usage of entity links.
+# It can be "REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE" or blank. Default is no usage of entity links.
 import sys
 import json

+prefixes = {
+    "http://dbpedia.org/resource/": "res:",
+    "http://dbpedia.org/ontology/": "dbo:",
+    "http://dbpedia.org/property/": "dbp:",
+    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
+    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
+    "http://dbpedia.org/class/yago/": "yago:",
+    "http://www.wikidata.org/prop/direct/": "wdt:",
+    "http://www.wikidata.org/entity/": "wd:",
+    "http://www.wikidata.org/prop/": "p:",
+    "https://w3id.org/payswarm#": "ps:",
+    "http://www.wikidata.org/prop/qualifier/": "pq:",
+    "http://www.bigdata.com/rdf#": "bd:",
+    "http://wikiba.se/ontology#": "wikibase:",
+    "http://www.w3.org/2004/02/skos/core#": "skos:",
+}
+
+def entity_link_replace(entities, relations, question_string, query_string):
+    # Helper function since entities and relations are replaced in the same way.
+    def replace_entities_or_relations(er_source, question_string):
+        for er in er_source:
+            uri = er["URI"].strip()
+            surface_form = er["surface form"].strip()
+            if not surface_form:
+                continue
+            # Hopefully the uri starts with a prefix which we know how to shorten
+
+            for prefix in prefixes:
+                if uri.startswith(prefix):
+                    uri = prefixes[prefix] + uri[len(prefix):]
+                    break
+
+            # Find the first occurrence of the surface form
+            er_index = question_string.find(surface_form)
+            while er_index != -1:
+
+                # Check if the surface form is not part of an already replaced entity/relation
+                previous_colon = question_string.rfind(":", 0, er_index)
+                previous_space = question_string.rfind(" ", 0, er_index)
+                previous_space = max(0, previous_space)
+
+                # If a colon appears after the last space before the match, the match lies inside an already inserted prefixed uri, so skip it
+                if previous_colon > previous_space and previous_colon != -1:
+                    next_space = question_string.find(" ", er_index)
+                    er_index = question_string.find(surface_form, next_space)
+                    continue
+
+                # Else replace the surface form with the uri
+                question_string = question_string[:er_index] + uri + question_string[er_index+len(surface_form):]
+                # Find the next occurrence of the surface form
+                er_index = question_string.find(surface_form, er_index + len(uri))
+        return question_string
+
+    question_string = replace_entities_or_relations(entities, question_string)
+    question_string = replace_entities_or_relations(relations, question_string)
+    return question_string
+
+def entity_link_token(entities, relations, question_string, query_string):
+    n_relations = 0
+    n_entities = 0
+    def replace_entities_or_relations(er_source, question_string, query_string, n, token_prefix):
+        for er in er_source:
+            uri = er["URI"].strip()
+            surface_form = er["surface form"].strip()
+            if not surface_form:
+                continue
+            # Hopefully the uri starts with a prefix which we know how to shorten
+
+            for prefix in prefixes:
+                if uri.startswith(prefix):
+                    uri = prefixes[prefix] + uri[len(prefix):]
+                    break
+
+            # Find the first occurrence of the surface form
+            er_index = question_string.find(surface_form)
+            while er_index != -1:
+                # Make sure the uri exists in the query
+                er_query_index = query_string.find(uri)
+                # If it does not exist or there is not a space after it then it is not part of the query
+                if er_query_index == -1 or query_string[er_query_index+len(uri)] != " ":
+                    er_index = question_string.find(surface_form, er_index + len(surface_form))
+                    continue
+                # Replace the surface form with a placeholder token such as <e0> or <r0>
+                question_string = question_string[:er_index] + f"<{token_prefix}{n}>" + question_string[er_index+len(surface_form):]
+                # Replace the uri in the query with the matching placeholder token such as <e0> or <r0>
+                query_string = query_string[:er_query_index] + f"<{token_prefix}{n}>" + query_string[er_query_index+len(uri):]
+                # Find the next occurrence of the surface form
+                er_index = question_string.find(surface_form, er_index + len(surface_form))
+                n += 1
+        return question_string, query_string
+
+    question_string, query_string = replace_entities_or_relations(entities, question_string, query_string, n_entities, "e")
+    question_string, query_string = replace_entities_or_relations(relations, question_string, query_string, n_relations, "r")
+    return question_string, query_string
+
+def entity_link_append(entities, relations, question_string):
+    def append_entities_or_relations(er_source, question_string):
+        if (len(er_source) > 0):
+            question_string += " | "
+        # Append all entities/relations to the end of the question
+        for k, er in enumerate(er_source):
+            if k > 0:
+                question_string += " | "
+
+            uri = er["URI"]
+            surface_form = er["surface form"].strip()
+            # Hopefully the uri starts with a prefix which we know how to shorten
+
+            for prefix in prefixes:
+                if uri.startswith(prefix):
+                    uri = prefixes[prefix] + uri[len(prefix):]
+                    break
+            question_string += f"'{surface_form}'-{uri}"
+        return question_string
+
+    question_string = append_entities_or_relations(entities, question_string)
+    question_string = append_entities_or_relations(relations, question_string)
+    return question_string
+
+
+
 def main():
    print(sys.argv[0])
    if len(sys.argv) > 1:
@@ -15,7 +136,8 @@ def main():
        sys.exit(1)
    if len(sys.argv) > 2:
        er_link_style = sys.argv[2]
-        if (er_link_style != "REPLACE" and er_link_style != "APPEND"):
+        er_link_styles = ["REPLACE", "APPEND", "TOKENIZE", "TOKENIZE_REPLACE"]
+        if er_link_style not in er_link_styles:
            print("Please provide a valid entity/relationship linking style")
            sys.exit(1)
    else:
@@ -23,22 +145,6 @@ def main():

    output_file = input_file.replace(".json", "-tokenized.csv")

-    prefixes = {
-        "http://dbpedia.org/resource/": "res:",
-        "http://dbpedia.org/ontology/": "dbo:",
-        "http://dbpedia.org/property/": "dbp:",
-        "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
-        "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
-        "http://dbpedia.org/class/yago/": "yago:",
-        "http://www.wikidata.org/prop/direct/": "wdt:",
-        "http://www.wikidata.org/entity/": "wd:",
-        "http://www.wikidata.org/prop/": "p:",
-        "https://w3id.org/payswarm#": "ps:",
-        "http://www.wikidata.org/prop/qualifier/": "pq:",
-        "http://www.bigdata.com/rdf#": "bd:",
-        "http://wikiba.se/ontology#": "wikibase:",
-        "http://www.w3.org/2004/02/skos/core#": "skos:",
-    }

    # Generate a csv file with the tokenized questions from a linked json file
    with open(input_file, "r", encoding = "utf-8") as f:
@@ -80,64 +186,19 @@ def main():
            query_string = query_string.replace("COUNT(", "COUNT( ")

            if er_link_style == "APPEND":
-                # Append all entities to the end of the question
-                if (len(entities) > 0):
-                    question_string += " | "
-
-                for k, entity in enumerate(entities):
-                    if k > 0:
-                        question_string += " | "
+                question_string = entity_link_append(entities, relations, question_string)

-                    uri = entity["URI"]
-                    # Hopefully the uri includes a uri which we know how to shorten
+            if er_link_style == "REPLACE":
+                question_string = entity_link_replace(entities, relations, question_string, query_string)

-                    for prefix in prefixes:
-                        if uri.startswith(prefix):
-                            uri = prefixes[prefix] + uri[len(prefix):]
-                            break
-                    question_string += f"{uri}"
+            if er_link_style == "TOKENIZE":
+                question_string, query_string = entity_link_token(entities, relations, question_string, query_string)

-            if er_link_style == "REPLACE":
-                # Helper function since entities and relations are replaced in the same way.
-                def replace_entity_or_relation(er_source, question_string):
-                    for er in er_source:
-                        uri = er["URI"].strip()
-                        surface_form = er["surface form"].strip()
-                        if not surface_form:
-                            continue
-                        # Hopefully the uri includes a uri which we know how to shorten
-
-                        for prefix in prefixes:
-                            if uri.startswith(prefix):
-                                uri = prefixes[prefix] + uri[len(prefix):]
-                                break
-
-                        # Find the first occurence of the surface form
-                        er_index = question_string.find(surface_form)
-                        while er_index != -1:
-
-                            # Check if the surface form is not part of an already replaced entity/relation
-                            previous_colon = question_string.rfind(":", 0, er_index)
-                            previous_space = question_string.rfind(" ", 0, er_index)
-                            previous_space = max(0, previous_space)
-
-                            # If there is a colon to the left and there is a space after it then it is part of an entity/relation
-                            if previous_colon > previous_space and previous_colon != -1:
-                                next_space = question_string.find(" ", er_index)
-                                er_index = question_string.find(surface_form, next_space)
-                                continue
-
-                            # Else replace the surface form with the uri
-                            question_string = question_string[:er_index] + uri + question_string[er_index+len(surface_form):]
-                            # Find the next occurence of the surface form
-                            er_index = question_string.find(surface_form, er_index + len(uri))
-                    return question_string
-
-                question_string = replace_entity_or_relation(entities, question_string)
-                question_string = replace_entity_or_relation(relations, question_string)
+            if er_link_style == "TOKENIZE_REPLACE":
+                question_string, query_string = entity_link_token(entities, relations, question_string, query_string)
+                question_string = entity_link_replace(entities, relations, question_string, query_string)

            out.write(f'\"{question_string}\", \"{query_string}\"\n')
-
 if __name__ == "__main__":
    main()