diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c7c52a9d66bd25f7f9a09dad6a4bc2931fb148d9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +credentials.json \ No newline at end of file diff --git a/README.MD b/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..ee035e61919f66ce7983a1e19296ba72b83878cd --- /dev/null +++ b/README.MD @@ -0,0 +1,22 @@ +Text-to-scQL Dataset +==================== + +This repository can be used to generate a dataset of sentences and corresponding queries. + +The default is to use the following command to generate the dataset: + +```bash +./generate.py +``` + +It outputs three lines per dataset entry: + +1) a sentence in English +2) a scQL query +3) a JSON representation of the expected outcome of the query + +The script can also be used with the following arguments: + +* `--validate` can be used to validate the queries (check for errors, for instance). +* `--localhost` can be used to query an analyser server running locally (otherwise the server URI, username and password are read from `credentials.json`). +* `--no-model` can be used to generate training data without a model (only the sentence and the query are printed). diff --git a/generate.py b/generate.py index 958aa3cf972ddaa660bfd42d39d3a233c931d8b7..5b86cade130bc6c875addafb2d174f1555005f68 100755 --- a/generate.py +++ b/generate.py @@ -1,6 +1,29 @@ #!/usr/bin/python3 import itertools +import sys +import json + +# +# ./generate.py [options] +# --validate indicates that you want to validate and show the model for the queries generated by the script +# --localhost indicates that you want to use a scql_analyser server located on this computer (the default is to use the terra8 web server) +# --no-model indicates that you want to generate the dataset without generating models + +validate = '--validate' in sys.argv +localhost = '--localhost' in sys.argv +nomodel = '--no-model' in sys.argv + +if localhost: + WEBSERVER_URI = 'http://localhost:8181/api/get_model' + WEBSERVER_AUTH = None +else: + credentials = json.load(open('credentials.json')) + WEBSERVER_URI = credentials['uri'] +
WEBSERVER_AUTH = (credentials['username'], credentials['password']) + + + class select_query: def __init__(self, variables, constraints, restrictions): @@ -50,13 +73,23 @@ def add_query(queries, sentence_fragments, query_def): for s in sentences: queries.append([s, query_def]) return queries - + def generate_scql(queries): # Take a list of queries and generate the scQL query for s,q in queries: print(s) print(q.to_scql()) +def generate_model_scql(queries): + import requests + # Take a list of queries and print each sentence, its scQL query and the model returned by the analyser server + for s,q in queries: + print(s) + print(q.to_scql()) + payload = {'query': q.to_scql()} + r = requests.post(WEBSERVER_URI, data=payload, auth=WEBSERVER_AUTH).json() + print(r["model"]["models"]) + def validate_scql(queries): import requests from termcolor import colored @@ -64,7 +97,8 @@ def validate_scql(queries): results = [] for s,q in queries: payload = {'query': q.to_scql()} - r = requests.post('http://localhost:8181/api/get_model', data=payload).json() + + r = requests.post(WEBSERVER_URI, data=payload, auth=WEBSERVER_AUTH).json() # print(r) if not r['model']['parse']: print(f"Failed to parse: '{colored(q.to_scql(), 'blue')}' with error '{colored(r['model']['message'], 'red')}'") @@ -160,5 +194,10 @@ for bbd in human_sick_injured_statuses: queries = add_query(queries, [desire_perf, human_in_need_statuses], select_query([[mu("scql_types:salient_point"), "human"]], [["human.klass", "=", mu("ex:human")], ["human.status", "in", all_human_bad_statuses]], [])) -generate_scql(queries) -# validate_scql(queries) +if validate: + validate_scql(queries) +elif nomodel: + generate_scql(queries) +else: + generate_model_scql(queries) +