diff --git a/README.md b/README.md
index 441dc130f..4d610fe05 100644
--- a/README.md
+++ b/README.md
@@ -23,25 +23,24 @@
   `apt-get install python3-yaml python3-requests python3-click python3-distro python3-psutil python3-pexpect python3-pyftpdlib python3-statsd python3-selenium python3-pip gdb`
   the `python3-semver` on debian is too old - you need to use the pip version instead:
   `pip3 install semver beautifultable allure_python_commons certifi tabulate`
   Ubuntu 16.04 pip3 system package is broken. Fix like this:
   `dpkg -r python3-pip python3-pexpect`
   `python3.8 -m easy_install pip`
-  `pip install distro semver pexpect psutil beautifultable allure_python_commons certifi`
-
+  `pip install distro semver pexpect psutil beautifultable tabulate allure_python_commons certifi`
 - **centos**:
-  `yum update ; yum install python3 python3-pyyaml python36-PyYAML python3-requests python3-click gcc platform-python-devel python3-distro python3-devel python36-distro python36-click python36-pexpect python3-pexpect python3-pyftpdlib; pip3 install psutil semver beautifultable`
+  `yum update ; yum install python3 python3-pyyaml python36-PyYAML python3-requests python3-click gcc platform-python-devel python3-distro python3-devel python36-distro python36-click python36-pexpect python3-pexpect python3-pyftpdlib; pip3 install psutil semver beautifultable tabulate allure_python_commons certifi`
   `sudo yum install gdb`
 - **plain pip**:
   `pip3 install psutil pyyaml pexpect requests click semver ftplib selenium beautifultable tabulate allure_python_commons certifi`
   or:
   `pip install -r requirements.txt`
 ## Mac OS :
   `brew install gnu-tar`
-  `pip3 install click psutil requests pyyaml semver pexpect selenium beautifultable tabulate allure_python_commons certifi`
+  `pip3 install click psutil requests pyyaml semver pexpect selenium beautifultable markdown tabulate allure_python_commons certifi`
   `brew install gdb`
   if `python --version` is below 3.9 you also have to download ftplib:
   `pip3 install click ftplib`
diff --git a/containers/docker_rpm/Dockerfile b/containers/docker_rpm/Dockerfile
index 8579d205b..6c5afd4a4 100644
--- a/containers/docker_rpm/Dockerfile
+++ b/containers/docker_rpm/Dockerfile
@@ -17,8 +17,8 @@ run mkdir -p /home/release-test-automation \
     /home/entrypoint
 RUN mkdir -p /home/entrypoint
 
-RUN yum -y update; yum install -y python3 python3-pyyaml python36-PyYAML python3-requests python3-click gcc platform-python-devel python3-distro python3-devel python36-distro python36-click python36-pexpect python3-pexpect python3-pyftpdlib initscripts file gdb chromedriver chromium python3-markdown;
+RUN yum -y update; yum install -y python3 python3-pyyaml python36-PyYAML python3-requests python3-click gcc platform-python-devel python3-distro python3-devel python36-distro python36-click python36-pexpect python3-pexpect python3-pyftpdlib initscripts file gdb chromedriver chromium;
 RUN pip3 install selenium psutil semver click requests pyyaml distro pexpect beautifultable allure_python_commons tabulate certifi mss
 
 RUN (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done); \
     rm -rf /lib/systemd/system/multi-user.target.wants/*;\
diff --git a/containers/this_version.txt b/containers/this_version.txt
index 5625e59da..c068b2447 100644
--- a/containers/this_version.txt
+++ b/containers/this_version.txt
@@ -1 +1 @@
-1.2
+1.4
diff --git a/release_tester/arangodb/async_client.py b/release_tester/arangodb/async_client.py
index f43cee5dd..f94caa20e 100644
--- a/release_tester/arangodb/async_client.py
+++ b/release_tester/arangodb/async_client.py
@@ -47,6 +47,8 @@ def convert_result(result_array):
         result += "\n" + one_line[0].decode("utf-8").rstrip()
     return result
 
+def custom_writer(ArangoCLIprogressiveTimeoutExecutorInstance, writer):
+    """thread target: hand the running executor instance over to the user-supplied writer callback"""
+    writer(ArangoCLIprogressiveTimeoutExecutorInstance)
 
 class CliExecutionException(Exception):
     """transport CLI error texts"""
@@ -70,6 +72,7 @@ def __init__(self, config, connect_instance):
         """launcher class for cli tools"""
         self.connect_instance = connect_instance
         self.cfg = config
+        self.process = None
 
     def run_arango_tool_monitored(
         self,
@@ -79,6 +82,7 @@ def run_arango_tool_monitored(
         result_line,
         verbose,
         expect_to_fail=False,
+        writer=None
     ):
         """
         runs a script in background tracing with
@@ -93,18 +97,21 @@ def run_arango_tool_monitored(
             "--server.username", str(self.cfg.username),
             "--server.password", str(self.connect_instance.get_passvoid())
         ] + more_args
-        return self.run_monitored(executeable, run_cmd, timeout, result_line, verbose, expect_to_fail)
+        return self.run_monitored(executeable, run_cmd, timeout, result_line, verbose, expect_to_fail, writer=writer)
     # fmt: on
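The `writer` hook added here lets a caller stream data into the monitored child process over stdin while the two reader threads keep draining stdout/stderr. A minimal standalone sketch of that pattern, independent of the release-tester classes (illustrative only: `cat` stands in for the monitored arango tool, and none of these names exist in the repository):

```python
from subprocess import PIPE, Popen
from threading import Thread

def writer(proc):
    """producer thread: pump a few JSONL-ish lines into the child's stdin,
    then close it so the child sees EOF and can terminate"""
    for i in range(3):
        proc.stdin.write(("{\"count\": %d}\n" % i).encode())
    proc.stdin.close()

def reader(proc):
    """consumer thread: drain whatever the child echoes back"""
    for line in iter(proc.stdout.readline, b""):
        print("child said:", line.decode().rstrip())

proc = Popen(["cat"], stdin=PIPE, stdout=PIPE)  # 'cat' stands in for arangoimport
threads = [Thread(target=writer, args=(proc,)), Thread(target=reader, args=(proc,))]
for thr in threads:
    thr.start()
for thr in threads:
    thr.join()
print("exit code:", proc.wait())
```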
""" - + write_pipe = None + if writer is not None: + write_pipe = PIPE run_cmd = [executeable] + args lh.log_cmd(run_cmd, verbose) - process = Popen( + self.process = Popen( run_cmd, + stdin=write_pipe, stdout=PIPE, stderr=PIPE, close_fds=ON_POSIX, @@ -114,23 +121,31 @@ def run_monitored(self, executeable, args, timeout, result_line, verbose, expect thread1 = Thread( name="readIO", target=enqueue_stdout, - args=(process.stdout, queue, self.connect_instance), + args=(self.process.stdout, queue, self.connect_instance), ) thread2 = Thread( name="readErrIO", target=enqueue_stderr, - args=(process.stderr, queue, self.connect_instance), + args=(self.process.stderr, queue, self.connect_instance), ) thread1.start() thread2.start() + thread3 = None + if writer is not None: + thread3 = Thread( + name="WriteIO", + target=custom_writer, + args=(self, writer), + ) + thread3.start() try: print( "me PID:%d launched PID:%d with LWPID:%d and LWPID:%d" - % (os.getpid(), process.pid, thread1.native_id, thread2.native_id) + % (os.getpid(), self.process.pid, thread1.native_id, thread2.native_id) ) except AttributeError: - print("me PID:%d launched PID:%d with LWPID:N/A and LWPID:N/A" % (os.getpid(), process.pid)) + print("me PID:%d launched PID:%d with LWPID:N/A and LWPID:N/A" % (os.getpid(), self.process.pid)) # ... do other things here # out = logfile.open('wb') @@ -169,10 +184,12 @@ def run_monitored(self, executeable, args, timeout, result_line, verbose, expect timeout_str = "TIMEOUT OCCURED!" print(timeout_str) timeout_str += "\n" - process.kill() - rc_exit = process.wait() + self.process.kill() + rc_exit = self.process.wait() thread1.join() thread2.join() + if writer: + thread3.join() if have_timeout or rc_exit != 0: res = (False, timeout_str + convert_result(result), rc_exit, line_filter) diff --git a/release_tester/arangodb/imp.py b/release_tester/arangodb/imp.py index a0f1b051b..e19d1aeae 100644 --- a/release_tester/arangodb/imp.py +++ b/release_tester/arangodb/imp.py @@ -2,7 +2,12 @@ """ Run a javascript command by spawning an arangosh to the configured connection """ +import json +import csv +import ctypes + from arangodb.async_client import ArangoCLIprogressiveTimeoutExecutor, dummy_line_result +from tools.asciiprint import print_progress as progress def get_type_args(filename): @@ -13,15 +18,51 @@ def get_type_args(filename): return ["--type=json"] if str(filename).endswith("csv"): return ["--type=csv"] + if filename == "-": + return ["--type=jsonl"] raise NotImplementedError("no filename type encoding implemented for " + filename) +month_decode = { + "JAN":"01", + "FEB":"02", + "MAR":"03", + "APR":"04", + "MAY":"05", + "JUN":"06", + "JUL":"07", + "AUG":"08", + "SEP":"09", + "OCT":"10", + "NOV":"11", + "DEC":"12" +} + +def decode_date(date): + """convert date to something more arango'ish""" + if len(date) == 24: + month = date[3:6] + day = date[0:2] + year = date[7:11] + time = date[12:24] + year += "-" + year += month_decode.get(month, "01") + year += "-" + year += day + year += "T" + year += time + return year + return date class ArangoImportExecutor(ArangoCLIprogressiveTimeoutExecutor): """configuration""" # pylint: disable=W0102 + def __init__(self, config, connect_instance): + super().__init__(config, connect_instance) + self.wikidata_reader = None + self.wikidata_nlines = 0 - def run_import_monitored(self, args, timeout, verbose=True, expect_to_fail=False): + def run_import_monitored(self, args, timeout, verbose=True, expect_to_fail=False, writer=None): # pylint: disable=R0913 disable=R0902 
         """
         runs an import in background tracing with
@@ -40,9 +81,10 @@ def run_import_monitored(self, args, timeout, verbose=True, expect_to_fail=False
             dummy_line_result,
             verbose,
             expect_to_fail,
+            writer=writer
         )
 
-    def import_collection(self, collection_name, filename, more_args=[]):
+    def import_collection(self, collection_name, filename, more_args=[], writer=None):
         """import into any collection"""
         # fmt: off
         args = [
@@ -51,7 +93,7 @@ def import_collection(self, collection_name, filename, more_args=[]):
         ] + get_type_args(filename) + more_args
         # fmt: on
 
-        ret = self.run_import_monitored(args, timeout=20, verbose=self.cfg.verbose)
+        ret = self.run_import_monitored(args, timeout=20, verbose=self.cfg.verbose, writer=writer)
         return ret
 
     def import_smart_edge_collection(self, collection_name, filename, edge_relations, more_args=[]):
@@ -67,3 +109,39 @@ def import_smart_edge_collection(self, collection_name, filename, edge_relations
         ret = self.import_collection(collection_name, filename, more_args=args)
 
         return ret
+
+    def wikidata_writer(self):
+        """pipe the wikidata file into the importer while translating it"""
+        count = 0
+        for row in self.wikidata_reader:
+            count += 1
+            if count > self.wikidata_nlines:
+                print("imported enough, aborting.")
+                break
+            if count > 1:  # row 1 is the header line, skip it
+                line = json.dumps({
+                    'title': row[0],
+                    'body': row[2],
+                    'count': count,
+                    'created': decode_date(row[1])}) + "\n"
+                # print(line)
+                progress("I")
+                self.process.stdin.write(line.encode())
+        self.process.stdin.close()
+
+    def import_wikidata(self, collection_name, nlines, filename, more_args=[]):
+        """import by write piping"""
+        filedes = filename.open("r", encoding='utf-8', errors='replace')
+        self.wikidata_reader = csv.reader(filedes, delimiter='\t')
+        self.wikidata_nlines = nlines
+        # Override csv default 128k field size
+        csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
+
+        # args = get_type_args('foo.json') + more_args
+        args = ['--create-collection', 'true'] + more_args
+        ret = self.import_collection(
+            collection_name,
+            filename="-",
+            more_args=args,
+            # unbound method: run_monitored's writer thread calls it with the executor as `self`
+            writer=ArangoImportExecutor.wikidata_writer)
+        return ret
diff --git a/release_tester/arangodb/starter/deployments/activefailover.py b/release_tester/arangodb/starter/deployments/activefailover.py
index d2c65a960..4c8586507 100644
--- a/release_tester/arangodb/starter/deployments/activefailover.py
+++ b/release_tester/arangodb/starter/deployments/activefailover.py
@@ -211,6 +211,8 @@ def test_setup_impl(self):
         if self.selenium:
             self.set_selenium_instances()
             self.selenium.test_setup()
+        self.wikidata_import_impl()
+        self.execute_views_tests_impl()
 
     def wait_for_restore_impl(self, backup_starter):
         backup_starter.wait_for_restore()
diff --git a/release_tester/arangodb/starter/deployments/cluster.py b/release_tester/arangodb/starter/deployments/cluster.py
index 0494583c1..0699f04ec 100644
--- a/release_tester/arangodb/starter/deployments/cluster.py
+++ b/release_tester/arangodb/starter/deployments/cluster.py
@@ -132,6 +132,8 @@ def finish_setup_impl(self):
     def test_setup_impl(self):
         if self.selenium:
             self.selenium.test_setup()
+        self.wikidata_import_impl()
+        self.execute_views_tests_impl()
 
     def wait_for_restore_impl(self, backup_starter):
         for starter in self.starter_instances:
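For reference, the wikidata pipeline in imp.py above boils down to: read the TSV dump, convert each row to a JSONL document (rewriting the `DD MON YYYY HH:MM:SS.mmm` timestamps via `decode_date`), and pump the lines into `arangoimport --type=jsonl` reading from `-`. A runnable sketch of just the translation step, printing to stdout instead of the arangoimport stdin pipe (the sample row is invented; the column layout title/created/body follows the code above):

```python
import csv
import io
import json

from arangodb.imp import decode_date  # assumes release_tester/ is on PYTHONPATH

SAMPLE_TSV = (
    "title\tcreated\tbody\n"
    "Some page\t26 APR 2014 17:01:56.936\tpage text\n"
)

for count, row in enumerate(csv.reader(io.StringIO(SAMPLE_TSV), delimiter="\t")):
    if count == 0:
        continue  # row 0 is the header line
    print(json.dumps({
        "title": row[0],
        "body": row[2],
        "count": count,
        "created": decode_date(row[1]),  # -> "2014-04-26T17:01:56.936"
    }))
```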
diff --git a/release_tester/arangodb/starter/deployments/dc2dc.py b/release_tester/arangodb/starter/deployments/dc2dc.py
index 608f587e6..393079cd1 100644
--- a/release_tester/arangodb/starter/deployments/dc2dc.py
+++ b/release_tester/arangodb/starter/deployments/dc2dc.py
@@ -430,6 +430,9 @@ def test_setup_impl(self):
             print(res[1])
             raise Exception("replication fuzzing test failed")
         self._get_in_sync(12)
+        self.wikidata_import_impl()
+        self.execute_views_tests_impl()
+        self._get_in_sync(12)
 
     def wait_for_restore_impl(self, backup_starter):
         for dbserver in self.cluster1["instance"].get_dbservers():
diff --git a/release_tester/arangodb/starter/deployments/leaderfollower.py b/release_tester/arangodb/starter/deployments/leaderfollower.py
index e46ad365a..d3bc78b98 100644
--- a/release_tester/arangodb/starter/deployments/leaderfollower.py
+++ b/release_tester/arangodb/starter/deployments/leaderfollower.py
@@ -1,20 +1,21 @@
 #!/usr/bin/env python
 """ launch and manage an arango deployment using the starter"""
 import time
+import os
 import logging
 from pathlib import Path
 
-from tools.interact import prompt_user
-from tools.killall import get_all_processes
+from arangodb.async_client import dummy_line_result
 from arangodb.starter.manager import StarterManager
 from arangodb.instance import InstanceType
 from arangodb.starter.deployments.runner import Runner, RunnerProperties
 import tools.loghelper as lh
+from tools.interact import prompt_user
+from tools.killall import get_all_processes
 from tools.asciiprint import print_progress as progress
 from reporting.reporting_utils import step
 
-
 class LeaderFollower(Runner):
     """this runs a leader / Follower setup with synchronisation"""
@@ -224,7 +225,8 @@ def test_setup_impl(self):
         self.make_data()
         if self.selenium:
             self.selenium.test_setup()
-
+        self.wikidata_import_impl()
+        self.execute_views_tests_impl()
         logging.info("Leader follower setup successfully finished!")
 
     @step
diff --git a/release_tester/arangodb/starter/deployments/runner.py b/release_tester/arangodb/starter/deployments/runner.py
index 3254ab576..a391ffbb8 100644
--- a/release_tester/arangodb/starter/deployments/runner.py
+++ b/release_tester/arangodb/starter/deployments/runner.py
@@ -26,8 +26,10 @@
 import tools.loghelper as lh
 from reporting.reporting_utils import step
 
-
-from arangodb.async_client import CliExecutionException
+from arangodb.async_client import (
+    CliExecutionException,
+    dummy_line_result
+)
 from arangodb.bench import load_scenarios
 from arangodb.instance import InstanceType, print_instances_table
 from arangodb.sh import ArangoshExecutor
@@ -756,6 +758,61 @@ def check_non_backup_data(self):
 
     def make_data_wait_for_upgrade_impl(self):
         """check the data after the upgrade"""
+
+    @step
+    def wikidata_import_impl(self, collection_name='wikipedia'):
+        """upload wikipedia from the CSV file named by the WIKI_DATA environment variable"""
+        self.makedata_instances[0].arango_importer.import_wikidata(
+            collection_name,
+            nlines=10000,
+            filename=Path(os.environ["WIKI_DATA"]))
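`wikidata_import_impl` assumes the test driver exported `WIKI_DATA` with the path of the benchmark TSV dump before any deployment runner reaches `test_setup_impl`; a minimal precondition check along these lines (the path is an assumed example, not from the repository):

```python
import os
from pathlib import Path

os.environ.setdefault("WIKI_DATA", "/home/test_dir/benchmark.data")  # assumed example path
dump = Path(os.environ["WIKI_DATA"])
if not dump.is_file():
    raise FileNotFoundError("WIKI_DATA must point at the wikipedia TSV dump, got: %s" % dump)
```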
cmd=["setting up test data", this_test], + args=[], + timeout=50, + verbose=self.cfg.verbose, + result_line=dummy_line_result + ) + if not ret[0]: + ret_failed.append( + {(one_test + " failed") : ret}) + else: + ret = self.makedata_instances[0].arangosh.run_in_arangosh( + this_test, + [], + [self.makedata_instances[0].get_frontend().get_public_url("root:%s@" % self.passvoid)], + ) + if not ret[0]: + ret_failed.append( + {(one_test + " failed") : ret}) + if len(ret_failed) is not 0: + print(ret_failed) + raise Exception('tests failed!') + @step def before_backup(self): """preparing SUT for the execution of the backup steps""" diff --git a/requirements.txt b/requirements.txt index 76aab28a6..f5dac367d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ markdown allure_python_commons certifi mss +csv diff --git a/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/LICENSE b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/LICENSE new file mode 100644 index 000000000..af5124c47 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Fabvalaaah - fabvalaaah@laposte.net + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/README.md b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/README.md new file mode 100644 index 000000000..8a6993cb6 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/README.md @@ -0,0 +1,48 @@ +# damerau-levenshtein-js + +NPM package that calculates synchronously or asynchronously the Damerau-Levenshtein distance between strings. + +## Installation + +`npm i damerau-levenshtein-js` + +## Usage + +Call to "distance" or "distanceProm" functions outputs an integer, the calculated Damerau-Levenshtein distance between 2 strings given as parameters. If the result is 0, strings are identical. The higher the result, the less similar strings are. If the result is -1, it means that a problem occurred because of a bad parameter (e.g. null or undefined). Call to "minDistanceProm" function outputs the minimum distance between a string and a list of strings given as parameters. 
+ +### Require + +```javascript +const dljs = require("damerau-levenshtein-js"); +``` + +### Synchronous Damerau-Levenshtein distance calculation between 2 strings + +```javascript +let result = dljs.distance("hello here", "hello there"); +``` + +### Asynchronous Damerau-Levenshtein distance calculation between 2 strings + +```javascript +dljs.distanceProm("hello here", "hello there") + .then((result) => ...) + .catch((result) => ...); +``` + +### Asynchronous minimum Damerau-Levenshtein distance calculation between a string and an array of strings + +```javascript +let list = ["hello here", "hello there", "world", "world hello"]; +dljs.minDistanceProm("hello world", list) + .then((result) => ...) + .catch((result) => ...); +``` + +### Tests + +`npm test` + +## Disclaimer + +I am not responsible in any way of any consequence of the usage of this piece of software. You are warned, use it at your own risks. diff --git a/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.distance.spec.js b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.distance.spec.js new file mode 100644 index 000000000..58e8d6cb7 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.distance.spec.js @@ -0,0 +1,93 @@ +const dljs = require("../app"); + +describe("distance", () => { + test("with correct values", () => { + const result = dljs.distance("hello here", "hello there"); + + expect(result).toStrictEqual(1); + }); + + test("with correct values (inverts)", () => { + const result = dljs.distance("hello here", "ereh olleh"); + + expect(result).toStrictEqual(8); + }); + + test("with correct values (equals)", () => { + const result = dljs.distance("hello here", "hello here"); + + expect(result).toStrictEqual(0); + }); + + test("with empty 1", () => { + const result = dljs.distance("", "hello there"); + + expect(result).toStrictEqual("hello there".length); + }); + + test("with empty 2", () => { + const result = dljs.distance("hello here", ""); + + expect(result).toStrictEqual("hello here".length); + }); + + test("with empty 1 & 2", () => { + const result = dljs.distance("", ""); + + expect(result).toStrictEqual(0); + }); + + test("with wrong type 1", () => { + const result = dljs.distance(8, "hello there"); + + expect(result).toStrictEqual(-1); + }); + + test("with wrong type 2", () => { + const result = dljs.distance("hello here", 8); + + expect(result).toStrictEqual(-1); + }); + + test("with wrong type 1 & 2", () => { + const result = dljs.distance(7, 8); + + expect(result).toStrictEqual(-1); + }); + + test("with null 1", () => { + const result = dljs.distance(null, "hello there"); + + expect(result).toStrictEqual(-1); + }); + + test("with null 2", () => { + const result = dljs.distance("hello here", null); + + expect(result).toStrictEqual(-1); + }); + + test("with null 1 & 2", () => { + const result = dljs.distance(null, null); + + expect(result).toStrictEqual(-1); + }); + + test("with undefined 1", () => { + const result = dljs.distance(undefined, "hello there"); + + expect(result).toStrictEqual(-1); + }); + + test("with undefined 2", () => { + const result = dljs.distance("hello here", undefined); + + expect(result).toStrictEqual(-1); + }); + + test("with undefined 1 & 2", () => { + const result = dljs.distance(undefined, undefined); + + expect(result).toStrictEqual(-1); + }); +}); diff --git a/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.distanceProm.spec.js 
b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.distanceProm.spec.js new file mode 100644 index 000000000..cb27cb456 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.distanceProm.spec.js @@ -0,0 +1,120 @@ +const dljs = require("../app"); + +describe("distanceProm", () => { + test("with correct values", async () => { + const result = await dljs.distanceProm("hello here", "hello there"); + + expect(result).toStrictEqual(1); + }); + + test("with correct values (inverts)", async () => { + const result = await dljs.distanceProm("hello here", "ereh olleh"); + + expect(result).toStrictEqual(8); + }); + + test("with correct values (equals)", async () => { + const result = await dljs.distanceProm("hello here", "hello here"); + + expect(result).toStrictEqual(0); + }); + + test("with empty 1", async () => { + const result = await dljs.distanceProm("", "hello there"); + + expect(result).toStrictEqual("hello there".length); + }); + + test("with empty 2", async () => { + const result = await dljs.distanceProm("hello here", ""); + + expect(result).toStrictEqual("hello here".length); + }); + + test("with empty 1 & 2", async () => { + const result = await dljs.distanceProm("", ""); + + expect(result).toStrictEqual(0); + }); + + test("with wrong type 1", async () => { + try { + const result = await dljs.distanceProm(8, "hello there"); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with wrong type 2", async () => { + try { + const result = await dljs.distanceProm("hello here", 8); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with wrong type 1 & 2", async () => { + try { + const result = await dljs.distanceProm(7, 8); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with null 1", async () => { + try { + const result = await dljs.distanceProm(null, "hello there"); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with null 2", async () => { + try { + const result = await dljs.distanceProm("hello here", null); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with null 1 & 2", async () => { + try { + const result = await dljs.distanceProm(null, null); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with undefined 1", async () => { + try { + const result = await dljs.distanceProm(undefined, "hello there"); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with undefined 2", async () => { + try { + const result = await dljs.distanceProm("hello here", undefined); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with undefined 1 & 2", async () => { + try { + const result = await dljs.distanceProm(undefined, undefined); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); +}); diff --git a/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.minDistanceProm.spec.js b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.minDistanceProm.spec.js new file mode 100644 index 000000000..88a106ab6 --- /dev/null +++ 
b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/__tests__/app.minDistanceProm.spec.js @@ -0,0 +1,131 @@ +const dljs = require("../app"); + +describe("minDistanceProm", () => { + const list = ["hello here", "hello there", "world", "world hello"]; + + test("with correct values", async () => { + const result = await dljs.minDistanceProm("hello world", list); + + expect(result).toStrictEqual(4); + }); + + test("with correct values (inverts)", async () => { + const result = await dljs.minDistanceProm("ereh olleh", list); + + expect(result).toStrictEqual(8); + }); + + test("with correct values (equals)", async () => { + const result = await dljs.minDistanceProm("hello here", list); + + expect(result).toStrictEqual(0); + }); + + test("with empty 1", async () => { + const result = await dljs.minDistanceProm("", list); + + expect(result).toStrictEqual("world".length); + }); + + test("with empty 2", async () => { + const result = await dljs.minDistanceProm("hello here", []); + + expect(result).toStrictEqual(await dljs.distanceProm("hello here", "")); + }); + + test("with empty 1 & 2", async () => { + const result = await dljs.minDistanceProm("", []); + + expect(result).toStrictEqual(0); + }); + + test("with wrong type 1", async () => { + try { + await dljs.minDistanceProm(8, list); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with wrong type 2", async () => { + try { + await dljs.minDistanceProm("hello here", 8); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with wrong type 1 & 2", async () => { + try { + await dljs.minDistanceProm(7, 8); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with list containing wrong types", async () => { + try { + await dljs.minDistanceProm("hello here", ["world", 1, true]); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with null 1", async () => { + try { + await dljs.minDistanceProm(null, list); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with null 2", async () => { + try { + await dljs.minDistanceProm("hello here", null); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with null 1 & 2", async () => { + try { + await dljs.minDistanceProm(null, null); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with undefined 1", async () => { + try { + await dljs.minDistanceProm(undefined, list); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with undefined 2", async () => { + try { + await dljs.minDistanceProm("hello here", undefined); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); + + test("with undefined 1 & 2", async () => { + try { + await dljs.minDistanceProm(undefined, undefined); + fail("should not end up here"); + } catch (result) { + expect(result).toStrictEqual(-1); + } + }); +}); diff --git a/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/app.js b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/app.js new file mode 100644 index 000000000..c184b0c8e --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/app.js @@ -0,0 +1,132 @@ +/** + * MIT 
License + * + * Copyright (c) 2018 Fabvalaaah - fabvalaaah@laposte.net + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * DISCLAIMER: + * I am not responsible in any way of any consequence of the usage of this piece + * of software. You are warned, use it at your own risks. + */ + +const initMatrix = (s1, s2) => { + /* istanbul ignore next */ + if (undefined == s1 || undefined == s2) { + return null; + } + + let d = []; + for (let i = 0; i <= s1.length; i++) { + d[i] = []; + d[i][0] = i; + } + for (let j = 0; j <= s2.length; j++) { + d[0][j] = j; + } + + return d; +}; + +const damerau = (i, j, s1, s2, d, cost) => { + if (i > 1 && j > 1 && s1[i - 1] === s2[j - 2] && s1[i - 2] === s2[j - 1]) { + d[i][j] = Math.min.apply(null, [d[i][j], d[i - 2][j - 2] + cost]); + } +}; + +const distance = (s1, s2) => { + if ( + undefined == s1 || + undefined == s2 || + "string" !== typeof s1 || + "string" !== typeof s2 + ) { + return -1; + } + + let d = initMatrix(s1, s2); + /* istanbul ignore next */ + if (null === d) { + return -1; + } + for (var i = 1; i <= s1.length; i++) { + let cost; + for (let j = 1; j <= s2.length; j++) { + if (s1.charAt(i - 1) === s2.charAt(j - 1)) { + cost = 0; + } else { + cost = 1; + } + + d[i][j] = Math.min.apply(null, [ + d[i - 1][j] + 1, + d[i][j - 1] + 1, + d[i - 1][j - 1] + cost, + ]); + + damerau(i, j, s1, s2, d, cost); + } + } + + return d[s1.length][s2.length]; +}; + +const distanceProm = (s1, s2) => + new Promise((resolve, reject) => { + let result = distance(s1, s2); + if (0 <= result) { + resolve(result); + } else { + reject(result); + } + }); + +const minDistanceProm = (s1, list) => + new Promise((resolve, reject) => { + if (undefined == list || !Array.isArray(list)) { + reject(-1); + return; + } else if (0 === list.length) { + resolve(distance(s1, "")); + return; + } + + let min = -2; + + list.forEach((s2) => { + let d = distance(s1, s2); + if (-2 === min || d < min) { + min = d; + } + }); + + if (0 <= min) { + resolve(min); + } else { + reject(min); + } + }); + +module.exports = { + distanceProm, + distance, + minDistanceProm, +}; diff --git a/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/package.json b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/package.json new file mode 100644 index 000000000..19ed68e97 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/damerau-levenshtein-js/package.json @@ -0,0 +1,67 @@ +{ + "_from": "damerau-levenshtein-js", + "_id": "damerau-levenshtein-js@1.1.6", + 
"_inBundle": false, + "_integrity": "sha512-D/ZPDdlxY9yWzH9WZHATDoaeafqWi+EEK0kFfHdumxXKneNLVBhplETxIYPv3yIWGISpICdOruV8f9eMTA4nrA==", + "_location": "/damerau-levenshtein-js", + "_phantomChildren": {}, + "_requested": { + "type": "tag", + "registry": true, + "raw": "damerau-levenshtein-js", + "name": "damerau-levenshtein-js", + "escapedName": "damerau-levenshtein-js", + "rawSpec": "", + "saveSpec": null, + "fetchSpec": "latest" + }, + "_requiredBy": [ + "#USER", + "/" + ], + "_resolved": "https://registry.npmjs.org/damerau-levenshtein-js/-/damerau-levenshtein-js-1.1.6.tgz", + "_shasum": "b49d09fd3dd75cbf993669c254e2c6ed9a19f39b", + "_spec": "damerau-levenshtein-js", + "_where": "C:\\Working\\release-qa\\arangosearch", + "author": { + "name": "Fabvalaaah" + }, + "bugs": { + "url": "https://github.com/fabvalaaah/damerau-levenshtein-js/issues" + }, + "bundleDependencies": false, + "contributors": [ + { + "name": "Fabvalaaah" + } + ], + "deprecated": false, + "description": "NPM package that calculates synchronously or asynchronously the Damerau-Levenshtein distance between strings", + "devDependencies": { + "jest": "26.0.1" + }, + "email": "fabvalaaah@laposte.net", + "homepage": "https://github.com/fabvalaaah/damerau-levenshtein-js#readme", + "jest": { + "testEnvironment": "node" + }, + "keywords": [ + "damerau", + "levenshtein", + "damerau-levenshtein", + "levenshtein-damerau", + "string", + "distance" + ], + "license": "MIT", + "main": "app.js", + "name": "damerau-levenshtein-js", + "repository": { + "type": "git", + "url": "git+https://github.com/fabvalaaah/damerau-levenshtein-js.git" + }, + "scripts": { + "test": "jest --coverage" + }, + "version": "1.1.6" +} diff --git a/test_data/tests/arangosearch/3rdparty/js-levenshtein/LICENSE b/test_data/tests/arangosearch/3rdparty/js-levenshtein/LICENSE new file mode 100644 index 000000000..fcda385cc --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/js-levenshtein/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Gustaf Andersson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/test_data/tests/arangosearch/3rdparty/js-levenshtein/README.md b/test_data/tests/arangosearch/3rdparty/js-levenshtein/README.md new file mode 100644 index 000000000..f2f9650d0 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/js-levenshtein/README.md @@ -0,0 +1,59 @@ +# js-levenshtein [![Build Status](https://travis-ci.org/gustf/js-levenshtein.svg?branch=master)](https://travis-ci.org/gustf/js-levenshtein) + +A very efficient JS implementation calculating the Levenshtein distance, i.e. the difference between two strings. + +Based on Wagner-Fischer dynamic programming algorithm, optimized for speed and memory + - use a single distance vector instead of a matrix + - loop unrolling on the outer loop + - remove common prefixes/postfixes from the calculation + - minimize the number of comparisons + +## Install + +``` +$ npm install --save js-levenshtein +``` + + +## Usage + +```js +const levenshtein = require('js-levenshtein'); + +levenshtein('kitten', 'sitting'); +//=> 3 +``` + + +## Benchmark + +``` +$ npm run bench + + 50 paragraphs, length max=500 min=240 avr=372.5 + 162 op/s » js-levenshtein + 98 op/s » talisman + 94 op/s » levenshtein-edit-distance + 85 op/s » leven + 39 op/s » fast-levenshtein + + 100 sentences, length max=170 min=6 avr=57.5 + 3,076 op/s » js-levenshtein + 2,024 op/s » talisman + 1,817 op/s » levenshtein-edit-distance + 1,633 op/s » leven + 800 op/s » fast-levenshtein + + 2000 words, length max=20 min=3 avr=9.5 + 3,119 op/s » js-levenshtein + 2,416 op/s » talisman + 2,141 op/s » levenshtein-edit-distance + 1,855 op/s » leven + 1,260 op/s » fast-levenshtein +``` + +Benchmarks was performed with node v8.12.0 on a MacBook Pro 15", 2.9 GHz Intel Core i9 + +## License + +MIT © Gustaf Andersson \ No newline at end of file diff --git a/test_data/tests/arangosearch/3rdparty/js-levenshtein/index.js b/test_data/tests/arangosearch/3rdparty/js-levenshtein/index.js new file mode 100644 index 000000000..e3ae013c8 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/js-levenshtein/index.js @@ -0,0 +1,105 @@ +'use strict'; +module.exports = (function() +{ + function _min(d0, d1, d2, bx, ay) + { + return d0 < d1 || d2 < d1 + ? d0 > d2 + ? d2 + 1 + : d0 + 1 + : bx === ay + ? 
d1 + : d1 + 1; + } + + return function(a, b) + { + if (a === b) { + return 0; + } + + if (a.length > b.length) { + var tmp = a; + a = b; + b = tmp; + } + + var la = a.length; + var lb = b.length; + + while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) { + la--; + lb--; + } + + var offset = 0; + + while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) { + offset++; + } + + la -= offset; + lb -= offset; + + if (la === 0 || lb < 3) { + return lb; + } + + var x = 0; + var y; + var d0; + var d1; + var d2; + var d3; + var dd; + var dy; + var ay; + var bx0; + var bx1; + var bx2; + var bx3; + + var vector = []; + + for (y = 0; y < la; y++) { + vector.push(y + 1); + vector.push(a.charCodeAt(offset + y)); + } + + var len = vector.length - 1; + + for (; x < lb - 3;) { + bx0 = b.charCodeAt(offset + (d0 = x)); + bx1 = b.charCodeAt(offset + (d1 = x + 1)); + bx2 = b.charCodeAt(offset + (d2 = x + 2)); + bx3 = b.charCodeAt(offset + (d3 = x + 3)); + dd = (x += 4); + for (y = 0; y < len; y += 2) { + dy = vector[y]; + ay = vector[y + 1]; + d0 = _min(dy, d0, d1, bx0, ay); + d1 = _min(d0, d1, d2, bx1, ay); + d2 = _min(d1, d2, d3, bx2, ay); + dd = _min(d2, d3, dd, bx3, ay); + vector[y] = dd; + d3 = d2; + d2 = d1; + d1 = d0; + d0 = dy; + } + } + + for (; x < lb;) { + bx0 = b.charCodeAt(offset + (d0 = x)); + dd = ++x; + for (y = 0; y < len; y += 2) { + dy = vector[y]; + vector[y] = dd = _min(dy, d0, dd, bx0, vector[y + 1]); + d0 = dy; + } + } + + return dd; + }; +})(); + diff --git a/test_data/tests/arangosearch/3rdparty/js-levenshtein/package.json b/test_data/tests/arangosearch/3rdparty/js-levenshtein/package.json new file mode 100644 index 000000000..e2f2f02c1 --- /dev/null +++ b/test_data/tests/arangosearch/3rdparty/js-levenshtein/package.json @@ -0,0 +1,82 @@ +{ + "_from": "js-levenshtein", + "_id": "js-levenshtein@1.1.6", + "_inBundle": false, + "_integrity": "sha512-X2BB11YZtrRqY4EnQcLX5Rh373zbK4alC1FW7D7MBhL2gtcC17cTnr6DmfHZeS0s2rTHjUTMMHfG7gO8SSdw+g==", + "_location": "/js-levenshtein", + "_phantomChildren": {}, + "_requested": { + "type": "tag", + "registry": true, + "raw": "js-levenshtein", + "name": "js-levenshtein", + "escapedName": "js-levenshtein", + "rawSpec": "", + "saveSpec": null, + "fetchSpec": "latest" + }, + "_requiredBy": [ + "#USER", + "/" + ], + "_resolved": "https://registry.npmjs.org/js-levenshtein/-/js-levenshtein-1.1.6.tgz", + "_shasum": "c6cee58eb3550372df8deb85fad5ce66ce01d59d", + "_spec": "js-levenshtein", + "_where": "C:\\Working\\release-qa\\arangosearch", + "author": { + "name": "Gustaf Andersson", + "email": "gustaf@me.com" + }, + "bugs": { + "url": "https://github.com/gustf/js-levenshtein/issues" + }, + "bundleDependencies": false, + "deprecated": false, + "description": "The most efficient JS implementation calculating the Levenshtein distance, i.e. 
the difference between two strings.", + "devDependencies": { + "ava": "^0.25.0", + "fast-levenshtein": "^2.0.6", + "leven": "^2.1.0", + "levenshtein-edit-distance": "^2.0.3", + "matcha": "^0.7.0", + "talisman": "^0.21.0", + "xo": "^0.23.0" + }, + "engines": { + "node": ">=0.10.0" + }, + "files": [ + "index.js" + ], + "homepage": "https://github.com/gustf/js-levenshtein#readme", + "keywords": [ + "levenshtein", + "distance", + "algorithm", + "algo", + "string", + "difference", + "diff", + "fast", + "fuzzy", + "similar", + "similarity", + "compare", + "comparison", + "edit", + "text", + "match", + "matching" + ], + "license": "MIT", + "name": "js-levenshtein", + "repository": { + "type": "git", + "url": "git+https://github.com/gustf/js-levenshtein.git" + }, + "scripts": { + "bench": "matcha bench.js", + "test": "ava" + }, + "version": "1.1.6" +} diff --git a/test_data/tests/arangosearch/WikiLoader.py b/test_data/tests/arangosearch/WikiLoader.py new file mode 100644 index 000000000..9f88ac2b7 --- /dev/null +++ b/test_data/tests/arangosearch/WikiLoader.py @@ -0,0 +1,107 @@ +# python script for loading IResearch benchmark dump of Wikipedia into +# ArangoDB database. Uses python-arango driver https://github.com/Joowani/python-arango +# Data is loaded in form { title: 'XXXXX', body: 'XXXXXXXXXXXXX', 'count': XXXX, 'created':XXXX}. +# DB server should be set up to run without authorization + +################################################################################ +## DISCLAIMER +## +## Copyright 2020 ArangoDB GmbH, Cologne, Germany +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## +## Copyright holder is ArangoDB GmbH, Cologne, Germany +## +## @author Andrei Lobov +################################################################################ + +import sys +import csv +import ctypes +from arango import ArangoClient + +monthDecode = { + "JAN":"01", "FEB":"02", "MAR":"03", "APR":"04", + "MAY":"05", "JUN":"06", "JUL":"07", "AUG":"08", + "SEP":"09", "OCT":"10", "NOV":"11", "DEC":"12" +} + +def decodeDate(d): + if len(d) == 24: + month = d[3:6] + day = d[0:2] + year = d[7:11] + time = d[12:24] + year += "-"; + year += monthDecode.get(month, "01") + year += "-" + year += day + year += "T" + year += time + return year + return d + +def main(): + if len(sys.argv) < 6: + print("Usage: host database collection data_file count [offset] Example: python WikiLoader.py 'http://localhost:8529' _system wikipedia benchmark.data 10000000") + return + + + # Override csv default 128k field size + csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2)) + + # Initialize the client for ArangoDB. 
+ client = ArangoClient(hosts=sys.argv[1]) + + # Upload settings + filename = sys.argv[4] # data file + collection = sys.argv[3] # target collection + database = sys.argv[2] # target database + line_limit = int(sys.argv[5]) # how many documents to upload + batch_size = 10000 # batch size for inserting into Arango + + offset = 0 + if len(sys.argv) > 6: + offset = int(sys.argv[6]) + + db = client.db(database) + if db.has_collection(collection): + wikipedia = db.collection(collection) + else: + wikipedia = db.create_collection(collection) + f = open(filename, mode ='r', encoding='utf-8', errors='replace') + reader = csv.reader(f, delimiter='\t') + data = [] + total = 0 + count = offset + for row in reader: + if offset > 0: + offset = offset - 1 + continue + data.append({'title': row[0], 'body': row[2], 'count': count, 'created':decodeDate(row[1])}) + if len(data) > batch_size: + wikipedia.insert_many(data) + data.clear() + print('Loaded ' + str(total) + ' ' + str( round((total/line_limit) * 100, 2)) + '% \n') + total = total + 1 + if total >= line_limit: + break + count = count + 1 + if len(data) > 0: + wikipedia.insert_many(data) + print('Loaded ' + str(total) + ' ' + str( round((total/line_limit) * 100, 2)) + '% \n') + f.close() + + +if __name__== "__main__": + main() diff --git a/test_data/tests/arangosearch/arangosearch-ngram_match-test-setup.js b/test_data/tests/arangosearch/arangosearch-ngram_match-test-setup.js new file mode 100644 index 000000000..aec82e33b --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-ngram_match-test-setup.js @@ -0,0 +1,63 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +//////////////////////////////////////////////////////////////////////////////// + +const db = require("@arangodb").db; +const analyzers = require("@arangodb/analyzers"); + +let wiki = db._collection("wikipedia"); +if (wiki == null || wiki == undefined) { + db._create("wikipedia"); +} + +analyzers.save( + "tokenizer", "text", + { locale: "en", stemming:false, accent:true, case:"none" }); +analyzers.save( + "unigram", "ngram", + { min: 1, max: 1, preserveOriginal:false, streamType:"utf8" }, + ["frequency", "position", "norm"]); +analyzers.save( + "bigram", "ngram", + { min: 2, max: 2, preserveOriginal:false, streamType:"utf8" }, + ["frequency", "position", "norm"]); +analyzers.save( + "bigramWithoutPosition", "ngram", + { min: 1, max: 1, preserveOriginal:false, streamType:"utf8" }, + ["frequency", "norm"]); +analyzers.save( + "trigram", "ngram", + { min: 3, max: 3, preserveOriginal:false, streamType:"utf8" }, + ["frequency", "position", "norm"]); + +db._createView( + "v_wiki_ngram", "arangosearch", + { + links : { + wikipedia : { + includeAllFields: true, + fields: { + title: { analyzers: ["unigram", "bigram", "trigram", "identity"] }, + body: { analyzers: ["unigram", "bigram", "trigram", "tokenizer" ] } + } + } } }); diff --git a/test_data/tests/arangosearch/arangosearch-ngram_match-test.js b/test_data/tests/arangosearch/arangosearch-ngram_match-test.js new file mode 100644 index 000000000..994651551 --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-ngram_match-test.js @@ -0,0 +1,170 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +//////////////////////////////////////////////////////////////////////////////// + +const jsunity = require("jsunity"); +const errors = require("internal").errors; +const _ = require('lodash'); +const { + assertEqual, assertTrue, + assertFalse, assertNotEqual, + assertException, assertNotNull } = jsunity.jsUnity.assertions; +const db = require("@arangodb").db; +const analyzers = require("@arangodb/analyzers"); + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test suite +//////////////////////////////////////////////////////////////////////////////// + +function ArangoSearch_NGRAM_MATCH() { + return { + testNonStringInput: function () { + try { + db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, null, 1, "bigram") + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + }, + + testInvalidAnalyzer: function () { + try { + db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, "Lord Rings", 1, null) + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + + try { + db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, "Lord Rings", 1, "!!invalidAnalyzer@@") + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + }, + + testNonStringField: function () { + let actual = db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.count, "Lord Rings", 1, "bigram") + SORT d._id + RETURN d`).toArray(); + assertEqual(actual.length, 0); + }, + + testStringFieldWithoutPositions: function () { + let actual = db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, "Lord Rings", 1, "bigramWithoutPosition") + SORT d._id + RETURN d`).toArray(); + assertEqual(actual.length, 0); + }, + + // 0 threshold == match all + testZeroThreshold: function () { + let expected = db._query(` + FOR d IN wikipedia + SORT d._key + RETURN d`).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, "Lord Rings", 0, "bigram") + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + }, + + testEquality: function () { + let expected = db._query(` + FOR d IN wikipedia + FILTER [ "Lord", "Rings" ] ALL IN TOKENS(d.title, "tokenizer") + SORT d._id + RETURN d`).toArray(); + assertNotEqual(expected.length, 0); + + { + let actual = db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, "Lord Rings", 1, "bigram") + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as bind variables + { + let actual = db._query(` + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, @input, @threshold, @analyzer) + SORT d._id + RETURN d`, { input : "Lord Rings", analyzer: "bigram", threshold:1}).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as references + { + let actual = db._query(` + LET input = NOOPT("Lord Rings") + LET analyzer = NOOPT("bigram") + LET threshold = NOOPT(1) + FOR d IN v_wiki_ngram + SEARCH NGRAM_MATCH(d.title, input, threshold, analyzer) + SORT d._id + 
RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + } + }; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +jsunity.run(ArangoSearch_NGRAM_MATCH); +if (false === jsunity.done().status) { + throw "fail"; +} diff --git a/test_data/tests/arangosearch/arangosearch-phrase-test-setup.js b/test_data/tests/arangosearch/arangosearch-phrase-test-setup.js new file mode 100644 index 000000000..251ac6408 --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-phrase-test-setup.js @@ -0,0 +1,48 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +//////////////////////////////////////////////////////////////////////////////// + +const db = require("@arangodb").db; +const analyzers = require("@arangodb/analyzers"); + +analyzers.save( + "tokenizer", "text", + { locale: "en", stemming:false, accent:false, case:"lower" }, + ["frequency", "position", "norm"]); + +analyzers.save( + "tokenizerWithoutPosition", "text", + { locale: "en", stemming:false, accent:false, case:"lower" }, + ["frequency", "norm"]); + +db._createView( + "v_wiki_phrase", "arangosearch", + { + links : { + wikipedia : { + includeAllFields: true, + fields: { + title: { analyzers: ["tokenizer", "tokenizerWithoutPosition"] }, + body: { analyzers: ["tokenizer" ] } + } + } } }); diff --git a/test_data/tests/arangosearch/arangosearch-phrase-test.js b/test_data/tests/arangosearch/arangosearch-phrase-test.js new file mode 100644 index 000000000..3fedb5a5a --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-phrase-test.js @@ -0,0 +1,373 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ +/*global assertUndefined, assertEqual, assertNotEqual, assertTrue, assertFalse, fail*/ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +//////////////////////////////////////////////////////////////////////////////// + +const jsunity = require("jsunity"); +const errors = require("internal").errors; +const _ = require('lodash'); +const { + assertEqual, assertNotEqual, + assertTrue, assertFalse, + assertNull, assertNotNull, + assertIdentical, assertNotIdentical, + assertMatch, assertNotMatch, + assertTypeOf, assertNotTypeOf, + assertInstanceOf, assertNotInstanceOf, + assertUndefined, assertNotUndefined, + assertNan, assertNotNan, + fail } = jsunity.jsUnity.assertions; +const db = require("@arangodb").db; +const analyzers = require("@arangodb/analyzers"); + +function ArangoSearch_PHRASE() { + return { + testNonStringInput: function () { + try { + db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, null, "text_en") + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + }, + + testInvalidOffset: function () { + try { + db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord Of", "The Rings", "tokenizer") + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + + try { + db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord Of", null, "The Rings", "tokenizer") + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + }, + + testInvalidAnalyzer: function () { + try { + db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord Of The Rings", null) + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + + try { + db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord Of The Rings", "!!invalidAnalyzer@@") + SORT d._id + RETURN d`); + fail(); + } catch (e) { + assertEqual(errors.ERROR_BAD_PARAMETER.code, e.errorNum); + } + }, + + testNonStringField: function () { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.count, "Lord Of The Rings", "text_en") + SORT d._id + RETURN d`).toArray(); + assertEqual(actual.length, 0); + }, + + testStringFieldWithoutPositions: function () { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord Of The Rings", "tokenizerWithoutPosition") + SORT d._id + RETURN d`).toArray(); + assertEqual(actual.length, 0); + }, + + testExactPhrase: function () { + let expected = db._query(` + LET input = "Lord of The Rings" + LET phrase = TOKENS(input, "tokenizer") + FOR d IN wikipedia + FILTER LENGTH(d.title) >= LENGTH(input) + LET tokens = TOKENS(d.title, "tokenizer") + FILTER phrase ALL IN tokens + FILTER LENGTH( + FOR leadPos IN 0..LENGTH(tokens)-1 + FILTER tokens[leadPos] == phrase[0] + FILTER LENGTH(FOR i IN 0..LENGTH(phrase)-1 + FILTER phrase[i] == tokens[leadPos+i] + RETURN 1) == LENGTH(phrase) + RETURN 1) > 0 + SORT d._id + RETURN d`).toArray(); + assertNotEqual(expected.length, 0); + + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord of The Rings", "tokenizer") + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // variadic arguments + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord", 0, "of", 0, "The", 0, "Rings", "tokenizer") + SORT d._id + RETURN d`).toArray(); + + 
assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // variadic arguments + object notation + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, { TERM : "lord" }, 0, "of", 0, "The", 0, "Rings", "tokenizer") + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // variadic arguments + object notation + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, { TERM : ["lord"] }, 0, "of", 0, "The", 0, "Rings", "tokenizer") + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // variadic arguments + object notation, analyzer is not applied to object notation + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, { TERM : "Lord" }, 0, "of", 0, "The", 0, "Rings", "tokenizer") + SORT d._id + RETURN d`).toArray(); + + assertEqual(0, actual.length); + } + + // variadic arguments + object notation, analyzer is not applied to object notation + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, { TERM : ["Lord"] }, 0, "of", 0, "The", 0, "Rings", "tokenizer") + SORT d._id + RETURN d`).toArray(); + + assertEqual(0, actual.length); + } + + // arguments as bind variables + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, @input, @analyzer) + SORT d._id + RETURN d`, { input : "Lord of The Rings", analyzer: "tokenizer"}).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as references + { + let actual = db._query(` + LET input = NOOPT("Lord of The Rings") + LET analyzer = NOOPT("tokenizer") + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, input, analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as an array + { + let actual = db._query(` + LET input = "Lord of The Rings" + LET analyzer = "tokenizer" + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, [input], analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as an array + { + let actual = db._query(` + LET analyzer = "tokenizer" + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, ["Lord", 0, "of", "The", "Rings"], analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as an array, object notation + { + let actual = db._query(` + LET analyzer = "tokenizer" + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, [{TERM:"lord"}, 0, "of", "The", "Rings"], analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as an array, object notation + { + let actual = db._query(` + LET analyzer = "tokenizer" + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, [{TERM:["lord"]}, 0, "of", "The", "Rings"], analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => 
assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as an array, object notation, analyzer isn't applied + { + let actual = db._query(` + LET analyzer = "tokenizer" + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, [{TERM:["Lord"]}, 0, "of", "The", "Rings"], analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(0, actual.length); + } + + // arguments as an array + { + let actual = db._query(` + LET input = NOOPT("Lord of The Rings") + LET analyzer = "tokenizer" + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, [input], analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + + // arguments as a reference + { + let actual = db._query(` + LET analyzer = "tokenizer" + LET phraseStruct = NOOPT(["Lord", 0, "of", "The", "Rings"]) + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, phraseStruct, analyzer) + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + }, + + testProximityPhrase: function () { + let expected = db._query(` + LET input = "Lord Rings" + LET phrase = TOKENS(input, "tokenizer") + LET offsets = [ 0, 2 ] + FOR d IN wikipedia + FILTER LENGTH(d.title) >= LENGTH(input) + LET tokens = TOKENS(d.title, "tokenizer") + FILTER phrase ALL IN tokens + FILTER LENGTH( + FOR leadPos IN 0..LENGTH(tokens)-1 + FILTER tokens[leadPos] == phrase[0] + FILTER LENGTH(FOR i IN 0..LENGTH(phrase)-1 + FILTER phrase[i] == tokens[leadPos+offsets[i]] + RETURN 1) == LENGTH(phrase) + RETURN 1) > 0 + SORT d._id + RETURN d`).toArray(); + assertNotEqual(expected.length, 0); + + { + let actual = db._query(` + FOR d IN v_wiki_phrase + SEARCH PHRASE(d.title, "Lord", 2, "Rings", "tokenizer") + SORT d._id + RETURN d`).toArray(); + + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i], rhs))); + } + } + }; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +jsunity.run(ArangoSearch_PHRASE); +if (false === jsunity.done().status) { + throw "fail"; +} diff --git a/test_data/tests/arangosearch/arangosearch-stemming-languages-test.js b/test_data/tests/arangosearch/arangosearch-stemming-languages-test.js new file mode 100644 index 000000000..bf36170c6 --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-stemming-languages-test.js @@ -0,0 +1,128 @@ +/* jshint globalstrict:false, strict:false, maxlen: 500 */ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrei Lobov +//////////////////////////////////////////////////////////////////////////////// + +const jsunity = require("jsunity"); +const analyzers = require("@arangodb/analyzers"); +const {assertTrue} = jsunity.jsUnity.assertions; +const db = require("@arangodb").db; +const path = require('path'); +const _ = require('lodash'); + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test suite +//////////////////////////////////////////////////////////////////////////////// + +function arangoSearchStemmingLanguages () { + function doTest (localeName, input, expected, expectedEdgeNgram) { + const analyzerName = "stemminAnalyzerTest"; + const analyzerEdgeName = "stemminEdgeAnalyzerTest"; + try { analyzers.remove(analyzerName, true); } catch {} + try { analyzers.remove(analyzerEdgeName, true); } catch {} + try { + analyzers.save(analyzerName, "text", { locale: localeName, + stemming: true, + accent: false, + stopwords: [], + case: "none" }); + analyzers.save(analyzerEdgeName, "text", { locale: localeName, + stemming: false, + stopwords: [], + accent: false, + case: "lower", + edgeNgram: { min: 2, + max: 3, + preserveOriginal: true }}); + let res = db._query("RETURN TOKENS('" + input + "', '" + analyzerName + "')").toArray(); + let resEdgeNgram = db._query("RETURN TOKENS('" + input + + "', '" + analyzerEdgeName + "')").toArray(); + assertTrue(_.isEqual(res[0], expected)); + assertTrue(_.isEqual(resEdgeNgram[0], expectedEdgeNgram)); + } finally { + try { analyzers.remove(analyzerName, true); } catch {} + try { analyzers.remove(analyzerEdgeName, true); } catch {} + } + } + return { + testArabic: function () { + doTest("ar_Arab_EG.UTF-8", "الرياضيين", ["رياض"], ["ال", "الر", "الرياضيين"]); + }, + testBasque: function () { + doTest("eu_ES.UTF-8", "Kirolariak", ["Kiro"], ["ki", "kir", "kirolariak"]); + }, + testCatalan: function () { + doTest("ca_ES.UTF-8", "Esportistes", ["Esport"], ["es", "esp", "esportistes"]); + }, + testDanish: function () { + doTest("da_DK.UTF-8", "Atleter", ["Atlet"], ["at", "atl", "atleter"]); + }, + testGreek: function () { + doTest("el_GR.UTF-8", "Αθλητές", [ "Aθλητ" ], [ "αθ", "αθλ", "αθλητες" ]); + // FIXME: looks like a bug in ICU ^ A should not be converted here + }, + testHindi: function () { + doTest("hi_IN.UTF-8", "\u090F\u0925\u0932\u0940\u091F", [ "\u090F\u0925\u0932\u0940\u091F" ], + [ "\u090F\u0925", "\u090F\u0925\u0932", "\u090F\u0925\u0932\u0940\u091F" ]); + }, + testHungarian: function () { + doTest("hu_HU.UTF-8", "Sportoló", ["Sportol"], ["sp", "spo", "sportolo"]); + }, + testIndonesian: function () { + doTest("id_ID.UTF-8", "Atlet", ["Atlet"], ["at", "atl", "atlet"]); + }, + testIrish: function () { + doTest("ga_IE.UTF-8", "Lúthchleasaithe", ["Luthchleasaithe"], ["lu", "lut", "luthchleasaithe"]); + }, + testLithuanian: function () { + doTest("lt_LT.UTF-8", "Sportininkai", ["Sportinink"], ["sp", "spo", "sportininkai"]); + }, + testNepali: function () { + doTest("ne_NP.UTF-8", "खेलाडीहरू", ["\u0916\u0932\u093e\u0921\u0940\u0939\u0930"], + ["\u0916\u0932", "\u0916\u0932\u093e", "\u0916\u0932\u093e\u0921\u0940\u0939\u0930"]); + }, + testRomanian: function () { + doTest("ro_RO.UTF-8", "Sportivii", ["Sportiv"], ["sp", "spo", "sportivii"]); + }, + testSerbian: function () { + doTest("sr_RS.UTF-8", "Спортисти", ["Сportist"], ["сп", "спо", "спортисти"]); + }, + testTamil: function () { + doTest("ta_IN.UTF-8", "மரங்கள்", + 
["\u0bae\u0bb0\u0b99\u0b95\u0bb3"], + ["\u0bae\u0bb0", "\u0bae\u0bb0\u0b99", + "\u0bae\u0bb0\u0b99\u0b95\u0bb3"]); + }, + testTurkish: function () { + doTest("tr_TR.UTF-8", "Ağaçlar", ["Agac"], ["ag", "aga", "agaclar"]); + } + }; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +jsunity.run(arangoSearchStemmingLanguages); +if (false === jsunity.done().status) { + throw "fail"; +} diff --git a/test_data/tests/arangosearch/arangosearch-stored-values-compression-test.js b/test_data/tests/arangosearch/arangosearch-stored-values-compression-test.js new file mode 100644 index 000000000..34bc195fd --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-stored-values-compression-test.js @@ -0,0 +1,137 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ +/*global assertUndefined, assertEqual, assertNotEqual, assertTrue, assertFalse, fail*/ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +//////////////////////////////////////////////////////////////////////////////// + +const jsunity = require("jsunity"); +const { + assertEqual, assertNotEqual, + assertTrue, assertFalse, + assertNull, assertNotNull, + assertIdentical, assertNotIdentical, + assertMatch, assertNotMatch, + assertTypeOf, assertNotTypeOf, + assertInstanceOf, assertNotInstanceOf, + assertUndefined, assertNotUndefined, + assertNan, assertNotNan, + fail } = jsunity.jsUnity.assertions; +const db = require("@arangodb").db; + +function getIndexSize(collectionName, viewName) { + let collection = db._collection(collectionName); + assertNotUndefined(collection); + let view = db._view(viewName); + assertNotUndefined(view); + + return collection.getIndexes(true, true) + .filter(v => v.type === "arangosearch") + .filter(v => view._id === v.view.split("/")[1])[0] + .figures.indexSize; +} + +function ArangoSearch_StoredValuesCompression() { + return { + setUpAll: function () { + db._dropView("v_wiki_stored_compressed"); + db._dropView("v_wiki_stored_raw"); + db._dropView("v_wiki_sorted_compressed"); + db._dropView("v_wiki_sorted_raw"); + }, + + tearDownAll: function() { + db._dropView("v_wiki_stored_compressed"); + db._dropView("v_wiki_stored_raw"); + db._dropView("v_wiki_sorted_compressed"); + db._dropView("v_wiki_sorted_raw"); + }, + + testCheckStoredValuesCompression: function() { + db._createView( + "v_wiki_stored_compressed", + "arangosearch", + { + storedValues: [ { fields: [ "body", "title" ], compression:"lz4" } ], + links : { wikipedia : { includeAllFields:true } }, + cleanupIntervalStep:0, + consolidationIntervalMsec:0 + }); + let compressedSize = getIndexSize("wikipedia", "v_wiki_stored_compressed"); + 
db._dropView("v_wiki_stored_compressed"); + + db._createView( + "v_wiki_stored_raw", + "arangosearch", + { + storedValues: [ { fields: [ "body", "title" ], compression:"none" } ], + links : { wikipedia : { includeAllFields:true } }, + cleanupIntervalStep:0, + consolidationIntervalMsec:0 + }); + let rawSize = getIndexSize("wikipedia", "v_wiki_stored_raw"); + db._dropView("v_wiki_stored_raw"); + + print("Compressed=" + compressedSize + " Raw=" + rawSize + " Ratio=" + compressedSize / rawSize); + assertTrue(compressedSize < rawSize); + }, + + testCheckPrimarySortCompression: function() { + db._createView( + "v_wiki_sorted_compressed", + "arangosearch", + { + primarySort: [ { field:"body", asc:true }, { field:"title", asc:true } ], + primarySortCompression: "lz4", + links : { wikipedia : { includeAllFields:true } }, + cleanupIntervalStep:0, + consolidationIntervalMsec:0 + }); + let compressedSize = getIndexSize("wikipedia", "v_wiki_sorted_compressed"); + db._dropView("v_wiki_sorted_compressed"); + + db._createView( + "v_wiki_sorted_raw", + "arangosearch", + { + primarySort: [ { field:"body", asc:true }, { field:"title", asc:true } ], + primarySortCompression: "none", + links : { wikipedia : { includeAllFields:true } }, + cleanupIntervalStep:0, + consolidationIntervalMsec:0 + }); + let rawSize = getIndexSize("wikipedia", "v_wiki_sorted_raw"); + db._dropView("v_wiki_sorted_raw"); + + print("Compressed=" + compressedSize + " Raw=" + rawSize + " Ratio=" + compressedSize / rawSize); + assertTrue(compressedSize < rawSize); + }, + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +jsunity.run(ArangoSearch_StoredValuesCompression); +if (false === jsunity.done().status) { + throw "fail"; +} diff --git a/test_data/tests/arangosearch/arangosearch-stored-values-test-setup.js b/test_data/tests/arangosearch/arangosearch-stored-values-test-setup.js new file mode 100644 index 000000000..c18d6dbe2 --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-stored-values-test-setup.js @@ -0,0 +1,58 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +//////////////////////////////////////////////////////////////////////////////// + +const db = require("@arangodb").db; +const analyzers = require("@arangodb/analyzers"); + +let wiki = db._collection("wikipedia"); +if (wiki === null || wiki === undefined) { + db._create("wikipedia"); +} + +let links = db._collection("links"); +if (links === null || links === undefined) { + db._createEdgeCollection("links"); +} + +db._createView( + "v_wiki_stored", "arangosearch", { + storedValues: [ + ["created"], + ["title", "created", "count", "_id"], + ["invalidField"] + ], + links : { wikipedia : { includeAllFields: true } } }); + +db._createView( + "v_wiki_sorted", "arangosearch", { + primarySort: [ + { field: "_key", asc:true }, + { field: "body", asc:true }, + { field: "created", asc:true }, + { field: "title", asc:true }, + { field: "count", asc:true }, + { field: "_id", asc:true }, + { field: "_rev", asc:true }, + ], + links : { wikipedia : { includeAllFields: true } } }); diff --git a/test_data/tests/arangosearch/arangosearch-stored-values-test.js b/test_data/tests/arangosearch/arangosearch-stored-values-test.js new file mode 100644 index 000000000..b4d921658 --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-stored-values-test.js @@ -0,0 +1,843 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ +/*global assertUndefined, assertEqual, assertNotEqual, assertTrue, assertFalse, fail*/ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +//////////////////////////////////////////////////////////////////////////////// + +const jsunity = require("jsunity"); +const _ = require('lodash'); +const { + assertEqual, assertNotEqual, + assertTrue, assertFalse, + assertNull, assertNotNull, + assertIdentical, assertNotIdentical, + assertMatch, assertNotMatch, + assertTypeOf, assertNotTypeOf, + assertInstanceOf, assertNotInstanceOf, + assertUndefined, assertNotUndefined, + assertNan, assertNotNan, + fail } = jsunity.jsUnity.assertions; +const db = require("@arangodb").db; +const analyzers = require("@arangodb/analyzers"); + +const noOptimization = {optimizer: {rules:["-all"]}}; +const noSubquerySplicing= {optimizer: {rules:["-splice-subqueries"]}}; +const noLateMaterialization = {optimizer: {rules:["-late-document-materialization-arangosearch"]}}; +const materialize = {noMaterialization:false}; +const doNotMaterialize = {noMaterialization:true}; + +function getNodes(query, type, bindVars, options) { + let stmt = db._createStatement(query); + if (typeof bindVars === "object") { + stmt.bind(bindVars) + } + if (typeof options === "object") { + stmt.setOptions(options) + } + return stmt.explain() + .plan + .nodes + .filter(node => node.type === type); +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test suite +/// @note the suite expects the following database objects to be available +/// - document collection 'wikipedia' +/// - edge collection 'links' +/// - arangosearch view 'v_wiki_stored' +/// { +/// storedValues: [ +/// ["created"], +/// ["title", "created", "count", "_id"], +/// ["invalidField"] +/// ], +/// links : { wikipedia : { includeAllFields: true, } } +/// } +/// - arangosearch view 'v_wiki_sorted' +/// { +/// primarySort: [ +/// { field: "_key", asc:true }, +/// { field: "body", asc:true }, +/// { field: "created", asc:true }, +/// { field: "title", asc:true }, +/// { field: "count", asc:true }, +/// { field: "_id", asc:true }, +/// { field: "_rev", asc:true }, +/// ], +/// links : { wikipedia : { includeAllFields: true } } +/// } +//////////////////////////////////////////////////////////////////////////////// + +function ArangoSearch_StoredValues() { + return { + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure subquery splicing is disabled + /// 3. ensure arangosearch reads all values from ["title", "created", "count", "_id"] + /// and primarySort columns + /// 4. ensure results are same as for collection with filter + /// 5. 
ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForSubQueryWithoutSplicing: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 9999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + SORT d._id + RETURN { title: d.title, counts: (FOR j IN d.count..d.count+10 RETURN j) }`; + + assertEqual(0, getNodes(query, "SubqueryStartNode", doNotMaterialize, noSubquerySplicing).length); + assertEqual(0, getNodes(query, "SubqueryStartNode", materialize, noSubquerySplicing).length); + + { + let viewNodes = getNodes(query, "EnumerateViewNode", doNotMaterialize, noSubquerySplicing) + assertEqual(1, viewNodes.length); + + let viewWithStoredValues = viewNodes.find(v => v.view === "v_wiki_stored"); + assertNotUndefined(viewWithStoredValues); + assertTrue(viewWithStoredValues.noMaterialization); + assertEqual(1, viewWithStoredValues.viewValuesVars.length); + assertEqual(["_id", "count", "title"], + viewWithStoredValues.viewValuesVars[0].viewStoredValuesVars.map(v => v.field).sort()); + } + + { + let viewNodes = getNodes(query, "EnumerateViewNode", materialize, noSubquerySplicing); + assertEqual(1, viewNodes.length); + assertFalse(viewNodes[0].noMaterialization); + assertEqual(0, viewNodes[0].viewValuesVars.length); + } + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 9999, true, true) + SORT d._id + RETURN { title: d.title, counts: (FOR j IN d.count..d.count+10 RETURN j) }`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize, noSubquerySplicing).toArray(); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize, noSubquerySplicing).toArray(); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ["title", "created", "count", "_id"] + /// and primarySort columns + /// 3. ensure results are same as for collection with filter + /// 4. 
ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForSubQuery: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 9999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + SORT d._id + RETURN { title: d.title, counts: (FOR j IN d.count..d.count+10 RETURN j) }`; + + { + let viewNodes = getNodes(query, "EnumerateViewNode", doNotMaterialize) + assertEqual(1, viewNodes.length); + + let viewWithStoredValues = viewNodes.find(v => v.view === "v_wiki_stored"); + assertNotUndefined(viewWithStoredValues); + assertTrue(viewWithStoredValues.noMaterialization); + assertEqual(1, viewWithStoredValues.viewValuesVars.length); + assertEqual(["_id", "count", "title"], + viewWithStoredValues.viewValuesVars[0].viewStoredValuesVars.map(v => v.field).sort()); + } + + { + let viewNodes = getNodes(query, "EnumerateViewNode", materialize); + assertEqual(1, viewNodes.length); + assertFalse(viewNodes[0].noMaterialization); + assertEqual(0, viewNodes[0].viewValuesVars.length); + } + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 9999, true, true) + SORT d._id + RETURN { title: d.title, counts: (FOR j IN d.count..d.count+10 RETURN j) }`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize).toArray(); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure view node doesn't materialize documents + /// 2. ensure arangosearch reads all values from ['title'] column + /// 3. ensure MATERIALIZE node is present + /// 4. ensure results are same as for collection with filter + /// 5. 
ensure results are same as for query with late materialization disabled + //////////////////////////////////////////////////////////////////////////////// + testLateMaterialization: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + SORT d.title, d._id + LIMIT 2013 + RETURN d`; + + let viewNode = getNodes(query, "EnumerateViewNode")[0]; + assertNotUndefined(viewNode); + assertUndefined(viewNode.noMaterialization); + assertEqual(1, viewNode.viewValuesVars.length); + let viewValues = viewNode.viewValuesVars[0]; + assertEqual(2, viewValues.viewStoredValuesVars.length); + assertEqual("title", viewValues.viewStoredValuesVars[0].field); + assertEqual("_id", viewValues.viewStoredValuesVars[1].field); + let materializationNode = getNodes(query, "MaterializeNode"); + assertNotUndefined(materializationNode); + + let viewNodeNoLateMaterialization = getNodes(query, "EnumerateViewNode", {}, noOptimization)[0]; + assertNotUndefined(viewNodeNoLateMaterialization); + assertUndefined(viewNodeNoLateMaterialization.noMaterialization); + assertEqual(0, viewNodeNoLateMaterialization.viewValuesVars.length); + assertUndefined(getNodes(query, "MaterializeNode", {}, noOptimization)[0]); + + let expected = db._query(` + FOR d IN wikipedia + FILTER IN_RANGE(d.count, 99, 99999, true, true) + SORT d.title, d._id + LIMIT 2013 + RETURN d`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query).toArray(); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, {}, noLateMaterialization).toArray(); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ['title'] column + /// 3. ensure results are same as for collection with filter + /// 4. 
ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForReturn: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + RETURN d.title`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(1, viewValues.viewStoredValuesVars.length); + assertEqual("title", viewValues.viewStoredValuesVars[0].field); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + RETURN d.title`, {}, noOptimization).toArray().sort(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray().sort(); + assertEqual(expected, actual); + assertEqual(db._query(query, materialize).toArray().sort(), actual); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure SORT node is present + /// 3. ensure arangosearch reads all values from prmarySort column + /// 4. ensure results are same as for collection with filter + /// 5. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationPrimarySort: function () { + let query = ` + FOR d IN v_wiki_sorted + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + SORT d.title, d._key + RETURN { body: d.body, title: d.title }`; + + assertEqual(1, getNodes(query, "SortNode", doNotMaterialize).length); + assertEqual(1, getNodes(query, "SortNode", materialize).length); + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(["_key", "body","title"], + noMaterializationNode.viewValuesVars.map(v => v.field).sort()); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + SORT d.title, d._key + RETURN { title: d.title, body: d.body }`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray().sort(); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize).toArray().sort(); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure SORT node isn't present + /// 3. 
ensure arangosearch reads all values from prmarySort column + /// 4. ensure results are same as for collection with filter + /// 5. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationOptimizedSortPrimarySort: function () { + let query = ` + FOR d IN v_wiki_sorted + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + SORT d._key + RETURN { body: d.body, title: d.title }`; + + assertEqual(0, getNodes(query, "SortNode", doNotMaterialize).length); + assertEqual(0, getNodes(query, "SortNode", materialize).length); + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(["body","title"], + noMaterializationNode.viewValuesVars.map(v => v.field).sort()); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + SORT d._key + RETURN { title: d.title, body: d.body }`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray().sort(); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize).toArray().sort(); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ['invalidField'] column + /// 3. ensure results are same as for collection with filter + /// 4. 
ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForReturnNonExistentField: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + RETURN d.invalidField`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(1, viewValues.viewStoredValuesVars.length); + assertEqual("invalidField", viewValues.viewStoredValuesVars[0].field); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + RETURN d.invalidField`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + expected.forEach(v => assertNull(v)); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected, actual); + assertEqual(db._query(query, materialize).toArray(), actual); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ["title", "created", "count", "_id"] column + /// 3. ensure results are same as for collection with filter + /// 4. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForReturnMultipleValues: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + RETURN { title:d.title, id: d._id, created:d.created }`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(3, viewValues.viewStoredValuesVars.length); + assertEqual(["_id","created","title"], + viewValues.viewStoredValuesVars.map(v => v.field).sort()); + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let less = function(lhs, rhs) { + if (lhs.id < rhs.id) { + return -1; + } + if (lhs.id > rhs.id) { + return 1; + } + return 0; + }; + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + RETURN { title:d.title, id: d._id, created:d.created }`, {}, noOptimization).toArray().sort(less); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray().sort(less); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize).toArray().sort(less); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => 
assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ['title'] column + /// 3. ensure results are same as for collection with filter + /// 4. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForSort: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + SORT d.title + RETURN d.title`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(1, viewValues.viewStoredValuesVars.length); + assertEqual("title", viewValues.viewStoredValuesVars[0].field); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + SORT d.title + RETURN d.title`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected, actual); + assertEqual(db._query(query, materialize).toArray(), actual); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ['title'] column + /// 3. ensure results are same as for collection with filter + /// 4. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForSortMultipleValues: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + SORT d.title, d._id + RETURN d.title`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(2, viewValues.viewStoredValuesVars.length); + assertEqual(["_id","title"], + viewValues.viewStoredValuesVars.map(v => v.field).sort()); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + SORT d.title, d._id + RETURN d.title`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected, actual); + assertEqual(db._query(query, materialize).toArray(), actual); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ['title'] column + /// 3. 
ensure results are same as for collection with filter + /// 4. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForCollect: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + COLLECT title = d.title WITH COUNT INTO count + RETURN { title, count }`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(1, viewValues.viewStoredValuesVars.length); + assertEqual(["title"], + viewValues.viewStoredValuesVars.map(v => v.field).sort()); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + COLLECT title = d.title WITH COUNT INTO count + RETURN { title, count }`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected, actual); + assertEqual(db._query(query, materialize).toArray(), actual); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ["title", "created", "count", "_id"] column + /// 3. ensure results are same as for collection with filter + /// 4. 
ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForCollectAggregate: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + COLLECT title = d.title + AGGREGATE max = MAX(d.count) + INTO groups = { count: d.count } + RETURN { title, max, groups }`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(2, viewValues.viewStoredValuesVars.length); + assertEqual(["count", "title"], + viewValues.viewStoredValuesVars.map(v => v.field).sort()); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let less = function(lhs, rhs) { + if (lhs.count < rhs.count) { + return 1; + } + + if (lhs.count > rhs.count) { + return -1; + } + + return 0; + }; + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + COLLECT title = d.title + AGGREGATE max = MAX(d.count) + INTO groups = { count: d.count } + RETURN { title, max, groups }`, {}, noOptimization).toArray(); + expected.forEach(v => v.groups.sort(less)); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + actual.forEach(v => v.groups.sort(less)); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize).toArray(); + actualEarlyMaterialization.forEach(v => v.groups = v.groups.sort(less)); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ["title", "created", "count", "_id"] column + /// 3. ensure results are same as for collection with filter + /// 4. 
ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForTraversal: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + FOR v,e,p IN 1..2 OUTBOUND d._id links + RETURN p`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(1, viewValues.viewStoredValuesVars.length); + assertEqual(["_id"], + viewValues.viewStoredValuesVars.map(v => v.field).sort()); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + FOR v,e,p IN 1..2 OUTBOUND d._id links + RETURN p`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected, actual); + assertEqual(db._query(query, materialize).toArray(), actual); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ["title", "created", "count", "_id"] + /// and primarySort columns + /// 3. ensure results are same as for collection with filter + /// 4. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForJoinWithView0: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 9999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + FOR j IN v_wiki_sorted + SEARCH IN_RANGE(j.count, 99, 9999, true, true) AND j._id == d._id + OPTIONS { noMaterialization: @noMaterialization } + SORT j._key + RETURN { body: j.body, key: j._key, title: d.title }`; + + { + let viewNodes = getNodes(query, "EnumerateViewNode", doNotMaterialize) + assertEqual(2, viewNodes.length); + + let viewWithStoredValues = viewNodes.find(v => v.view === "v_wiki_stored"); + assertNotUndefined(viewWithStoredValues); + assertTrue(viewWithStoredValues.noMaterialization); + assertEqual(1, viewWithStoredValues.viewValuesVars.length); + assertEqual(["_id", "title"], + viewWithStoredValues.viewValuesVars[0].viewStoredValuesVars.map(v => v.field).sort()); + + let viewWithSortedValues = viewNodes.find(v => v.view === "v_wiki_sorted"); + assertNotUndefined(viewWithSortedValues); + assertTrue(viewWithSortedValues.noMaterialization); + assertEqual(2, viewWithSortedValues.viewValuesVars.length); + assertEqual(["_key", "body"], + viewWithSortedValues.viewValuesVars.map(v => v.field).sort()); + } + + { + let viewNodes = getNodes(query, "EnumerateViewNode", materialize); + assertEqual(2, viewNodes.length); + assertFalse(viewNodes[0].noMaterialization); + assertEqual(0, viewNodes[0].viewValuesVars.length); + assertFalse(viewNodes[1].noMaterialization); + assertEqual(0, viewNodes[1].viewValuesVars.length); + } + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 9999, true, true) + SORT d._key + RETURN { body: 
d.body, title: d.title, key: d._key }`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize).toArray(); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ["title", "created", "count", "_id"] + /// and primarySort columns + /// 3. ensure results are same as for collection with filter + /// 4. ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForJoinWithView1: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 9999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + LET id = NOOPT(d._id) + FOR j IN v_wiki_sorted + SEARCH IN_RANGE(j.count, 99, 9999, true, true) AND j._id == id + OPTIONS { noMaterialization: @noMaterialization } + SORT j._key + RETURN { body: j.body, key: j._key, title: d.title }`; + + { + let viewNodes = getNodes(query, "EnumerateViewNode", doNotMaterialize) + assertEqual(2, viewNodes.length); + + let viewWithStoredValues = viewNodes.find(v => v.view === "v_wiki_stored"); + assertNotUndefined(viewWithStoredValues); + assertTrue(viewWithStoredValues.noMaterialization); + assertEqual(1, viewWithStoredValues.viewValuesVars.length); + assertEqual(["_id", "title"], + viewWithStoredValues.viewValuesVars[0].viewStoredValuesVars.map(v => v.field).sort()); + + let viewWithSortedValues = viewNodes.find(v => v.view === "v_wiki_sorted"); + assertNotUndefined(viewWithSortedValues); + assertTrue(viewWithSortedValues.noMaterialization); + assertEqual(2, viewWithSortedValues.viewValuesVars.length); + assertEqual(["_key", "body"], + viewWithSortedValues.viewValuesVars.map(v => v.field).sort()); + } + + { + let viewNodes = getNodes(query, "EnumerateViewNode", materialize); + assertEqual(2, viewNodes.length); + assertFalse(viewNodes[0].noMaterialization); + assertEqual(0, viewNodes[0].viewValuesVars.length); + assertFalse(viewNodes[1].noMaterialization); + assertEqual(0, viewNodes[1].viewValuesVars.length); + } + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 9999, true, true) + SORT d._key + RETURN { body: d.body, title: d.title, key: d._key }`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected.length, actual.length); + actual.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + + let actualEarlyMaterialization = db._query(query, materialize).toArray(); + assertEqual(expected.length, actualEarlyMaterialization.length); + actualEarlyMaterialization.forEach((rhs, i) => assertTrue(_.isEqual(expected[i],rhs))); + }, + + //////////////////////////////////////////////////////////////////////////////// + /// 1. ensure document isn't materialized + /// 2. ensure arangosearch reads all values from ["title", "created", "count", "_id"] column + /// 3. ensure results are same as for collection with filter + /// 4. 
ensure results are same as for view without stored values + //////////////////////////////////////////////////////////////////////////////// + testNoMaterializationForJoinWithIndex: function () { + let query = ` + FOR d IN v_wiki_stored + SEARCH IN_RANGE(d.count, 99, 99999, true, true) + OPTIONS { noMaterialization: @noMaterialization } + FOR j IN wikipedia FILTER j._id == d._id + RETURN j.title`; + + let noMaterializationNode = getNodes(query, "EnumerateViewNode", doNotMaterialize)[0]; + assertNotUndefined(noMaterializationNode); + assertTrue(noMaterializationNode.noMaterialization); + assertEqual(1, noMaterializationNode.viewValuesVars.length); + let viewValues = noMaterializationNode.viewValuesVars[0]; + assertEqual(2, viewValues.viewStoredValuesVars.length); + assertEqual(["_id"], + viewValues.viewStoredValuesVars.map(v => v.field).sort()); + + let node = getNodes(query, "EnumerateViewNode", materialize)[0]; + assertNotUndefined(node); + assertFalse(node.noMaterialization); + assertEqual(0, node.viewValuesVars.length); + + let expected = db._query(` + FOR d IN wikipedia FILTER IN_RANGE(d.count, 99, 99999, true, true) + RETURN d.title`, {}, noOptimization).toArray(); + assertNotEqual(expected.length, 0); + + let actual = db._query(query, doNotMaterialize).toArray(); + assertEqual(expected, actual); + assertEqual(db._query(query, materialize).toArray(), actual); + }, + }; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +jsunity.run(ArangoSearch_StoredValues); +if (false === jsunity.done().status) { + throw "fail"; +} diff --git a/test_data/tests/arangosearch/arangosearch-wildcard-levenshtein-starts-test.js b/test_data/tests/arangosearch/arangosearch-wildcard-levenshtein-starts-test.js new file mode 100644 index 000000000..9f92297a9 --- /dev/null +++ b/test_data/tests/arangosearch/arangosearch-wildcard-levenshtein-starts-test.js @@ -0,0 +1,452 @@ +/*jshint globalstrict:false, strict:false, maxlen: 500 */ + +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2020 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+///
+/// Copyright holder is ArangoDB GmbH, Cologne, Germany
+///
+/// @author Andrei Lobov
+////////////////////////////////////////////////////////////////////////////////
+
+const jsunity = require("jsunity");
+const {assertEqual, assertTrue} = jsunity.jsUnity.assertions;
+const db = require("@arangodb").db;
+const path = require('path');
+const levenshtein = require(path.join(__dirname, "/tests/arangosearch/3rdparty/js-levenshtein/index.js"));
+const damlev = require(path.join(__dirname, "/tests/arangosearch/3rdparty/damerau-levenshtein-js/app.js"));
+const {query} = require('@arangodb');
+const analyzers = require("@arangodb/analyzers");
+
+const maxLevenshteinDistance = 4;
+const maxDamerayLevenshteinDistance = 3;
+
+////////////////////////////////////////////////////////////////////////////////
+/// @brief test suite
+////////////////////////////////////////////////////////////////////////////////
+
+function arangoSearchMatchers () {
+  // cross-checks LEVENSHTEIN_MATCH against the 3rd-party reference
+  // implementations: `resFilterDameray` holds the FILTER-side results,
+  // `allCursor` iterates the whole collection with pre-tokenized values
+  function doLevenshteinTest (resFilterDameray, allCursor, pattern, field, analyzer) {
+    let levenshteinMatched = {};
+    let damerayLevenshteinMatched = {};
+    let i = 0;
+    // walk the whole collection and recalculate the distances with the
+    // 3rd-party algorithms
+    while (allCursor.hasNext()) {
+      let val = allCursor.next();
+      let minLevenshteinDistance = maxLevenshteinDistance + 1; // not matched by default
+      let minDamerayDistance = maxDamerayLevenshteinDistance + 1;
+      for (let t in val.tokens) {
+        let tokenDistance = levenshtein(pattern, val.tokens[t]);
+        if (tokenDistance <= maxLevenshteinDistance) {
+          if (minLevenshteinDistance > tokenDistance) {
+            minLevenshteinDistance = tokenDistance;
+          }
+          if (tokenDistance === 0) {
+            // best possible match found
+            minDamerayDistance = 0;
+            break;
+          }
+        }
+        if (minDamerayDistance > 0) {
+          let tokenDamerayDistance = damlev.distance(pattern, val.tokens[t]);
+          if (tokenDamerayDistance < minDamerayDistance) {
+            minDamerayDistance = tokenDamerayDistance;
+          }
+        }
+      }
+      if (minDamerayDistance <= maxDamerayLevenshteinDistance) {
+        // at the same time this validates the Damerau-Levenshtein implementation.
+        // FIXME: the docs say LEVENSHTEIN_DISTANCE calculates the plain Levenshtein
+        // distance, but it actually computes Damerau-Levenshtein, hence this check.
+        // The docs are to be fixed eventually.
+        assertTrue(i < resFilterDameray.length, val.count + " is missing from FILTER");
+        assertEqual(resFilterDameray[i].count, val.count, "FILTER failed");
+        assertEqual(resFilterDameray[i].dist, minDamerayDistance, "FILTER min dist mismatch");
+        damerayLevenshteinMatched[val.count] = minDamerayDistance;
+        i++;
+      }
+      if (minLevenshteinDistance <= maxLevenshteinDistance) {
+        levenshteinMatched[val.count] = minLevenshteinDistance;
+      }
+    }
+    assertEqual(i, resFilterDameray.length);
+    // now check the search results for all supported distances
+    for (let distance = 0; distance <= maxLevenshteinDistance; ++distance) {
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(LEVENSHTEIN_MATCH(d." +
+                          field + ", '" + pattern + "', " + distance + ", false, 0), '" +
+                          analyzer + "') " + " SORT d.count RETURN d").toArray();
+      let j = 0;
+      let count = 0;
+      for (let key in levenshteinMatched) {
+        if (levenshteinMatched[key] <= distance) {
+          count++;
+        }
+      }
+      assertEqual(res.length, count);
+      while (j < res.length) {
+        assertTrue(levenshteinMatched[res[j].count] <= distance);
+        ++j;
+      }
+    }
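+    // same cross-check with transpositions enabled (the 4th argument of
+    // LEVENSHTEIN_MATCH switches to Damerau-Levenshtein), up to the smaller
+    // maximum distance supported for that variant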
+    for (let distance = 0; distance <= maxDamerayLevenshteinDistance; ++distance) {
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(LEVENSHTEIN_MATCH(d." +
+                          field + ", '" + pattern + "', " + distance + ", true, 0), '" +
+                          analyzer + "') " + " SORT d.count RETURN d").toArray();
+      let count = 0;
+      for (let key in damerayLevenshteinMatched) {
+        if (damerayLevenshteinMatched[key] <= distance) {
+          count++;
+        }
+      }
+      let j = 0;
+      assertEqual(res.length, count);
+      while (j < res.length) {
+        assertTrue(damerayLevenshteinMatched[res[j].count] <= distance);
+        ++j;
+      }
+    }
+  }
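+  // For reference, the difference the two reference implementations measure
+  // (values from the js-levenshtein and damerau-levenshtein-js modules
+  // required above; the inputs are illustrative, not taken from the dataset):
+  //   levenshtein("abc", "acb")     === 2  // transposition = delete + insert
+  //   damlev.distance("abc", "acb") === 1  // transposition = one edit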
+  // checks STARTS_WITH(field, patterns, match): a document counts as matched
+  // once `match` prefix hits have been accumulated over its tokens
+  function doStartsWithTest (patterns, cursor, searchRes, match) {
+    let i = 0;
+    while (cursor.hasNext()) {
+      let found = 0;
+      let val = cursor.next();
+      for (let t in val.tokens) {
+        let strToken = String(val.tokens[t]);
+        for (let p in patterns) {
+          if (strToken.indexOf(patterns[p]) === 0) {
+            found++;
+            if (found >= match) {
+              break;
+            }
+          }
+        }
+        if (found >= match) {
+          assertTrue(i < searchRes.length, "Document:" + val.count + " is missing from search");
+          assertEqual(val.count, searchRes[i].count, "Document:" + val.count + " is missing from search");
+          i++;
+          break;
+        }
+      }
+    }
+    assertEqual(i, searchRes.length);
+  }
+  return {
+    setUpAll: function () {
+      try { db._dropView("view_content"); } catch {}
+      try { analyzers.remove("my_delimiter", true); } catch {}
+      analyzers.save("my_delimiter", "delimiter", {"delimiter": " "});
+      db._createView("view_content", "arangosearch",
+        { links: { wikipedia: {
+          includeAllFields: true,
+          fields: {
+            title: {analyzers: ['identity']},
+            body: {analyzers: ['identity', 'my_delimiter']}
+          }}}});
+      // view sync
+      db._query("FOR d IN view_content OPTIONS { waitForSync: true } LIMIT 1 RETURN d").toArray();
+    },
+    tearDownAll: function () {
+      try { db._dropView("view_content"); } catch {}
+      analyzers.remove("my_delimiter", true);
+    },
+    testTestPrefix: function () {
+      let pattern = "'Anarch%'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(LIKE(d.title, " +
+                          pattern + "), 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.title, " + pattern +
+                                ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestPostfix: function () {
+      let pattern = "'%arch'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(LIKE(d.title, " +
+                          pattern + "), 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.title, " +
+                                pattern + ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestMiddle: function () {
+      let pattern = "'%arc%'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(LIKE(d.title, " +
+                          pattern + "), 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.title, " +
+                                pattern + ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestMiddleWithVariables: function () {
+      let pattern = "'%arc%'";
+      let res = db._query("FOR p IN [" + pattern + "] FOR d IN view_content SEARCH ANALYZER(LIKE(d.title, p), " +
+                          "'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.title, " +
+                                pattern + ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestMiddleMixedPlaceholders: function () {
+      let pattern = "'%a%_r_c%'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(d.title LIKE " + pattern +
+                          ", 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.title, " + pattern +
+                                ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestMiddleWithWords: function () {
+      let pattern = "'%Lord%Rings%'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(d.title LIKE " + pattern +
+                          ", 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.title, " + pattern +
+                                ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestBody: function () {
+      let pattern = "'%Lord________Rings%'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(d.body LIKE " + pattern +
+                          ", 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.body, " + pattern +
+                                ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestBodyPrefix: function () {
+      let pattern = "'The_%a%'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(d.body LIKE " + pattern +
+                          ", 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.body, " + pattern +
+                                ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testTestBodySuffix: function () {
+      let pattern = "'_%!'";
+      let res = db._query("FOR d IN view_content SEARCH ANALYZER(d.body LIKE " + pattern +
+                          ", 'identity') SORT d.count RETURN d").toArray();
+      let resFilter = db._query("FOR d IN wikipedia FILTER LIKE(d.body, " + pattern +
+                                ") SORT d.count RETURN d").toArray();
+      assertEqual(resFilter.length, res.length);
+      for (let i = 0; i < resFilter.length; ++i) {
+        assertEqual(resFilter[i]._key, res[i]._key);
+      }
+    },
+    testLevenshteinMatch () {
+      const pattern = 'automaton';
+      let resFilterDameray = db._query(" FOR d IN wikipedia LET matched = " +
+                                       " MIN(FOR t IN TOKENS(d.body, 'my_delimiter') " +
+                                       " LET dist = LEVENSHTEIN_DISTANCE(t, '" + pattern + "') " +
+                                       " FILTER dist <= " + maxDamerayLevenshteinDistance +
+                                       " RETURN dist) FILTER NOT IS_NULL(matched) SORT d.count " +
+                                       " RETURN {count: d.count, dist: matched}").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN {count: d.count, tokens: TOKENS(d.body, 'my_delimiter')}`;
+      doLevenshteinTest(resFilterDameray, cursor, pattern, "body", "my_delimiter");
+    },
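+    // the two fallback tests below deliberately use maxLevenshteinDistance + 1,
+    // a distance above the indexed maximum, so the non-indexed FILTER
+    // implementation of LEVENSHTEIN_MATCH has to handle it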
+    testLevenshteinMatchFilterFallback () {
+      const pattern = 'automaton';
+      const distance = maxLevenshteinDistance + 1;
+      // check the fallback implementation of LEVENSHTEIN_MATCH for a long distance
+      let resFilter = db._query("FOR d IN wikipedia FILTER LEVENSHTEIN_MATCH(d.title, '" +
+                                pattern + "', " + distance + ") SORT d.count RETURN d").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN d`;
+      let i = 0;
+      while (cursor.hasNext()) {
+        let val = cursor.next();
+        let tokenDistance = levenshtein(pattern, val.title);
+        if (tokenDistance <= distance) {
+          assertTrue(i < resFilter.length);
+          assertEqual(val.count, resFilter[i].count);
+          ++i;
+        }
+      }
+      assertEqual(i, resFilter.length);
+    },
+    testLevenshteinMatchFilterFallbackWithVariables () {
+      const pattern = 'automaton';
+      const distance = maxLevenshteinDistance + 1;
+      // check the fallback implementation of LEVENSHTEIN_MATCH for a long distance
+      let resFilter = db._query("LET p = '" + pattern + "' FOR dist IN " + distance + ".." + distance +
+                                " FOR d IN wikipedia FILTER LEVENSHTEIN_MATCH(d.title, NOOPT(p), NOOPT(dist)) SORT d.count RETURN d").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN d`;
+      let i = 0;
+      while (cursor.hasNext()) {
+        let val = cursor.next();
+        let tokenDistance = levenshtein(pattern, val.title);
+        if (tokenDistance <= distance) {
+          assertTrue(i < resFilter.length);
+          assertEqual(val.count, resFilter[i].count);
+          ++i;
+        }
+      }
+      assertEqual(i, resFilter.length);
+    },
+    testBm25Relevance () {
+      const pattern = 'automaton';
+      for (let distance = 0; distance <= maxLevenshteinDistance; ++distance) {
+        let res = db._query(" FOR d IN view_content SEARCH LEVENSHTEIN_MATCH(d.title, '" +
+                            pattern + "', " + distance + ", false, 0 )" +
+                            " SORT BM25(d) DESC RETURN d").toArray();
+        let prevDistance = 0;
+        res.forEach(doc => {
+          let ld = levenshtein(doc.title, pattern);
+          assertTrue(ld >= prevDistance);
+          prevDistance = ld;
+        });
+      }
+      for (let distance = 0; distance <= maxDamerayLevenshteinDistance; ++distance) {
+        let res = db._query(" FOR d IN view_content SEARCH LEVENSHTEIN_MATCH(d.title, '" +
+                            pattern + "', " + distance + ", true, 0 )" +
+                            " SORT BM25(d) DESC RETURN d").toArray();
+        let prevDistance = 0;
+        res.forEach(doc => {
+          let dld = damlev.distance(doc.title, pattern);
+          assertTrue(dld >= prevDistance);
+          prevDistance = dld;
+        });
+      }
+    },
+    testTFIDFRelevance () {
+      const pattern = 'automaton';
+      for (let distance = 0; distance <= maxLevenshteinDistance; ++distance) {
+        let res = db._query(" FOR d IN view_content SEARCH LEVENSHTEIN_MATCH(d.title, '" +
+                            pattern + "', " + distance + ", false, 0 )" +
+                            " SORT TFIDF(d) DESC RETURN d").toArray();
+        let prevDistance = 0;
+        res.forEach(doc => {
+          let ld = levenshtein(doc.title, pattern);
+          assertTrue(ld >= prevDistance);
+          prevDistance = ld;
+        });
+      }
+      for (let distance = 0; distance <= maxDamerayLevenshteinDistance; ++distance) {
+        let res = db._query(" FOR d IN view_content SEARCH LEVENSHTEIN_MATCH(d.title, '" +
+                            pattern + "', " + distance + ", true, 0 )" +
+                            " SORT TFIDF(d) DESC RETURN d").toArray();
+        let prevDistance = 0;
+        res.forEach(doc => {
+          let dld = damlev.distance(doc.title, pattern);
+          assertTrue(dld >= prevDistance);
+          prevDistance = dld;
+        });
+      }
+    },
+    testFromSubquery: function () {
+      let patterns = db._query("RETURN TOKENS('Quick brown fox jumps over lazy dog', 'text_en')").toArray()[0];
+      let res = db._query(" LET tokens = " +
+                          " (FOR c IN TOKENS('Quick brown fox jumps over lazy dog', 'text_en') RETURN c) " +
+                          " FOR d IN view_content " +
+                          " SEARCH ANALYZER(STARTS_WITH(d.body, tokens, LENGTH(tokens)), 'my_delimiter') " +
+                          " SORT d.count RETURN d").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN {count: d.count, tokens: TOKENS(d.body, 'my_delimiter')}`;
+      // the search requires LENGTH(tokens) prefix matches, so the manual
+      // check has to apply the same threshold
+      doStartsWithTest(patterns, cursor, res, patterns.length);
+    },
+    testInSubquery: function () {
+      let patterns = ['Ameri', 'Quick', 'Slow'];
+      let res = db._query("FOR c IN ['Ameri', 'Quick', 'Slow'] " +
+                          "FOR d IN view_content " +
+                          "SEARCH ANALYZER(STARTS_WITH(d.title, c, 1), 'identity')" +
+                          "SORT d.count RETURN d ").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN {count: d.count, tokens: [d.title]}`;
+      doStartsWithTest(patterns, cursor, res, 1);
+    },
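+    // note: the manual check in the array-of-arrays case below is stricter
+    // than STARTS_WITH with min-match LENGTH(c): it only counts a document
+    // when a single token starts with every prefix of one group, whereas the
+    // search also accepts prefixes matched by different tokens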
+    testArrayOfArrays: function () {
+      let patterns = [['Ameri', 'Quick', 'Slow'], ['Fast', 'Long'], ['Offs', 'Suff']];
+      let res = db._query("FOR c IN [['Ameri', 'Quick', 'Slow'], ['Fast', 'Long'], ['Offs', 'Suff']] " +
+                          "FOR d IN view_content " +
+                          "SEARCH ANALYZER(STARTS_WITH(d.body, c, LENGTH(c)), 'my_delimiter') " +
+                          " SORT d.count RETURN d").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN {count: d.count, tokens: TOKENS(d.body, 'my_delimiter')}`;
+      let i = 0;
+      while (cursor.hasNext()) {
+        let found = true;
+        let val = cursor.next();
+        for (let t in val.tokens) {
+          let strToken = String(val.tokens[t]);
+          for (let p in patterns) {
+            found = true;
+            for (let s in patterns[p]) {
+              if (strToken.indexOf(patterns[p][s]) !== 0) {
+                found = false;
+                break;
+              }
+            }
+            // stop at the first group fully matched by this token
+            if (found) {
+              break;
+            }
+          }
+          if (found) {
+            assertTrue(i < res.length, "Document:" + val.count + " is missing from search");
+            assertEqual(val.count, res[i].count);
+            i++;
+            break;
+          }
+        }
+      }
+      assertEqual(i, res.length);
+    },
+    testFromConstArrayFullMatch: function () {
+      let patterns = ["Anarch", "Anar", "An"];
+      let res = db._query(" FOR d IN view_content " +
+                          " SEARCH ANALYZER(STARTS_WITH(d.title, ['Anarch', 'Anar', 'An'], 3), 'identity') " +
+                          " SORT d.count RETURN d").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN {count: d.count, tokens: [d.title]}`;
+      doStartsWithTest(patterns, cursor, res, 3);
+    },
+    testFromConstArrayOneMatch: function () {
+      let patterns = ["Anarch", "Auto", "Ameri"];
+      let res = db._query(" FOR d IN view_content " +
+                          " SEARCH ANALYZER(STARTS_WITH(d.title, ['Anarch', 'Auto', 'Ameri'], 1), 'identity') " +
+                          " SORT d.count RETURN d").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN {count: d.count, tokens: [d.title]}`;
+      doStartsWithTest(patterns, cursor, res, 1);
+    },
+    testStringTest: function () {
+      let pattern = "Anarch";
+      let res = db._query(" FOR d IN view_content " +
+                          " SEARCH ANALYZER(STARTS_WITH(d.title, '" + pattern + "'), 'identity') " +
+                          " SORT d.count RETURN d").toArray();
+      const cursor = query`FOR d IN wikipedia SORT d.count RETURN {count: d.count, tokens: [d.title]}`;
+      doStartsWithTest([pattern], cursor, res, 1);
+    }
+  };
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// @brief executes the test suite
+////////////////////////////////////////////////////////////////////////////////
+
+jsunity.run(arangoSearchMatchers);
+if (false === jsunity.done().status) {
+  throw "fail";
+}