From 4cdb5f2ace72336eaeffbe5dba56432b5a6f1a13 Mon Sep 17 00:00:00 2001 From: disservin Date: Wed, 30 Aug 2023 20:13:45 +0200 Subject: [PATCH] Switch from cutechess-cli to fast-chess fixes https://github.com/official-stockfish/fishtest/issues/2106 This PR switches from cutechess-cli to fast-chess. cutechess-cli has been serving us well in the past years, however, some issues have accumulated, namely the difficulty of compiling cutechess-cli, the observed timeouts at high concurrency and short TC, and e.g. slowness when indexing larger books. fast-chess https://github.com/Disservin/fast-chess has addressed these issues, and has now probably become mature enough to serve as the game manager for fishtest. As an example of its ability to deal with short TC and high concurrency: https://dfts-0.pigazzini.it/tests/view/669249cdbee8253775cede32 with concurrency 25, and TC 1+0.01s no timeouts are observed. fast-chess is built from sources, with the zip download as well as the binary cached as needed. There is fine-grained control over which version of fast-chess is used, so we can easily upgrade for new features. In this PR, fast-chess is built in cutechess compatibility to facilitate integration, and to benefit from the existing fishtest checks. Once validated, we should be able to switch easily to its native mode, which can output trinomial and pentanomial results, and we should be able significantly simplify the worker's book-keeping. Co-Authored-By: Disservin Co-Authored-By: Gahtan Nahdi <155860115+gahtan-syarif@users.noreply.github.com> --- AUTHORS | 1 + server/fishtest/api.py | 2 +- server/fishtest/rundb.py | 17 --- worker/games.py | 69 +++++------- worker/sri.txt | 2 +- worker/tests/test_worker.py | 8 +- worker/worker.py | 202 ++++++++++++++++++------------------ 7 files changed, 134 insertions(+), 167 deletions(-) diff --git a/AUTHORS b/AUTHORS index fd6026ad1..20855d96b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -14,6 +14,7 @@ Fanael Linithien (Fanael) Fauzi Akram Dabat (FauziAkram) FieryDragonLord Gabe (MrBrain295) +Gahtan Nahdi (gahtan-syarif) Giacomo Lorenzetti (G-Lorenz) Gian-Carlo Pascutto (gcp) Henri Wiechers (hwiechers) diff --git a/server/fishtest/api.py b/server/fishtest/api.py index 22755e5fa..6a403e512 100644 --- a/server/fishtest/api.py +++ b/server/fishtest/api.py @@ -34,7 +34,7 @@ according to the route/URL mapping defined in `__init__.py`. """ -WORKER_VERSION = 241 +WORKER_VERSION = 242 @exception_view_config(HTTPException) diff --git a/server/fishtest/rundb.py b/server/fishtest/rundb.py index 838bf7fdb..731d1facf 100644 --- a/server/fishtest/rundb.py +++ b/server/fishtest/rundb.py @@ -1158,23 +1158,6 @@ def priority(run): # lower is better if not have_binary: continue - # To avoid time losses in the case of large concurrency and short TC, - # probably due to cutechess-cli as discussed in issue #822, - # assign linux workers to LTC or multi-threaded jobs - # and windows workers only to LTC jobs - if max_threads >= 29: - if "windows" in worker_info["uname"].lower(): - tc_too_short = get_tc_ratio(run["args"]["tc"], base="55+0.5") < 1.0 - else: - tc_too_short = ( - get_tc_ratio( - run["args"]["tc"], run["args"]["threads"], "35+0.3" - ) - < 1.0 - ) - if tc_too_short: - continue - # Limit the number of cores. # Currently this is only done for spsa. if "spsa" in run["args"]: diff --git a/worker/games.py b/worker/games.py index 64463a3c5..8d56e3cff 100644 --- a/worker/games.py +++ b/worker/games.py @@ -67,7 +67,7 @@ def is_64bit(): HTTP_TIMEOUT = 30.0 -CUTECHESS_KILL_TIMEOUT = 15.0 +FASTCHESS_KILL_TIMEOUT = 15.0 UPDATE_RETRY_TIME = 15.0 RAWCONTENT_HOST = "https://raw.githubusercontent.com" @@ -504,24 +504,6 @@ def unzip(blob, save_dir): return file_list -def convert_book_move_counters(book_file): - # converts files with complete FENs, leaving others (incl. converted ones) unchanged - epds = [] - with open(book_file, "r") as file: - for fen in file: - fields = fen.split() - if len(fields) == 6 and fields[4].isdigit() and fields[5].isdigit(): - fields[4] = f"hmvc {fields[4]};" - fields[5] = f"fmvn {fields[5]};" - epds.append(" ".join(fields)) - else: - return - - with open(book_file, "w") as file: - for epd in epds: - file.write(epd + "\n") - - def clang_props(): """Parse the output of clang++ -E - -march=native -### and extract the available clang properties""" with subprocess.Popen( @@ -958,7 +940,7 @@ def results_to_score(results): assert abs(s5 - s3) < epsilon -def parse_cutechess_output( +def parse_fastchess_output( p, current_state, remote, result, spsa_tuning, games_to_play, batch_size, tc_limit ): hash_pattern = re.compile(r"(Base|New)-[a-f0-9]+") @@ -1002,13 +984,13 @@ def shorten_hash(match): # Parse line like this: # Warning: New-SHA doesn't have option ThreatBySafePawn if "Warning:" in line and "doesn't have option" in line: - message = r'Cutechess-cli says: "{}"'.format(line) + message = r'fast-chess says: "{}"'.format(line) raise RunException(message) # Parse line like this: # Warning: Invalid value for option P: -354 if "Warning:" in line and "Invalid value" in line: - message = r'Cutechess-cli says: "{}"'.format(line) + message = r'fast-chess says: "{}"'.format(line) raise RunException(message) # Parse line like this: @@ -1032,7 +1014,7 @@ def shorten_hash(match): validate_pentanomial( wld, rounds - ) # check if cutechess-cli result is compatible with + ) # check if fast-chess result is compatible with # our own bookkeeping pentanomial = [ @@ -1123,7 +1105,7 @@ def shorten_hash(match): return True -def launch_cutechess( +def launch_fastchess( cmd, current_state, remote, result, spsa_tuning, games_to_play, batch_size, tc_limit ): if spsa_tuning: @@ -1154,7 +1136,7 @@ def launch_cutechess( w_params = [] b_params = [] - # Run cutechess-cli binary. + # Run fast-chess binary. # Stochastic rounding and probability for float N.p: (N, 1-p); (N+1, p) idx = cmd.index("_spsa_") cmd = ( @@ -1179,7 +1161,7 @@ def launch_cutechess( + cmd[idx + 1 :] ) - # print(cmd) + # print(cmd) try: with subprocess.Popen( cmd, @@ -1201,7 +1183,7 @@ def launch_cutechess( close_fds=not IS_WINDOWS, ) as p: try: - task_alive = parse_cutechess_output( + task_alive = parse_fastchess_output( p, current_state, remote, @@ -1212,15 +1194,15 @@ def launch_cutechess( tc_limit, ) finally: - # We nicely ask cutechess-cli to stop. + # We nicely ask fast-chess to stop. try: send_sigint(p) except Exception as e: print("\nException in send_sigint:\n", e, sep="", file=sys.stderr) # now wait... - print("\nWaiting for cutechess-cli to finish ... ", end="", flush=True) + print("\nWaiting for fast-chess to finish ... ", end="", flush=True) try: - p.wait(timeout=CUTECHESS_KILL_TIMEOUT) + p.wait(timeout=FASTCHESS_KILL_TIMEOUT) except subprocess.TimeoutExpired: print("timeout", flush=True) kill_process(p) @@ -1228,12 +1210,12 @@ def launch_cutechess( print("done", flush=True) except (OSError, subprocess.SubprocessError) as e: print( - "Exception starting cutechess:\n", + "Exception starting fast-chess:\n", e, sep="", file=sys.stderr, ) - raise WorkerException("Unable to start cutechess. Error: {}".format(str(e))) + raise WorkerException("Unable to start fast-chess. Error: {}".format(str(e))) return task_alive @@ -1249,7 +1231,7 @@ def run_games( clear_binaries, global_cache, ): - # This is the main cutechess-cli driver. + # This is the main fast-chess driver. # It is ok, and even expected, for this function to # raise exceptions, implicitly or explicitly, if a # task cannot be completed. @@ -1317,7 +1299,7 @@ def run_games( start_game_index = opening_offset + input_total_games run_seed = int(hashlib.sha1(run["_id"].encode("utf-8")).hexdigest(), 16) % (2**30) - # Format options according to cutechess syntax. + # Format options according to fastchess syntax. def parse_options(s): results = [] chunks = s.split("=") @@ -1404,11 +1386,6 @@ def parse_options(s): blob = download_from_github(zipball) unzip(blob, testing_dir) - # convert .epd containing FENs into .epd containing EPDs with move counters - # only needed as long as cutechess-cli is the game manager - if book.endswith(".epd"): - convert_book_move_counters(testing_dir / book) - # Clean up the old networks (keeping the num_bkps most recent) num_bkps = 10 for old_net in sorted( @@ -1424,7 +1401,7 @@ def parse_options(s): file=sys.stderr, ) - # Add EvalFile* with full path to cutechess options, and download the networks if missing. + # Add EvalFile* with full path to fast-chess options, and download the networks if missing. for option, net in required_nets(base_engine).items(): base_options.append("option.{}={}".format(option, net)) establish_validated_net(remote, testing_dir, net, global_cache) @@ -1554,15 +1531,17 @@ def make_player(arg): if any(substring in book.upper() for substring in ["FRC", "960"]): variant = "fischerandom" - # Run cutechess binary. - cutechess = "cutechess-cli" + EXE_SUFFIX + # Run fastchess binary. + fastchess = "fast-chess" + EXE_SUFFIX cmd = ( [ - os.path.join(testing_dir, cutechess), + os.path.join(testing_dir, fastchess), "-recover", "-repeat", "-games", - str(int(games_to_play)), + "2", + "-rounds", + str(int(games_to_play) // 2), "-tournament", "gauntlet", ] @@ -1618,7 +1597,7 @@ def make_player(arg): + book_cmd ) - task_alive = launch_cutechess( + task_alive = launch_fastchess( cmd, current_state, remote, diff --git a/worker/sri.txt b/worker/sri.txt index ee2cad5b6..59386f6ad 100644 --- a/worker/sri.txt +++ b/worker/sri.txt @@ -1 +1 @@ -{"__version": 241, "updater.py": "Mg+pWOgGA0gSo2TuXuuLCWLzwGwH91rsW1W3ixg3jYauHQpRMtNdGnCfuD1GqOhV", "worker.py": "BMuQUpxZAKF0aP6ByTZY1r06MfPoIbdG2xraTrDQQRKgvhzJo6CKmeX2P8vX/QDm", "games.py": "9dFaa914vpqT7q4LLx2LlDdYwK6QFVX3h7+XRt18ATX0lt737rvFeBIiqakkttNC"} +{"__version": 242, "updater.py": "Mg+pWOgGA0gSo2TuXuuLCWLzwGwH91rsW1W3ixg3jYauHQpRMtNdGnCfuD1GqOhV", "worker.py": "jztkW6Di1yD8hx6nABYoT+CbB4V7aMBjgSBO4YajKF9y3yhVZ0hsdI5sGs/1KBFT", "games.py": "7RjSD5X3UP0DIgQecSpaRKTVFmf/4POUAHpNskuPVWNkdscE7+2PTQGfqHklIwho"} diff --git a/worker/tests/test_worker.py b/worker/tests/test_worker.py index 91cc84674..b1114d370 100644 --- a/worker/tests/test_worker.py +++ b/worker/tests/test_worker.py @@ -70,8 +70,12 @@ def test_sri(self): def test_toolchain_verification(self): self.assertTrue(worker.verify_toolchain()) - def test_setup_cutechess(self): - self.assertTrue(worker.setup_cutechess(Path.cwd())) + def test_setup_fastchess(self): + self.assertTrue( + worker.setup_fastchess( + Path.cwd(), list(worker.detect_compilers().keys())[0], 1, "" + ) + ) if __name__ == "__main__": diff --git a/worker/worker.py b/worker/worker.py index bfc40c95d..af7977f11 100644 --- a/worker/worker.py +++ b/worker/worker.py @@ -16,6 +16,7 @@ import stat import subprocess import sys +import tempfile import threading import time import traceback @@ -41,9 +42,12 @@ RunException, WorkerException, backup_log, + cache_read, + cache_write, download_from_github, format_return_code, log, + requests_get, run_games, send_api_post_request, str_signal, @@ -68,7 +72,7 @@ MIN_CLANG_MAJOR = 8 MIN_CLANG_MINOR = 0 -WORKER_VERSION = 241 +WORKER_VERSION = 242 FILE_LIST = ["updater.py", "worker.py", "games.py"] HTTP_TIMEOUT = 30.0 INITIAL_RETRY_TIME = 15.0 @@ -103,8 +107,8 @@ worker.py : worker() worker.py : fetch_and_handle_task() [in loop] games.py : run_games() -games.py : launch_cutechess() [in loop for spsa] -games.py : parse_cutechess_output() +games.py : launch_fastchess() [in loop for spsa] +games.py : parse_fastchess_output() Apis used by the worker ======================= @@ -120,8 +124,8 @@ /api/request_task POST /api/nn/ GET /git/trees/master GET - /git/trees/master/blobs/ GET /git/trees/master/blobs/ GET + /repos/Disservin/fast-chess/zipball/ GET /repos//zipball/ GET Main loop /api/update_task POST @@ -392,40 +396,17 @@ def get_credentials(config, options, args): return username, password -def download_cutechess(cutechess, save_dir): - if len(EXE_SUFFIX) > 0: - zipball = "cutechess-cli-win.zip" - elif IS_MACOS: - zipball = "cutechess-cli-macos-64bit.zip" - else: - zipball = "cutechess-cli-linux-{}.zip".format(platform.architecture()[0]) - try: - blob = download_from_github(zipball) - unzip(blob, save_dir) - - os.chmod(cutechess, os.stat(cutechess).st_mode | stat.S_IEXEC) - except Exception as e: - print( - "Exception downloading or extracting {}:\n".format(zipball), - e, - sep="", - file=sys.stderr, - ) - else: - print("Finished downloading {}".format(cutechess)) - +def verify_required_fastchess(fastchess_path, fastchess_sha): + # Verify that fastchess is working and has the required minimum version. -def verify_required_cutechess(cutechess_path): - # Verify that cutechess is working and has the required minimum version. - - if not cutechess_path.exists(): + if not fastchess_path.exists(): return False - print("Obtaining version info for {} ...".format(cutechess_path)) + print("Obtaining version info for {} ...".format(fastchess_path)) try: with subprocess.Popen( - [cutechess_path, "--version"], + [fastchess_path, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, @@ -433,98 +414,115 @@ def verify_required_cutechess(cutechess_path): close_fds=not IS_WINDOWS, ) as p: errors = p.stderr.read() - pattern = re.compile(r"cutechess-cli ([0-9]+)\.([0-9]+)\.([0-9]+)") - major, minor, patch = 0, 0, 0 + pattern = re.compile( + "fast-chess alpha [0-9]*.[0-9]*.[0-9]* [0-9]*-([0-9a-f-]*) \(compiled with cutechess output\)" + ) + short_sha = "" for line in iter(p.stdout.readline, ""): m = pattern.search(line) if m: print("Found", line.strip()) - major = int(m.group(1)) - minor = int(m.group(2)) - patch = int(m.group(3)) + short_sha = m.group(1) except (OSError, subprocess.SubprocessError) as e: - print("Unable to run cutechess-cli. Error: {}".format(str(e))) + print("Unable to run fast-chess. Error: {}".format(str(e))) return False if p.returncode != 0: print( - "Unable to run cutechess-cli. Return code: {}. Error: {}".format( + "Unable to run fast-chess. Return code: {}. Error: {}".format( format_return_code(p.returncode), errors ) ) return False - if major + minor + patch == 0: - print("Unable to find the version of cutechess-cli.") - return False + if len(short_sha) < 7: + print( + "Unable to find a suitable sha of length 7 or more in the fast-chess version." + ) - if (major, minor) < (1, 2): - print("Requires cutechess 1.2 or higher, found version doesn't match") + if not fastchess_sha.startswith(short_sha): + print( + "fast-chess sha {} required but the version shows {}".format( + fastchess_sha, short_sha + ) + ) return False return True -def setup_cutechess(worker_dir): +def setup_fastchess(worker_dir, compiler, concurrency, global_cache): # Create the testing directory if missing. testing_dir = worker_dir / "testing" testing_dir.mkdir(exist_ok=True) - curr_dir = Path.cwd() + fastchess_sha = "f65d3c375de5be0c2eb55c559f82ceaddadb87dc" + username = "Disservin" + fastchess = "fast-chess" + EXE_SUFFIX + if verify_required_fastchess(testing_dir / fastchess, fastchess_sha): + return True + + # build it ourselves try: - os.chdir(testing_dir) - except Exception as e: - print("Unable to enter {}. Error: {}".format(testing_dir, str(e))) - return False + item_url = ( + "https://api.github.com/repos/" + + username + + "/fast-chess/zipball/" + + fastchess_sha + ) - cutechess = "cutechess-cli" + EXE_SUFFIX - cutechess_path = testing_dir / cutechess + print("Building fast chess from sources at {}".format(item_url)) - # Download cutechess-cli if missing or overwrite if there are issues. - if not verify_required_cutechess(cutechess_path): - download_cutechess(cutechess, testing_dir) - else: - os.chdir(curr_dir) - return True + should_cache = False + blob = cache_read(global_cache, fastchess_sha + ".zip") - ret = True + if blob is None: + print("Downloading {}".format(item_url)) + blob = requests_get(item_url).content + should_cache = True + else: + print("Using {} from global cache".format(fastchess_sha + ".zip")) + + tmp_dir = Path(tempfile.mkdtemp(dir=testing_dir)) + file_list = unzip(blob, tmp_dir) + prefix = os.path.commonprefix([n.filename for n in file_list]) + + if should_cache: + cache_write(global_cache, fastchess_sha + ".zip", blob) - if not verify_required_cutechess(cutechess_path): + cd = os.getcwd() + os.chdir(tmp_dir / prefix) + + cmd = f"make -j{concurrency} USE_CUTE=true CXX={compiler} GIT_SHA={fastchess_sha[0:8]} GIT_DATE=01010101" + print(cmd) + with subprocess.Popen( + cmd, + shell=True, + env=os.environ, + stderr=subprocess.PIPE, + universal_newlines=True, + bufsize=1, + close_fds=not IS_WINDOWS, + ) as p: + errors = p.stderr.readlines() + + if p.returncode: + raise WorkerException("Executing {} failed. Error: {}".format(cmd, errors)) + + shutil.copy("fast-chess" + EXE_SUFFIX, testing_dir) + os.chdir(cd) + shutil.rmtree(tmp_dir) + + except Exception as e: print( - "The downloaded cutechess-cli is not working. Trying to restore a backup copy ..." - ) - bkp_cutechess_clis = sorted( - worker_dir.glob("_testing_*/" + cutechess), - key=os.path.getctime, - reverse=True, + "Exception downloading, extracting or building fast-chess:\n", + e, + sep="", + file=sys.stderr, ) - if bkp_cutechess_clis: - bkp_cutechess_cli = bkp_cutechess_clis[0] - try: - shutil.copy(bkp_cutechess_cli, testing_dir) - except Exception as e: - print( - "Unable to copy {} to {}. Error: {}".format( - bkp_cutechess_cli, testing_dir, str(e) - ) - ) - if not verify_required_cutechess(cutechess_path): - print( - "The backup copy {} doesn't work either ...".format( - bkp_cutechess_cli - ) - ) - print("No suitable cutechess-cli found") - ret = False - else: - print("No backup copy found") - print("No suitable cutechess-cli found") - ret = False - - os.chdir(curr_dir) - return ret + return verify_required_fastchess(testing_dir / fastchess, fastchess_sha) def validate(config, schema): @@ -821,7 +819,7 @@ def my_error(e): # Limit concurrency so that at least STC tests can run with the evailable memory # The memory need per engine is 16 for the TT Hash, 10 for the process 138 for the net and 16 per thread - # 60 is the need for cutechess-cli + # 60 is the need for fast-chess # These numbers need to be up-to-date with the server values STC_memory = 2 * (16 + 10 + 138 + 16) max_concurrency = int((options.max_memory - 60) / STC_memory) @@ -1512,12 +1510,21 @@ def worker(): print("Exception verifying worker version:\n", e, sep="", file=sys.stderr) return 1 + # Assemble the config/options data as well as some other data in a + # "worker_info" dictionary. + # This data will be sent to the server when a new task is requested. + + compiler, major, minor, patchlevel = options.compiler + print("Using {} {}.{}.{}".format(compiler, major, minor, patchlevel)) + # Check for common tool chain issues if not verify_toolchain(): return 1 - # Make sure we have a working cutechess-cli - if not setup_cutechess(worker_dir): + # Make sure we have a working fast-chess + if not setup_fastchess( + worker_dir, compiler, options.concurrency, options.global_cache + ): return 1 # Check if we are running an unmodified worker @@ -1525,13 +1532,6 @@ def worker(): if unmodified is None: return 1 - # Assemble the config/options data as well as some other data in a - # "worker_info" dictionary. - # This data will be sent to the server when a new task is requested. - - compiler, major, minor, patchlevel = options.compiler - print("Using {} {}.{}.{}".format(compiler, major, minor, patchlevel)) - uname = platform.uname() worker_info = { "uname": uname[0] + " " + uname[2] + (" (colab)" if IS_COLAB else ""),