From 32428a2510a5fb747fee07ad5aa81ccd43a06e00 Mon Sep 17 00:00:00 2001
From: hankcs
Date: Tue, 8 Oct 2024 01:34:11 -0700
Subject: [PATCH] Avoid redundant downloading and decompressing across processes

---
 hanlp/utils/io_util.py | 23 ++++++++++++++++++-----
 hanlp/version.py       |  2 +-
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/hanlp/utils/io_util.py b/hanlp/utils/io_util.py
index 38182101e..ea87c671c 100644
--- a/hanlp/utils/io_util.py
+++ b/hanlp/utils/io_util.py
@@ -90,6 +90,13 @@ def tempdir_human():
     return tempdir(now_filename())
 
 
+def temp_lock(path):
+    from filelock import FileLock
+    import hashlib
+    lock = FileLock(f"{tempdir()}/.{hashlib.md5(path.encode('utf8')).hexdigest()}.lock")
+    return lock
+
+
 def hanlp_home_default():
     """Default data directory depending on the platform and environment variables"""
     if windows():
@@ -292,6 +299,7 @@ def get_resource(path: str, save_dir=hanlp_home(), extract=True, prefix=HANLP_UR
         The real path to the resource.
 
     """
+    _path = path
     path = hanlp.pretrained.ALL.get(path, path)
     anchor: str = None
     compressed = None
@@ -333,12 +341,17 @@ def get_resource(path: str, save_dir=hanlp_home(), extract=True, prefix=HANLP_UR
         # realpath is where its path after exaction
         if compressed:
             realpath += compressed
-        if not os.path.isfile(realpath):
-            path = download(url=path, save_path=realpath, verbose=verbose)
-        else:
-            path = realpath
+        with temp_lock(path):
+            if not os.path.isfile(realpath):
+                path = download(url=path, save_path=realpath, verbose=verbose)
+            else:
+                path = realpath
     if extract and compressed:
-        path = uncompress(path, verbose=verbose)
+        with temp_lock(path):
+            if os.path.isfile(path):
+                path = uncompress(path, verbose=verbose)
+            else: # other process must have already decompressed it and deleted it
+                return get_resource(_path, save_dir, extract, prefix, append_location, verbose)
 
     if anchor:
         path = path_join(path, anchor)
diff --git a/hanlp/version.py b/hanlp/version.py
index 97d0286d9..25dd2fef3 100644
--- a/hanlp/version.py
+++ b/hanlp/version.py
@@ -2,7 +2,7 @@
 # Author: hankcs
 # Date: 2019-12-28 19:26
 
-__version__ = '2.1.0-beta.61'
+__version__ = '2.1.0-beta.62'
 """HanLP version"""
 
 