From 862166439198a1e5e3b7330d2e02ad4ce6bd1fef Mon Sep 17 00:00:00 2001
From: Dan King
Date: Mon, 16 Oct 2023 23:28:50 -0400
Subject: [PATCH 01/62] [fs] basic sync tool

CHANGELOG: Introduce `hailctl fs sync` which robustly transfers one or more files between Amazon S3, Azure Blob Storage, and Google Cloud Storage.

There are really two distinct conceptual changes remaining here. Given my waning time available, I am not going to split them into two pull requests. The changes are:

1. `basename` always agrees with the [`basename` UNIX utility](https://en.wikipedia.org/wiki/Basename). In particular, the folder `/foo/bar/baz/`'s basename is *not* `''`; it is `'baz'`. The only folders or objects whose basename is `''` are objects whose name literally ends in a slash, e.g. an *object* named `gs://foo/bar/baz/`.

2. `hailctl fs sync`, a robust copying tool with a user-friendly CLI.

`hailctl fs sync` comprises two pieces: `plan.py` and `sync.py`. The latter, `sync.py`, is simple: it delegates to our existing copy infrastructure. That copy infrastructure has been lightly modified to support this use-case. The former, `plan.py`, is a concurrent file system `diff`. `plan.py` generates and `sync.py` consumes a "plan folder" containing these files:

1. `matches`: files whose names and sizes match. Two columns: source URL, destination URL.

2. `differs`: files or folders whose names match but either differ in size or differ in type. Four columns: source URL, destination URL, source state, destination state. The states are either `file`, `dir`, or a size. If either state is a size, both states are sizes.

3. `srconly`: files only present in the source. One column: source URL.

4. `dstonly`: files only present in the destination. One column: destination URL.

5. `plan`: a proposed set of object-to-object copies. Two columns: source URL, destination URL.

6. `summary`: a one-line file containing the total number of copies in the plan and the total number of bytes which would be copied.

As described in the CLI documentation, the intended use of these commands is:

```
hailctl fs sync --make-plan plan1 --copy-to gs://gcs-bucket/a s3://s3-bucket/b
hailctl fs sync --use-plan plan1
```

The first command generates a plan folder and the second command executes the plan. Separating this process into two commands allows the user to verify exactly what will be copied, including the exact destination URLs. Moreover, if `hailctl fs sync --use-plan` fails, the user can re-run `hailctl fs sync --make-plan` to generate a new plan which avoids copying files that have already been copied successfully. Likewise, after a successful run, the user can re-run `hailctl fs sync --make-plan` to verify that every file was indeed copied successfully.

Testing. This change has a few sync-specific tests but largely reuses the tests for `hailtop.aiotools.copy`.

Future Work. Propagating a consistent kind of hash across all clouds and using that for detecting differences would be a better solution than the file-size-based difference used here. If all the clouds always provided the same type of hash value, this would be trivial to add. Alas, at time of writing, S3 and Google both support CRC32C for every blob (though, in S3, you must explicitly request it at object creation time), but *Azure Blob Storage does not*. ABS only supports MD5 sums, which Google does not support for multi-part uploads.
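Example. For concreteness, after the `--make-plan` step in the example invocation above, the plan folder might contain something like the following. The object names, file count, and byte total are hypothetical and are shown only to illustrate the tab-separated columns of `plan` and the one-line `summary`; a real plan reflects the actual listings of the source and destination.

```
$ hailctl fs sync --make-plan plan1 --copy-to gs://gcs-bucket/a s3://s3-bucket/b
$ cat plan1/summary
2	2097152
$ cat plan1/plan
gs://gcs-bucket/a/b/c	s3://s3-bucket/b/b/c
gs://gcs-bucket/a/d	s3://s3-bucket/b/d
$ hailctl fs sync --use-plan plan1
```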
--- hail/Makefile | 2 + hail/python/hailtop/aiocloud/aioaws/fs.py | 24 +- hail/python/hailtop/aiocloud/aioazure/fs.py | 24 +- .../aiogoogle/client/storage_client.py | 19 +- hail/python/hailtop/aiotools/copy.py | 30 +- hail/python/hailtop/aiotools/fs/copier.py | 13 +- hail/python/hailtop/aiotools/plan.py | 331 +++++++++++++++ hail/python/hailtop/aiotools/sync.py | 71 ++++ hail/python/hailtop/hailctl/__main__.py | 7 +- hail/python/hailtop/hailctl/fs/__init__.py | 0 hail/python/hailtop/hailctl/fs/cli.py | 116 +++++ .../test/hailtop/inter_cloud/test_copy.py | 399 +++++++++++++----- .../test/hailtop/inter_cloud/test_fs.py | 21 + .../inter_cloud/test_hailctl_fs_sync.py | 128 ++++++ 14 files changed, 1067 insertions(+), 118 deletions(-) create mode 100644 hail/python/hailtop/aiotools/plan.py create mode 100644 hail/python/hailtop/aiotools/sync.py create mode 100644 hail/python/hailtop/hailctl/fs/__init__.py create mode 100644 hail/python/hailtop/hailctl/fs/cli.py create mode 100644 hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py diff --git a/hail/Makefile b/hail/Makefile index e70b1cd1eb4..de697546dda 100644 --- a/hail/Makefile +++ b/hail/Makefile @@ -186,6 +186,8 @@ pytest-inter-cloud: upload-remote-test-resources install-editable HAIL_TEST_S3_BUCKET=hail-test-dy5rg \ HAIL_TEST_AZURE_ACCOUNT=hailtest \ HAIL_TEST_AZURE_CONTAINER=hail-test-4nxei \ + HAIL_TEST_AZURE_RESGRP=hail-dev \ + HAIL_TEST_AZURE_SUBID=22cd45fe-f996-4c51-af67-ef329d977519 \ $(HAIL_PYTHON3) -m pytest \ -Werror:::hail -Werror:::hailtop -Werror::ResourceWarning \ --log-cli-level=INFO \ diff --git a/hail/python/hailtop/aiocloud/aioaws/fs.py b/hail/python/hailtop/aiocloud/aioaws/fs.py index 8c22c851692..0994f3f46ed 100644 --- a/hail/python/hailtop/aiocloud/aioaws/fs.py +++ b/hail/python/hailtop/aiocloud/aioaws/fs.py @@ -93,8 +93,11 @@ def __init__(self, head_object_resp, url: str): self.head_object_resp = head_object_resp self._url = url + def __repr__(self): + return f'S3HeadObjectFileStatus({self.head_object_resp}, {self._url})' + def basename(self) -> str: - return os.path.basename(self._url.rstrip('/')) + return os.path.basename(self._url) def url(self) -> str: return self._url @@ -121,8 +124,11 @@ def __init__(self, item: Dict[str, Any], url: str): self._item = item self._url = url + def __repr__(self): + return f'S3ListFilesFileStatus({self._item}, {self._url})' + def basename(self) -> str: - return os.path.basename(self._url.rstrip('/')) + return os.path.basename(self._url) def url(self) -> str: return self._url @@ -195,8 +201,15 @@ def __init__(self, bucket: str, key: str, item: Optional[Dict[str, Any]]): self._item = item self._status: Optional[S3ListFilesFileStatus] = None + def __repr__(self): + return f'S3FileListEntry({self._bucket}, {self._key}, {self._item})' + def basename(self) -> str: - return os.path.basename(self._key.rstrip('/')) + object_name = self._key + if self._is_dir(): + assert object_name[-1] == '/' + object_name = object_name[:-1] + return os.path.basename(object_name) async def url(self) -> str: return f's3://{self._bucket}/{self._key}' @@ -204,9 +217,12 @@ async def url(self) -> str: async def is_file(self) -> bool: return self._item is not None - async def is_dir(self) -> bool: + def _is_dir(self) -> bool: return self._item is None + async def is_dir(self) -> bool: + return self._is_dir() + async def status(self) -> FileStatus: if self._status is None: if self._item is None: diff --git a/hail/python/hailtop/aiocloud/aioazure/fs.py b/hail/python/hailtop/aiocloud/aioazure/fs.py index 
185bbb8e384..e9a61a55407 100644 --- a/hail/python/hailtop/aiocloud/aioazure/fs.py +++ b/hail/python/hailtop/aiocloud/aioazure/fs.py @@ -243,8 +243,15 @@ def __init__(self, url: 'AzureAsyncFSURL', blob_props: Optional[BlobProperties]) self._blob_props = blob_props self._status: Optional[AzureFileStatus] = None + def __repr__(self): + return f'AzureFileListEntry({self._url}, {self._blob_props})' + def basename(self) -> str: - return os.path.basename(self._url.base.rstrip('/')) + url_no_sas = self._url.base + if self._is_dir(): + assert url_no_sas[-1] == '/' + url_no_sas = url_no_sas[:-1] + return os.path.basename(url_no_sas) async def url(self) -> str: return self._url.base @@ -255,9 +262,12 @@ async def url_full(self) -> str: async def is_file(self) -> bool: return self._blob_props is not None - async def is_dir(self) -> bool: + def _is_dir(self) -> bool: return self._blob_props is None + async def is_dir(self) -> bool: + return self._is_dir() + async def status(self) -> FileStatus: if self._status is None: if self._blob_props is None: @@ -271,8 +281,11 @@ def __init__(self, blob_props: BlobProperties, url: 'AzureAsyncFSURL'): self.blob_props = blob_props self._url = url + def __repr__(self): + return f'AzureFileStatus({self.blob_props}, {self._url})' + def basename(self) -> str: - return os.path.basename(self._url.base.rstrip('/')) + return os.path.basename(self._url.base) def url(self) -> str: return str(self._url) @@ -548,7 +561,10 @@ async def isdir(self, url: str) -> bool: fs_url = self.parse_url(url, error_if_bucket=True) assert not fs_url.path or fs_url.path.endswith('/'), fs_url.path client = self.get_container_client(fs_url) - async for _ in client.walk_blobs(name_starts_with=fs_url.path, include=['metadata'], delimiter='/'): + path = fs_url.path + if path[-1] != '/': + path = path + '/' + async for _ in client.walk_blobs(name_starts_with=path, include=['metadata'], delimiter='/'): return True return False diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index acb7e2d53fa..224506e0419 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -445,8 +445,11 @@ def __init__(self, items: Dict[str, str], url: str): self._items = items self._url = url + def __repr__(self): + return f'GetObjectFileStatus({self._items}, {self._url})' + def basename(self) -> str: - return os.path.basename(self._url.rstrip('/')) + return os.path.basename(self._url) def url(self) -> str: return self._url @@ -471,8 +474,15 @@ def __init__(self, bucket: str, name: str, items: Optional[Dict[str, Any]]): self._items = items self._status: Optional[GetObjectFileStatus] = None + def __repr__(self): + return f'GoogleStorageFileListEntry({self._bucket}, {self._name}, {self._items})' + def basename(self) -> str: - return os.path.basename(self._name.rstrip('/')) + object_name = self._name + if self._is_dir(): + assert object_name[-1] == '/' + object_name = object_name[:-1] + return os.path.basename(object_name) async def url(self) -> str: return f'gs://{self._bucket}/{self._name}' @@ -480,9 +490,12 @@ async def url(self) -> str: async def is_file(self) -> bool: return self._items is not None - async def is_dir(self) -> bool: + def _is_dir(self) -> bool: return self._items is None + async def is_dir(self) -> bool: + return self._is_dir() + async def status(self) -> FileStatus: if self._status is None: if self._items is None: diff --git 
a/hail/python/hailtop/aiotools/copy.py b/hail/python/hailtop/aiotools/copy.py index 26c438be6dc..bb0c3eabf8b 100644 --- a/hail/python/hailtop/aiotools/copy.py +++ b/hail/python/hailtop/aiotools/copy.py @@ -57,6 +57,14 @@ async def __aexit__(self, exc_type, exc, tb): self.task.cancel() +def only_update_completions(progress: Progress, tid): + def listen(delta: int): + if delta < 0: + progress.update(tid, advance=-delta) + + return listen + + async def copy( *, max_simultaneous_transfers: Optional[int] = None, @@ -66,6 +74,7 @@ async def copy( s3_kwargs: Optional[dict] = None, transfers: List[Transfer], verbose: bool = False, + totals: Optional[Tuple[int, int]] = None, ) -> None: with ThreadPoolExecutor() as thread_pool: if max_simultaneous_transfers is None: @@ -98,15 +107,22 @@ async def copy( ) as sema: file_tid = progress.add_task(description='files', total=0, visible=verbose) bytes_tid = progress.add_task(description='bytes', total=0, visible=verbose) + + if totals: + n_files, n_bytes = totals + progress.update(file_tid, total=n_files) + progress.update(bytes_tid, total=n_bytes) + file_listener = only_update_completions(progress, file_tid) + bytes_listener = only_update_completions(progress, bytes_tid) + else: + file_listener = make_listener(progress, file_tid) + bytes_listener = make_listener(progress, bytes_tid) + copy_report = await Copier.copy( - fs, - sema, - transfers, - files_listener=make_listener(progress, file_tid), - bytes_listener=make_listener(progress, bytes_tid), + fs, sema, transfers, files_listener=file_listener, bytes_listener=bytes_listener ) if verbose: - copy_report.summarize() + copy_report.summarize(include_sources=totals is None) def make_transfer(json_object: Dict[str, str]) -> Transfer: @@ -159,7 +175,7 @@ async def main() -> None: parser.add_argument( '-v', '--verbose', action='store_const', const=True, default=False, help='show logging information' ) - parser.add_argument('--timeout', type=str, default=None, help='show logging information') + parser.add_argument('--timeout', type=str, default=None, help='Set the total timeout for HTTP requests.') args = parser.parse_args() if args.verbose: diff --git a/hail/python/hailtop/aiotools/fs/copier.py b/hail/python/hailtop/aiotools/fs/copier.py index def7933a115..b934675f08f 100644 --- a/hail/python/hailtop/aiotools/fs/copier.py +++ b/hail/python/hailtop/aiotools/fs/copier.py @@ -143,7 +143,7 @@ def mark_done(self): self._end_time = time_msecs() self._duration = self._end_time - self._start_time - def summarize(self): + def summarize(self, include_sources: bool = True): source_reports = [] def add_source_reports(transfer_report): @@ -177,9 +177,10 @@ def add_source_reports(transfer_report): file_rate = total_files / (self._duration / 1000) print(f' Average file rate: {file_rate:,.1f}/s') - print('Sources:') - for sr in source_reports: - print(f' {sr._source}: {sr._files} files, {humanize.naturalsize(sr._bytes)}') + if include_sources: + print('Sources:') + for sr in source_reports: + print(f' {sr._source}: {sr._files} files, {humanize.naturalsize(sr._bytes)}') class SourceCopier: @@ -239,6 +240,7 @@ async def _copy_part( part_creator: MultiPartCreate, return_exceptions: bool, ) -> None: + total_written = 0 try: async with self.xfer_sema.acquire_manager(min(Copier.BUFFER_SIZE, this_part_size)): async with await self.router_fs.open_from( @@ -254,8 +256,9 @@ async def _copy_part( raise UnexpectedEOFError() written = await destf.write(b) assert written == len(b) - source_report.finish_bytes(written) + total_written += 
written n -= len(b) + source_report.finish_bytes(total_written) except Exception as e: if return_exceptions: source_report.set_exception(e) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py new file mode 100644 index 00000000000..1a9bdd6a64e --- /dev/null +++ b/hail/python/hailtop/aiotools/plan.py @@ -0,0 +1,331 @@ +from typing import List, Tuple, Optional +import asyncio +import os +import sys +from contextlib import AsyncExitStack +from hailtop.aiotools import FileAndDirectoryError + +from .router_fs import RouterAsyncFS +from .fs import FileListEntry, FileStatus, AsyncFS, WritableStream +from ..utils.rich_progress_bar import CopyToolProgressBar, Progress + +try: + import uvloop + + uvloop_install = uvloop.install +except ImportError as e: + if not sys.platform.startswith('win32'): + raise e + + def uvloop_install(): + pass + + +class PlanError(ValueError): + pass + + +async def plan( + folder: str, + copy_to: List[Tuple[str, str]], + copy_into: List[Tuple[str, str]], + gcs_requester_pays_project: Optional[str], + verbose: bool, + max_parallelism: int, +): + if gcs_requester_pays_project: + gcs_kwargs = {'gcs_requester_pays_configuration': gcs_requester_pays_project} + else: + gcs_kwargs = {} + + total_n_files = 0 + total_n_bytes = 0 + + async with AsyncExitStack() as cleanups: + fs = await cleanups.enter_async_context(RouterAsyncFS(gcs_kwargs=gcs_kwargs)) + source_destination_pairs = [ + *copy_to, + *((src, copied_into_path(fs, source_path=src, folder_path=dst)) for src, dst in copy_into), + ] + + if any(await asyncio.gather(fs.isfile(folder), fs.isdir(folder.rstrip('/') + '/'))): + raise PlanError(f'plan folder already exists: {folder}', 1) + + await fs.mkdir(folder) + matches = await cleanups.enter_async_context(await fs.create(os.path.join(folder, 'matches'))) + differs = await cleanups.enter_async_context(await fs.create(os.path.join(folder, 'differs'))) + srconly = await cleanups.enter_async_context(await fs.create(os.path.join(folder, 'srconly'))) + dstonly = await cleanups.enter_async_context(await fs.create(os.path.join(folder, 'dstonly'))) + plan = await cleanups.enter_async_context(await fs.create(os.path.join(folder, 'plan'))) + + progress = cleanups.enter_context(CopyToolProgressBar(transient=True, disable=not verbose)) + + for src, dst in source_destination_pairs: + n_files, n_bytes = await find_all_copy_pairs( + fs, + matches, + differs, + srconly, + dstonly, + plan, + src, + dst, + progress, + asyncio.Semaphore(max_parallelism), + source_must_exist=True, + ) + total_n_files += n_files + total_n_bytes += n_bytes + + summary = await cleanups.enter_async_context(await fs.create(os.path.join(folder, 'summary'))) + await summary.write((f'{total_n_files}\t{total_n_bytes}\n').encode('utf-8')) + + +def copied_into_path(fs: AsyncFS, *, source_path: str, folder_path: str) -> str: + src_url = fs.parse_url(source_path) + dest_url = fs.parse_url(folder_path) + src_basename = os.path.basename(src_url.path.rstrip('/')) + return str(dest_url.with_new_path_component(src_basename)) + + +class FileStat: + def __init__(self, basename, url, is_dir, size): + self.basename = basename + self.url = url + self.is_dir = is_dir + self.size = size + + def __repr__(self) -> str: + return f'FileStat({self.basename}, {self.url}, {self.is_dir}, {self.size})' + + @staticmethod + async def from_file_list_entry(x: FileListEntry) -> 'FileStat': + url, is_dir = await asyncio.gather(x.url(), x.is_dir()) + if is_dir: + size = 0 + else: + size = await (await 
x.status()).size() + return FileStat(x.basename(), url, is_dir, size) + + @staticmethod + async def from_file_status(x: FileStatus) -> 'FileStat': + return FileStat(x.basename(), x.url(), False, await x.size()) + + +async def listfiles(fs: AsyncFS, x: str) -> List[FileStat]: + try: + it = await fs.listfiles(x) + return [await FileStat.from_file_list_entry(x) async for x in it] + except (FileNotFoundError, NotADirectoryError): + return [] + + +async def statfile(fs: AsyncFS, x: str) -> Optional[FileStat]: + try: + file_status = await fs.statfile(x) + return await FileStat.from_file_status(file_status) + except FileNotFoundError: + return None + + +async def writeline(file: WritableStream, *columns: str): + await file.write(('\t'.join(columns) + '\n').encode('utf-8')) + + +def relativize_url(folder: str, file: str) -> str: + if folder[-1] != '/': + folder = folder + '/' + relative_path = file.removeprefix(folder) + assert relative_path[0] != '/' + return relative_path + + +async def find_all_copy_pairs( + fs: AsyncFS, + matches: WritableStream, + differs: WritableStream, + srconly: WritableStream, + dstonly: WritableStream, + plan: WritableStream, + src: str, + dst: str, + progress: Progress, + sema: asyncio.Semaphore, + source_must_exist: bool, +) -> Tuple[int, int]: + async with sema: + srcstat, srcfiles, dststat, dstfiles = await asyncio.gather( + statfile(fs, src), + listfiles(fs, src), + statfile(fs, dst), + listfiles(fs, dst), + ) + + if srcstat: + if srcstat.url[-1] == '/': + if srcstat.size == 0: + srcstat = None + if srcfiles: + print(f'Not copying size-zero source file which shares a name with a source directory. {src}') + else: + print(f'Not copying size-zero source file whose name looks like a directory. {src}') + else: + raise PlanError( + f'Source is a non-size-zero file whose name looks like a directory. This is not supported. {src} {srcstat}', + 1, + ) + elif srcfiles: + raise PlanError( + f'Source is both a directory and a file (other than a size-zero file whose name ends in "/"). ' + f'This is not supported. {src} {srcstat}', + 1, + ) from FileAndDirectoryError(src) + if dststat: + if dststat.url[-1] == '/': + if dststat.size == 0: + dststat = None + if dstfiles: + print( + f'Ignoring size-zero destination file which shares a name with a destination directory. {dst}' + ) + else: + print(f'Ignoring size-zero destination file whose name looks like a directory. {dst}') + else: + raise PlanError( + f'Destination is a non-size-zero file whose name looks like a directory. This is not supported. {dst} {dststat}', + 1, + ) + elif dstfiles: + raise PlanError( + f'Destination identifies both a directory and a file (other than a size-zero file whose name ends ' + f'in "/"). This is not supported. {dst} {dststat}', + 1, + ) from FileAndDirectoryError(dst) + if srcstat and dstfiles: + raise PlanError( + f'Source is a file but destination is a directory. This is not supported. {src} -> {dst}', 1 + ) from IsADirectoryError(dst) + if srcfiles and dststat: + raise PlanError( + f'Source is a directory but destination is a file. This is not supported. 
{src} -> {dst}', 1 + ) from IsADirectoryError(src) + if source_must_exist and not srcstat and not srcfiles: + raise PlanError(f'Source is neither a folder nor a file: {src}', 1) from FileNotFoundError(src) + + if srcstat: + assert len(srcfiles) == 0 + assert len(dstfiles) == 0 + + if dststat: + if srcstat.size == dststat.size: + await writeline(matches, srcstat.url, dststat.url) + return 0, 0 + await writeline(differs, srcstat.url, dststat.url, str(srcstat.size), str(dststat.size)) + return 0, 0 + await writeline(srconly, srcstat.url) + await writeline(plan, srcstat.url, dst) + return 1, srcstat.size + + srcfiles.sort(key=lambda x: x.basename) + dstfiles.sort(key=lambda x: x.basename) + + tid = progress.add_task(description=src, total=len(srcfiles) + len(dstfiles)) + + srcidx = 0 + dstidx = 0 + + n_files = 0 + n_bytes = 0 + + async def process_child_directory(new_srcurl: str, new_dsturl: str) -> Tuple[int, int]: + return await find_all_copy_pairs( + fs, + matches, + differs, + srconly, + dstonly, + plan, + new_srcurl, + new_dsturl, + progress, + sema, + source_must_exist=False, + ) + + async def retrieve_child_directory_results(child_directory_task): + nonlocal n_files + nonlocal n_bytes + dir_n_files, dir_n_bytes = await child_directory_task + n_files += dir_n_files + n_bytes += dir_n_bytes + + async with AsyncExitStack() as child_directory_callbacks: + + def background_process_child_dir(new_srcurl: str, new_dsturl: str): + t = asyncio.create_task(process_child_directory(new_srcurl, new_dsturl)) + child_directory_callbacks.push_async_callback(retrieve_child_directory_results, t) + + while srcidx < len(srcfiles) and dstidx < len(dstfiles): + srcf = srcfiles[srcidx] + dstf = dstfiles[dstidx] + if srcf.basename == dstf.basename: + if srcf.is_dir and dstf.is_dir: + background_process_child_dir(srcf.url, dstf.url) + elif srcf.is_dir and not dstf.is_dir: + await writeline(differs, srcf.url, dstf.url, 'dir', 'file') + elif not srcf.is_dir and dstf.is_dir: + await writeline(differs, srcf.url, dstf.url, 'file', 'dir') + elif srcf.size == dstf.size: + await writeline(matches, srcf.url, dstf.url) + else: + await writeline(differs, srcf.url, dstf.url, str(srcf.size), str(dstf.size)) + dstidx += 1 + srcidx += 1 + progress.update(tid, advance=2) + elif srcf.basename < dstf.basename: + if srcf.is_dir: + background_process_child_dir( + srcf.url, + os.path.join(dst, relativize_url(folder=src, file=srcf.url)), + ) + else: + await writeline(srconly, srcf.url) + await writeline(plan, srcf.url, os.path.join(dst, srcf.basename)) + n_files += 1 + n_bytes += srcf.size + srcidx += 1 + progress.update(tid, advance=1) + else: + assert srcf.basename >= dstf.basename + dstidx += 1 + progress.update(tid, advance=1) + await writeline(dstonly, dstf.url) + + while srcidx < len(srcfiles): + srcf = srcfiles[srcidx] + + if srcf.is_dir: + background_process_child_dir( + srcf.url, + os.path.join(dst, relativize_url(folder=src, file=srcf.url)), + ) + else: + await writeline(srconly, srcf.url) + await writeline(plan, srcf.url, os.path.join(dst, srcf.basename)) + n_files += 1 + n_bytes += srcf.size + srcidx += 1 + progress.update(tid, advance=1) + + while dstidx < len(dstfiles): + dstf = dstfiles[dstidx] + + await writeline(dstonly, dstf.url) + dstidx += 1 + progress.update(tid, advance=1) + + # a short sleep ensures the progress bar is visible for a moment to the user + await asyncio.sleep(0.150) + progress.remove_task(tid) + + return n_files, n_bytes diff --git a/hail/python/hailtop/aiotools/sync.py 
b/hail/python/hailtop/aiotools/sync.py new file mode 100644 index 00000000000..4a509a12b16 --- /dev/null +++ b/hail/python/hailtop/aiotools/sync.py @@ -0,0 +1,71 @@ +from typing import Optional +import asyncio +import os +import sys + +from .fs.copier import Transfer +from .router_fs import RouterAsyncFS, AsyncFS +from .copy import copy + +try: + import uvloop + + uvloop_install = uvloop.install +except ImportError as e: + if not sys.platform.startswith('win32'): + raise e + + def uvloop_install(): + pass + + +class SyncError(ValueError): + pass + + +async def sync( + plan_folder: str, + gcs_requester_pays_project: Optional[str], + verbose: bool, + max_parallelism: int, +) -> None: + gcs_kwargs = {'gcs_requester_pays_configuration': gcs_requester_pays_project} + s3_kwargs = {'max_pool_connections': max_parallelism * 5, 'max_workers': max_parallelism} + + async with RouterAsyncFS(gcs_kwargs=gcs_kwargs, s3_kwargs=s3_kwargs) as fs: + if not all( + await asyncio.gather( + *( + fs.exists(os.path.join(plan_folder, x)) + for x in ('matches', 'differs', 'srconly', 'dstonly', 'plan', 'summary') + ) + ) + ): + raise SyncError('Run hailctl fs sync --make-plan first.', 1) + results = (await fs.read(os.path.join(plan_folder, 'summary'))).decode('utf-8') + n_files, n_bytes = (int(x) for x in results.split('\t')) + await copy( + max_simultaneous_transfers=max_parallelism, + local_kwargs=None, + gcs_kwargs=gcs_kwargs, + azure_kwargs={}, + s3_kwargs=s3_kwargs, + transfers=[ + Transfer(src, dst, treat_dest_as=Transfer.DEST_IS_TARGET) + async for src, dst in iterate_plan_file(plan_folder, fs) + ], + verbose=verbose, + totals=(n_files, n_bytes), + ) + + +async def iterate_plan_file(plan_folder: str, fs: AsyncFS): + lineno = 0 + plan = (await fs.read(os.path.join(plan_folder, 'plan'))).decode('utf-8') + for line in plan.split('\n'): + if not line: + continue + parts = line.strip().split('\t') + if len(parts) != 2: + raise SyncError(f'Malformed plan line, {lineno}, must have exactly one tab: {line}', 1) + yield parts diff --git a/hail/python/hailtop/hailctl/__main__.py b/hail/python/hailtop/hailctl/__main__.py index 999c10c60d2..14426f0948a 100644 --- a/hail/python/hailtop/hailctl/__main__.py +++ b/hail/python/hailtop/hailctl/__main__.py @@ -1,4 +1,6 @@ +from typing import cast import typer +import click import os from .auth import cli as auth_cli @@ -8,6 +10,7 @@ from .dataproc import cli as dataproc_cli from .dev import cli as dev_cli from .hdinsight import cli as hdinsight_cli +from .fs import cli as fs_cli app = typer.Typer( @@ -69,4 +72,6 @@ async def _curl( def main(): - app() + click_app = cast(click.Group, typer.main.get_command(app)) + click_app.add_command(fs_cli.app) + click_app() diff --git a/hail/python/hailtop/hailctl/fs/__init__.py b/hail/python/hailtop/hailctl/fs/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/hail/python/hailtop/hailctl/fs/cli.py b/hail/python/hailtop/hailctl/fs/cli.py new file mode 100644 index 00000000000..1c02f7604f2 --- /dev/null +++ b/hail/python/hailtop/hailctl/fs/cli.py @@ -0,0 +1,116 @@ +from typing import Optional, List, Tuple, cast +import asyncio +import click +import sys +import typer + +from hailtop.aiotools.plan import plan, PlanError +from hailtop.aiotools.sync import sync as aiotools_sync, SyncError + + +app_without_click = typer.Typer( + name='fs', + no_args_is_help=True, + help='Use cloud object stores and local filesystems.', + pretty_exceptions_show_locals=False, +) + + +@app_without_click.callback() +def callback(): + """ + 
Typer app, including Click subapp + """ + + +@click.command() +@click.option( + '--copy-to', + help='Pairs of source and destination URL. May be specified multiple times. The destination is always treated as a file. See --copy-into to copy into a directory', + type=(str, str), + required=False, + multiple=True, + default=(), +) +@click.option( + '--copy-into', + help='Copies the source path into the target path. The target must not be a file.', + type=(str, str), + required=False, + multiple=True, + default=(), +) +@click.option( + '-v', + '--verbose', + help='The Google project to which to charge egress costs.', + is_flag=True, + required=False, + default=False, +) +@click.option( + '--max-parallelism', help='The maximum number of concurrent requests.', type=int, required=False, default=75 +) +@click.option( + '--make-plan', + help='The folder in which to create a new synchronization plan. Must not exist.', + type=str, + required=False, +) +@click.option('--use-plan', help='The plan to execute. Must exist.', type=str, required=False) +@click.option( + '--gcs-requester-pays-project', help='The Google project to which to charge egress costs.', type=str, required=False +) +def sync( + copy_to: List[Tuple[str, str]], + copy_into: List[Tuple[str, str]], + verbose: bool, + max_parallelism: int, + make_plan: Optional[str] = None, + use_plan: Optional[str] = None, + gcs_requester_pays_project: Optional[str] = None, +): + """Synchronize files between one or more pairs of locations. + + If a corresponding file already exists at the destination with the same size in bytes, this + command will not copy it. If you want to replace files that have the exact same size in bytes, + delete the destination files first. THIS COMMAND DOES NOT CHECK MD5s OR SHAs! + + First generate a plan with --make-plan, then use the plan with --use-plan. + + Copy all the files under gs://gcs-bucket/a to s3://s3-bucket/b. For example, + gs://gcs-bucket/a/b/c will appear at s3://s3-bucket/b/b/c: + + + + $ hailctl fs sync --make-plan plan1 --copy-to gs://gcs-bucket/a s3://s3-bucket/b + + + + $ hailctl fs sync --use-plan plan1 + """ + if (make_plan is None and use_plan is None) or (make_plan is not None and use_plan is not None): + print('Must specify one of --make-plan or --use-plan. See hailctl fs sync --help.') + raise typer.Exit(1) + + if make_plan: + try: + asyncio.run(plan(make_plan, copy_to, copy_into, gcs_requester_pays_project, verbose, max_parallelism)) + except PlanError as err: + print('ERROR: ' + err.args[0]) + sys.exit(err.args[1]) + if use_plan: + if copy_to or copy_into: + print( + 'Do not specify --copy-to or --copy-into with --use-plan. Create the plan with --make-plan then call --use-plan without any --copy-to and --copy-into.' 
+ ) + raise typer.Exit(1) + try: + asyncio.run(aiotools_sync(use_plan, gcs_requester_pays_project, verbose, max_parallelism)) + except SyncError as err: + print('ERROR: ' + err.args[0]) + sys.exit(err.args[1]) + + +app = cast(click.Group, typer.main.get_command(app_without_click)) +app.add_command(sync) diff --git a/hail/python/test/hailtop/inter_cloud/test_copy.py b/hail/python/test/hailtop/inter_cloud/test_copy.py index 271aed58f0e..d2dc4fb3f67 100644 --- a/hail/python/test/hailtop/inter_cloud/test_copy.py +++ b/hail/python/test/hailtop/inter_cloud/test_copy.py @@ -3,14 +3,20 @@ import secrets from concurrent.futures import ThreadPoolExecutor import asyncio +import tempfile import functools import pytest from hailtop.utils import url_scheme, bounded_gather2 -from hailtop.aiotools import LocalAsyncFS, Transfer, FileAndDirectoryError, Copier, AsyncFS, FileListEntry +from hailtop.aiotools.plan import plan, PlanError +from hailtop.aiotools.sync import sync, SyncError from hailtop.aiotools.router_fs import RouterAsyncFS -from hailtop.aiocloud.aiogoogle import GoogleStorageAsyncFS -from hailtop.aiocloud.aioaws import S3AsyncFS -from hailtop.aiocloud.aioazure import AzureAsyncFS +from hailtop.aiotools import ( + Transfer, + FileAndDirectoryError, + Copier, + AsyncFS, + FileListEntry, +) from .generate_copy_test_specs import run_test_spec, create_test_file, create_test_dir @@ -97,6 +103,61 @@ async def expect_file(fs, path, expected): assert actual == expected, (actual, expected) +async def copier_copy(fs, sema, transfer): + await Copier.copy(fs, sema, transfer) + + +async def sync_tool(fs: RouterAsyncFS, sema, transfer: Transfer): + max_parallelism = sema._value + + sources = transfer.src + if isinstance(sources, str): + sources = [sources] + + copy_to = [] + copy_into = [] + + if transfer.treat_dest_as == Transfer.DEST_DIR or ( + transfer.treat_dest_as == Transfer.INFER_DEST and await fs.isdir(transfer.dest) + ): + copy_into = [(src, transfer.dest) for src in sources] + elif transfer.treat_dest_as == Transfer.DEST_IS_TARGET or ( + transfer.treat_dest_as == Transfer.INFER_DEST and not await fs.isdir(transfer.dest) + ): + if await fs.isfile(transfer.dest): + if len(sources) > 1 or await fs.isdir(sources[0]): + raise NotADirectoryError(transfer.dest) + copy_to = [(src, transfer.dest) for src in sources] + else: + raise ValueError(f'unsupported treat_dest_as: {transfer.treat_dest_as}') + + with tempfile.TemporaryDirectory() as folder: + try: + await plan( + os.path.join(folder, 'planfolder'), + copy_to, + copy_into, + None, + True, + max_parallelism, + ) + await sync(os.path.join(folder, 'planfolder'), None, True, max_parallelism) + except (PlanError, SyncError) as err: + if err.__cause__: + raise err.__cause__ + else: + raise err + + +@pytest.fixture(params=['Copier.copy', 'hailctl_sync']) +def copy_tool(request): + if request.param == 'Copier.copy': + return copier_copy + if request.param == 'hailctl_sync': + return sync_tool + raise ValueError('bad: ' + request.param) + + class DidNotRaiseError(Exception): pass @@ -126,24 +187,24 @@ def __exit__(self, type, value, traceback): return True -async def test_copy_doesnt_exist(copy_test_context): +async def test_copy_doesnt_exist(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context with pytest.raises(FileNotFoundError): - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base)) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base)) -async def test_copy_file(copy_test_context): +async def 
test_copy_file(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) await expect_file(fs, f'{dest_base}a', 'src/a') -async def test_copy_large_file(copy_test_context): +async def test_copy_large_file(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context # mainly needs to be larger than the transfer block size (8K) @@ -151,44 +212,54 @@ async def test_copy_large_file(copy_test_context): async with await fs.create(f'{src_base}a') as f: await f.write(contents) - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) async with await fs.open(f'{dest_base}a') as f: copy_contents = await f.read() assert copy_contents == contents -async def test_copy_rename_file(copy_test_context): +async def test_copy_rename_file(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x')) + await copy_tool(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x')) await expect_file(fs, f'{dest_base}x', 'src/a') -async def test_copy_rename_file_dest_target_file(copy_test_context): +async def test_copy_rename_file_dest_target_file(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_IS_TARGET)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_IS_TARGET), + ) await expect_file(fs, f'{dest_base}x', 'src/a') -async def test_copy_file_dest_target_directory_doesnt_exist(copy_test_context): +async def test_copy_file_dest_target_directory_doesnt_exist(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') # SourceCopier._copy_file creates destination directories as needed - await Copier.copy(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_DIR)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_DIR), + ) await expect_file(fs, f'{dest_base}x/a', 'src/a') -async def test_overwrite_rename_file(copy_test_context): +async def test_overwrite_rename_file( + copy_test_context, +): # hailctl fs sync does not support overwriting sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') @@ -199,153 +270,183 @@ async def test_overwrite_rename_file(copy_test_context): await expect_file(fs, f'{dest_base}x', 'src/a') -async def test_copy_rename_dir(copy_test_context): +async def test_copy_rename_dir(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_dir(fs, 'src', src_base, 'a/') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x')) + await copy_tool(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x')) await expect_file(fs, f'{dest_base}x/file1', 'src/a/file1') await expect_file(fs, f'{dest_base}x/subdir/file2', 'src/a/subdir/file2') -async def test_copy_rename_dir_dest_is_target(copy_test_context): +async def 
test_copy_rename_dir_dest_is_target(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_dir(fs, 'src', src_base, 'a/') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_IS_TARGET)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_IS_TARGET), + ) await expect_file(fs, f'{dest_base}x/file1', 'src/a/file1') await expect_file(fs, f'{dest_base}x/subdir/file2', 'src/a/subdir/file2') async def test_overwrite_rename_dir(copy_test_context): + copy_tool = Copier.copy # hailctl fs sync does not support overwrite sema, fs, src_base, dest_base = copy_test_context await create_test_dir(fs, 'src', src_base, 'a/') await create_test_dir(fs, 'dest', dest_base, 'x/') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_IS_TARGET)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}a', f'{dest_base}x', treat_dest_as=Transfer.DEST_IS_TARGET), + ) await expect_file(fs, f'{dest_base}x/file1', 'src/a/file1') await expect_file(fs, f'{dest_base}x/subdir/file2', 'src/a/subdir/file2') await expect_file(fs, f'{dest_base}x/file3', 'dest/x/file3') -async def test_copy_file_dest_trailing_slash_target_dir(copy_test_context): +async def test_copy_file_dest_trailing_slash_target_dir(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base, treat_dest_as=Transfer.DEST_DIR)) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base, treat_dest_as=Transfer.DEST_DIR)) await expect_file(fs, f'{dest_base}a', 'src/a') -async def test_copy_file_dest_target_dir(copy_test_context): +async def test_copy_file_dest_target_dir(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'), treat_dest_as=Transfer.DEST_DIR)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}a', dest_base.rstrip('/'), treat_dest_as=Transfer.DEST_DIR), + ) await expect_file(fs, f'{dest_base}a', 'src/a') -async def test_copy_file_dest_target_file(copy_test_context): +async def test_copy_file_dest_target_file(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', f'{dest_base}a', treat_dest_as=Transfer.DEST_IS_TARGET)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}a', f'{dest_base}a', treat_dest_as=Transfer.DEST_IS_TARGET), + ) await expect_file(fs, f'{dest_base}a', 'src/a') -async def test_copy_dest_target_file_is_dir(copy_test_context): +async def test_copy_dest_target_file_is_dir(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') with RaisesOrObjectStore(dest_base, IsADirectoryError): - await Copier.copy( - fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'), treat_dest_as=Transfer.DEST_IS_TARGET) + await copy_tool( + fs, + sema, + Transfer( + f'{src_base}a', + dest_base.rstrip('/'), + treat_dest_as=Transfer.DEST_IS_TARGET, + ), ) async def test_overwrite_file(copy_test_context): + copy_tool = Copier.copy # hailctl fs sync does not support overwriting sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') 
await create_test_file(fs, 'dest', dest_base, 'a') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) await expect_file(fs, f'{dest_base}a', 'src/a') async def test_copy_file_src_trailing_slash(copy_test_context): + copy_tool = Copier.copy # hailctl fs sync does not support trailing slashes on files sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') with pytest.raises(FileNotFoundError): - await Copier.copy(fs, sema, Transfer(f'{src_base}a/', dest_base)) + await copy_tool(fs, sema, Transfer(f'{src_base}a/', dest_base)) -async def test_copy_dir(copy_test_context): +async def test_copy_dir(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_dir(fs, 'src', src_base, 'a/') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) await expect_file(fs, f'{dest_base}a/file1', 'src/a/file1') await expect_file(fs, f'{dest_base}a/subdir/file2', 'src/a/subdir/file2') async def test_overwrite_dir(copy_test_context): + copy_tool = Copier.copy # hailctl fs sync does not support overwrite sema, fs, src_base, dest_base = copy_test_context await create_test_dir(fs, 'src', src_base, 'a/') await create_test_dir(fs, 'dest', dest_base, 'a/') - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) await expect_file(fs, f'{dest_base}a/file1', 'src/a/file1') await expect_file(fs, f'{dest_base}a/subdir/file2', 'src/a/subdir/file2') await expect_file(fs, f'{dest_base}a/file3', 'dest/a/file3') -async def test_copy_multiple(copy_test_context): +async def test_copy_multiple(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') await create_test_file(fs, 'src', src_base, 'b') - await Copier.copy(fs, sema, Transfer([f'{src_base}a', f'{src_base}b'], dest_base.rstrip('/'))) + await copy_tool(fs, sema, Transfer([f'{src_base}a', f'{src_base}b'], dest_base.rstrip('/'))) await expect_file(fs, f'{dest_base}a', 'src/a') await expect_file(fs, f'{dest_base}b', 'src/b') -async def test_copy_multiple_dest_target_file(copy_test_context): +async def test_copy_multiple_dest_target_file(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') await create_test_file(fs, 'src', src_base, 'b') with RaisesOrObjectStore(dest_base, NotADirectoryError): - await Copier.copy( + await copy_tool( fs, sema, - Transfer([f'{src_base}a', f'{src_base}b'], dest_base.rstrip('/'), treat_dest_as=Transfer.DEST_IS_TARGET), + Transfer( + [f'{src_base}a', f'{src_base}b'], + dest_base.rstrip('/'), + treat_dest_as=Transfer.DEST_IS_TARGET, + ), ) -async def test_copy_multiple_dest_file(copy_test_context): +async def test_copy_multiple_dest_file(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') @@ -353,22 +454,28 @@ async def test_copy_multiple_dest_file(copy_test_context): await create_test_file(fs, 'dest', dest_base, 'x') with RaisesOrObjectStore(dest_base, NotADirectoryError): - await Copier.copy(fs, sema, Transfer([f'{src_base}a', f'{src_base}b'], f'{dest_base}x')) + await copy_tool(fs, sema, Transfer([f'{src_base}a', f'{src_base}b'], 
f'{dest_base}x')) -async def test_file_overwrite_dir(copy_test_context): +async def test_file_overwrite_dir(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_file(fs, 'src', src_base, 'a') with RaisesOrObjectStore(dest_base, IsADirectoryError): - await Copier.copy( - fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'), treat_dest_as=Transfer.DEST_IS_TARGET) + await copy_tool( + fs, + sema, + Transfer( + f'{src_base}a', + dest_base.rstrip('/'), + treat_dest_as=Transfer.DEST_IS_TARGET, + ), ) async def test_file_and_directory_error( - router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme: str + router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme: str, copy_tool ): sema, fs, bases = router_filesystem @@ -379,18 +486,22 @@ async def test_file_and_directory_error( await create_test_file(fs, 'src', src_base, 'a/subfile') with pytest.raises(FileAndDirectoryError): - await Copier.copy(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) + await copy_tool(fs, sema, Transfer(f'{src_base}a', dest_base.rstrip('/'))) -async def test_copy_src_parts(copy_test_context): +async def test_copy_src_parts(copy_test_context, copy_tool): sema, fs, src_base, dest_base = copy_test_context await create_test_dir(fs, 'src', src_base, 'a/') - await Copier.copy( + await copy_tool( fs, sema, - Transfer([f'{src_base}a/file1', f'{src_base}a/subdir'], dest_base.rstrip('/'), treat_dest_as=Transfer.DEST_DIR), + Transfer( + [f'{src_base}a/file1', f'{src_base}a/subdir'], + dest_base.rstrip('/'), + treat_dest_as=Transfer.DEST_DIR, + ), ) await expect_file(fs, f'{dest_base}file1', 'src/a/file1') @@ -406,39 +517,72 @@ async def collect_files(it: AsyncIterator[FileListEntry]) -> List[str]: return [await x.url() async for x in it] -async def test_file_and_directory_error_with_slash_empty_file( - router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme: str +async def test_size_zero_directory_named_files_are_not_copied_but_do_not_impair_other_copies( + router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], + cloud_scheme: str, + copy_tool, ): sema, fs, bases = router_filesystem src_base = await fresh_dir(fs, bases, cloud_scheme) - await write_file(fs, f'{src_base}empty/', '') - await write_file(fs, f'{src_base}empty/foo', b'foo') + await write_file(fs, f'{src_base}folder-and-file/', '') + await write_file(fs, f'{src_base}folder-and-file/just-a-file', b'just-a-file') + await write_file(fs, f'{src_base}folder-and-file/nested-folder-and-file/', '') + await write_file(fs, f'{src_base}folder-and-file/nested-folder-and-file/nested-just-a-file', b'nested-just-a-file') + await write_file(fs, f'{src_base}folder-and-file/file-with-trailing-slash-but-not-a-folder/', '') await collect_files(await fs.listfiles(f'{src_base}')) await collect_files(await fs.listfiles(f'{src_base}', recursive=True)) - await collect_files(await fs.listfiles(f'{src_base}empty/')) - await collect_files(await fs.listfiles(f'{src_base}empty/', recursive=True)) + await collect_files(await fs.listfiles(f'{src_base}folder-and-file/')) + await collect_files(await fs.listfiles(f'{src_base}folder-and-file/', recursive=True)) - for transfer_type in (Transfer.DEST_IS_TARGET, Transfer.DEST_DIR, Transfer.INFER_DEST): - dest_base = await fresh_dir(fs, bases, cloud_scheme) + for transfer_type in ( + Transfer.DEST_IS_TARGET, + Transfer.DEST_DIR, + Transfer.INFER_DEST, + ): + destination = await fresh_dir(fs, bases, 
cloud_scheme) - await Copier.copy(fs, sema, Transfer(f'{src_base}empty/', dest_base.rstrip('/'), treat_dest_as=transfer_type)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}folder-and-file/', destination, treat_dest_as=transfer_type), + ) - await collect_files(await fs.listfiles(f'{dest_base}')) - await collect_files(await fs.listfiles(f'{dest_base}', recursive=True)) + if transfer_type == Transfer.DEST_DIR or (transfer_type == Transfer.INFER_DEST and destination[-1] == '/'): + destination = destination + 'folder-and-file/' - if transfer_type == Transfer.DEST_DIR: - exp_dest = f'{dest_base}empty/foo' - await expect_file(fs, exp_dest, 'foo') - assert not await fs.isfile(f'{dest_base}empty/') - assert await fs.isdir(f'{dest_base}empty/') - await collect_files(await fs.listfiles(f'{dest_base}empty/')) - await collect_files(await fs.listfiles(f'{dest_base}empty/', recursive=True)) - else: - exp_dest = f'{dest_base}foo' - await expect_file(fs, exp_dest, 'foo') + assert not await fs.isfile(destination), (transfer_type, destination) + assert await fs.isdir(destination), (transfer_type, destination) + + just_a_file_url = f'{destination}just-a-file' + assert await fs.read(just_a_file_url) == b'just-a-file', (transfer_type, destination) + + nested_folder_url = f'{destination}nested-folder-and-file/' + assert not await fs.exists(nested_folder_url), (transfer_type, destination) + + nested_just_a_file_url = f'{destination}nested-folder-and-file/nested-just-a-file' + assert await fs.read(nested_just_a_file_url) == b'nested-just-a-file', (transfer_type, destination) + + file_with_trailing_slash_url = f'{destination}file-with-trailing-slash-but-not-a-folder/' + assert not await fs.exists(file_with_trailing_slash_url), (transfer_type, destination) + + expected = [nested_folder_url, just_a_file_url] + actual = await collect_files(await fs.listfiles(destination)) + assert actual == expected, (transfer_type, destination) + + expected = [just_a_file_url, nested_just_a_file_url] + actual = await collect_files(await fs.listfiles(destination, recursive=True)) + assert actual == expected, (transfer_type, destination) + + expected = [nested_just_a_file_url] + actual = await collect_files(await fs.listfiles(nested_folder_url)) + assert actual == expected, (transfer_type, destination) + + expected = [nested_just_a_file_url] + actual = await collect_files(await fs.listfiles(nested_folder_url, recursive=True)) + assert actual == expected, (transfer_type, destination) async def test_file_and_directory_error_with_slash_non_empty_file_for_google_non_recursive( @@ -459,7 +603,9 @@ async def test_file_and_directory_error_with_slash_non_empty_file_for_google_non async def test_file_and_directory_error_with_slash_non_empty_file( - router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme: str + router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], + cloud_scheme: str, + copy_tool, ): sema, fs, bases = router_filesystem @@ -474,11 +620,21 @@ async def test_file_and_directory_error_with_slash_non_empty_file( with pytest.raises(FileAndDirectoryError): await collect_files(await fs.listfiles(f'{src_base}not-empty/', recursive=True)) - for transfer_type in (Transfer.DEST_IS_TARGET, Transfer.DEST_DIR, Transfer.INFER_DEST): + for transfer_type in ( + Transfer.DEST_IS_TARGET, + Transfer.DEST_DIR, + Transfer.INFER_DEST, + ): dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy( - fs, sema, Transfer(f'{src_base}not-empty/bar', dest_base.rstrip('/'), 
treat_dest_as=transfer_type) + await copy_tool( + fs, + sema, + Transfer( + f'{src_base}not-empty/bar', + dest_base.rstrip('/'), + treat_dest_as=transfer_type, + ), ) if transfer_type == Transfer.DEST_DIR: exp_dest = f'{dest_base}bar' @@ -492,16 +648,26 @@ async def test_file_and_directory_error_with_slash_non_empty_file( with pytest.raises(FileAndDirectoryError): dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy( - fs, sema, Transfer(f'{src_base}not-empty/', dest_base.rstrip('/'), treat_dest_as=transfer_type) + await copy_tool( + fs, + sema, + Transfer( + f'{src_base}not-empty/', + dest_base.rstrip('/'), + treat_dest_as=transfer_type, + ), ) with pytest.raises(FileAndDirectoryError): dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy(fs, sema, Transfer(f'{src_base}', dest_base.rstrip('/'), treat_dest_as=transfer_type)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}', dest_base.rstrip('/'), treat_dest_as=transfer_type), + ) -async def test_file_and_directory_error_with_slash_non_empty_file_only_for_google_non_recursive( +async def test_copying_a_folder_with_only_a_size_zero_directory_named_file_copies_nothing_and_does_not_error_in_google( router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], ): sema, fs, bases = router_filesystem @@ -513,10 +679,22 @@ async def test_file_and_directory_error_with_slash_non_empty_file_only_for_googl await collect_files(await fs.listfiles(f'{src_base}')) await collect_files(await fs.listfiles(f'{src_base}empty-only/')) - for transfer_type in (Transfer.DEST_IS_TARGET, Transfer.DEST_DIR, Transfer.INFER_DEST): + for transfer_type in ( + Transfer.DEST_IS_TARGET, + Transfer.DEST_DIR, + Transfer.INFER_DEST, + ): dest_base = await fresh_dir(fs, bases, 'gs') - await Copier.copy( - fs, sema, Transfer(f'{src_base}empty-only/', dest_base.rstrip('/'), treat_dest_as=transfer_type) + await Copier.copy( # hailctl fs sync errors when the source is an empty directory or + # non-extant file (excluding the size-zero, ending in trailing slash + # files). 
+ fs, + sema, + Transfer( + f'{src_base}empty-only/', + dest_base.rstrip('/'), + treat_dest_as=transfer_type, + ), ) # We ignore empty directories when copying @@ -524,8 +702,9 @@ async def test_file_and_directory_error_with_slash_non_empty_file_only_for_googl await collect_files(await fs.listfiles(f'{dest_base}empty-only/')) -async def test_file_and_directory_error_with_slash_empty_file_only( - router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme: str +async def test_copying_a_folder_with_only_a_size_zero_directory_named_file_copies_nothing_and_does_not_error( + router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], + cloud_scheme: str, ): sema, fs, bases = router_filesystem @@ -536,17 +715,35 @@ async def test_file_and_directory_error_with_slash_empty_file_only( await collect_files(await fs.listfiles(f'{src_base}', recursive=True)) await collect_files(await fs.listfiles(f'{src_base}empty-only/', recursive=True)) - for transfer_type in (Transfer.DEST_IS_TARGET, Transfer.DEST_DIR, Transfer.INFER_DEST): + for transfer_type in ( + Transfer.DEST_IS_TARGET, + Transfer.DEST_DIR, + Transfer.INFER_DEST, + ): dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy( - fs, sema, Transfer(f'{src_base}empty-only/', dest_base.rstrip('/'), treat_dest_as=transfer_type) + await Copier.copy( # hailctl fs sync errors when the source is an empty directory or + # non-extant file (excluding the size-zero, ending in trailing slash + # files). + fs, + sema, + Transfer( + f'{src_base}empty-only/', + dest_base.rstrip('/'), + treat_dest_as=transfer_type, + ), ) with pytest.raises(FileNotFoundError): await collect_files(await fs.listfiles(f'{dest_base}empty-only/', recursive=True)) dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy(fs, sema, Transfer(f'{src_base}', dest_base.rstrip('/'), treat_dest_as=transfer_type)) + await Copier.copy( # hailctl fs sync errors when the source is an empty directory or + # non-extant file (excluding the size-zero, ending in trailing slash + # files). 
+ fs, + sema, + Transfer(f'{src_base}', dest_base.rstrip('/'), treat_dest_as=transfer_type), + ) async def test_file_and_directory_error_with_slash_non_empty_file_only_google_non_recursive( @@ -566,7 +763,9 @@ async def test_file_and_directory_error_with_slash_non_empty_file_only_google_no async def test_file_and_directory_error_with_slash_non_empty_file_only( - router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme: str + router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], + cloud_scheme: str, + copy_tool, ): sema, fs, bases = router_filesystem @@ -580,15 +779,27 @@ async def test_file_and_directory_error_with_slash_non_empty_file_only( with pytest.raises(FileAndDirectoryError): await collect_files(await fs.listfiles(f'{src_base}not-empty-file-w-slash/', recursive=True)) - for transfer_type in (Transfer.DEST_IS_TARGET, Transfer.DEST_DIR, Transfer.INFER_DEST): + for transfer_type in ( + Transfer.DEST_IS_TARGET, + Transfer.DEST_DIR, + Transfer.INFER_DEST, + ): with pytest.raises(FileAndDirectoryError): dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy( + await copy_tool( fs, sema, - Transfer(f'{src_base}not-empty-file-w-slash/', dest_base.rstrip('/'), treat_dest_as=transfer_type), + Transfer( + f'{src_base}not-empty-file-w-slash/', + dest_base.rstrip('/'), + treat_dest_as=transfer_type, + ), ) with pytest.raises(FileAndDirectoryError): dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy(fs, sema, Transfer(f'{src_base}', dest_base.rstrip('/'), treat_dest_as=transfer_type)) + await copy_tool( + fs, + sema, + Transfer(f'{src_base}', dest_base.rstrip('/'), treat_dest_as=transfer_type), + ) diff --git a/hail/python/test/hailtop/inter_cloud/test_fs.py b/hail/python/test/hailtop/inter_cloud/test_fs.py index 00bf2f4cba0..087be75b3b6 100644 --- a/hail/python/test/hailtop/inter_cloud/test_fs.py +++ b/hail/python/test/hailtop/inter_cloud/test_fs.py @@ -535,6 +535,27 @@ async def test_basename_is_not_path(filesystem: Tuple[asyncio.Semaphore, AsyncFS assert (await fs.statfile(str(base.with_new_path_component('abc123')))).basename() == 'abc123' +async def test_basename_of_file_ending_in_slash_is_empty_in_cloud( + filesystem: Tuple[asyncio.Semaphore, AsyncFS, AsyncFSURL], +): + _, fs, base = filesystem + + if base.scheme in ('', 'file'): + return + + await fs.write(str(base.with_new_path_component('file-is-folder/')), b'') + + files = [x async for x in await fs.listfiles(str(base))] + assert len(files) == 1 + file = files[0] + assert file.basename() == 'file-is-folder' and file.is_dir() + + files = [x async for x in await fs.listfiles(str(await file.url()), exclude_trailing_slash_files=False)] + assert len(files) == 1 + file = files[0] + assert file.basename() == '' and file.is_file() + + async def test_listfiles(filesystem: Tuple[asyncio.Semaphore, AsyncFS, AsyncFSURL]): _, fs, base = filesystem diff --git a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py new file mode 100644 index 00000000000..e3bdd041aa6 --- /dev/null +++ b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py @@ -0,0 +1,128 @@ +from typing import Tuple, Dict, AsyncIterator, List +import pytest +import os.path +import tempfile +import secrets +import asyncio +import pytest +from hailtop.utils import url_scheme, check_exec_output +from hailtop.aiotools import Transfer, FileAndDirectoryError, Copier, AsyncFS, FileListEntry + + +from .generate_copy_test_specs import 
run_test_spec, create_test_file, create_test_dir +from .test_copy import cloud_scheme, router_filesystem, fresh_dir + + +@pytest.mark.asyncio +async def test_cli_file_and_dir(router_filesystem, cloud_scheme): + sema, fs, bases = router_filesystem + + test_dir = await fresh_dir(fs, bases, cloud_scheme) + plandir = await fresh_dir(fs, bases, cloud_scheme) + plandir2 = await fresh_dir(fs, bases, cloud_scheme) + + await fs.write(f"{test_dir}file1", b"hello world\n") + + await check_exec_output( + 'hailctl', + 'fs', + 'sync', + '--make-plan', + plandir, + '--copy-to', + f'{test_dir}file1', + f'{test_dir}file2', + '--copy-into', + f'{test_dir}file1', + f'{test_dir}dir1', + ) + + await check_exec_output( + 'hailctl', + 'fs', + 'sync', + '--use-plan', + plandir, + ) + + expected_files = [f"{test_dir}file1", f"{test_dir}file2", f"{test_dir}dir1/file1"] + for url in expected_files: + assert await fs.read(url) == b"hello world\n" + + await check_exec_output( + 'hailctl', + 'fs', + 'sync', + '--make-plan', + plandir2, + '--copy-to', + f'{test_dir}file1', + f'{test_dir}file2', + '--copy-into', + f'{test_dir}file1', + f'{test_dir}dir1', + ) + assert await fs.read(plandir2 + 'differs') == b'' + assert await fs.read(plandir2 + 'dstonly') == b'' + assert await fs.read(plandir2 + 'srconly') == b'' + + +@pytest.mark.asyncio +async def test_cli_subdir(router_filesystem, cloud_scheme): + sema, fs, bases = router_filesystem + + test_dir = await fresh_dir(fs, bases, cloud_scheme) + plandir = await fresh_dir(fs, bases, cloud_scheme) + plandir2 = await fresh_dir(fs, bases, cloud_scheme) + + await fs.makedirs(f"{test_dir}dir") + await fs.makedirs(f"{test_dir}dir/subdir") + await fs.write(f"{test_dir}dir/subdir/file1", b"hello world\n") + + await check_exec_output( + 'hailctl', 'fs', 'sync', '--make-plan', plandir, '--copy-to', f'{test_dir}dir', f'{test_dir}dir2' + ) + + await check_exec_output( + 'hailctl', + 'fs', + 'sync', + '--use-plan', + plandir, + ) + + assert await fs.read(f"{test_dir}dir2/subdir/file1") == b"hello world\n" + + await check_exec_output( + 'hailctl', 'fs', 'sync', '--make-plan', plandir2, '--copy-to', f'{test_dir}dir', f'{test_dir}dir2' + ) + + assert await fs.read(plandir2 + 'plan') == b'' + assert await fs.read(plandir2 + 'summary') == b'0\t0\n' + assert await fs.read(plandir2 + 'differs') == b'' + assert await fs.read(plandir2 + 'dstonly') == b'' + assert await fs.read(plandir2 + 'srconly') == b'' + assert await fs.read(plandir2 + 'matches') == f'{test_dir}dir/subdir/file1\t{test_dir}dir2/subdir/file1\n'.encode() + + +@pytest.mark.asyncio +async def test_cli_already_synced(router_filesystem, cloud_scheme): + sema, fs, bases = router_filesystem + + test_dir = await fresh_dir(fs, bases, cloud_scheme) + plandir = await fresh_dir(fs, bases, cloud_scheme) + + await fs.makedirs(f"{test_dir}dir") + await fs.write(f"{test_dir}dir/foo", b"hello world\n") + await fs.write(f"{test_dir}bar", b"hello world\n") + + await check_exec_output( + 'hailctl', 'fs', 'sync', '--make-plan', plandir, '--copy-to', f'{test_dir}dir/foo', f'{test_dir}bar' + ) + + assert await fs.read(plandir + 'plan') == b'' + assert await fs.read(plandir + 'summary') == b'0\t0\n' + assert await fs.read(plandir + 'differs') == b'' + assert await fs.read(plandir + 'dstonly') == b'' + assert await fs.read(plandir + 'srconly') == b'' + assert await fs.read(plandir + 'matches') == f'{test_dir}dir/foo\t{test_dir}bar\n'.encode() From 86b5a4b61500692789071f7ead487f083110c07d Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 5 Feb 2024 
12:45:52 -0500 Subject: [PATCH 02/62] more docs --- hail/python/hailtop/hailctl/fs/cli.py | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/hail/python/hailtop/hailctl/fs/cli.py b/hail/python/hailtop/hailctl/fs/cli.py index 1c02f7604f2..c30fa78ee07 100644 --- a/hail/python/hailtop/hailctl/fs/cli.py +++ b/hail/python/hailtop/hailctl/fs/cli.py @@ -78,6 +78,36 @@ def sync( First generate a plan with --make-plan, then use the plan with --use-plan. + ----- + + Plans + + ----- + + The command, `hailctl fs sync --make-plan` creates a "plan". A plan is a folder containing six + files: + + 1. matches: files whose names and sizes match. Two columns: source URL, destination URL. + + 2. differs: files or folders whose names match but either differ in size or differ in type. + Four columns: source URL, destination URL, source state, destination state. The states + are either: file, dif, or a size. If either state is a size, both states are sizes. + + 3. srconly: files only present in the source. One column: source URL. + + 4. dstonly: files only present in the destination. One column: destination URL. + + 5. plan: a proposed set of object-to-object copies. Two columns: source URL, destination URL. + + 6. summary: a one-line file containing the total number of copies in plan and the total number of bytes which would be copied. + + + -------- + + Examples + + -------- + Copy all the files under gs://gcs-bucket/a to s3://s3-bucket/b. For example, gs://gcs-bucket/a/b/c will appear at s3://s3-bucket/b/b/c: @@ -88,6 +118,46 @@ def sync( $ hailctl fs sync --use-plan plan1 + + + + + Copy all the files under gs://gcs-bucket/a into the (possibly not existing) s3://s3-bucket/b folder. + For example, gs://gcs-bucket/a/b/c will appear at s3://s3-bucket/b/a/b/c: + + + + $ hailctl fs sync --make-plan plan1 --copy-into gs://gcs-bucket/a s3://s3-bucket/b + + + + $ hailctl fs sync --use-plan plan1 + + + + + Sync to cloud locations and then regenerate a plan folder to verify that all copies completed + successfully: + + + + $ hailctl fs sync --make-plan plan1 --copy-to gs://gcs-bucket/a s3://s3-bucket/b + + + + $ hailctl fs sync --use-plan plan1 + + + + $ hailctl fs sync --make-plan plan1 --copy-to gs://gcs-bucket/a s3://s3-bucket/b + + + + $ cat plan1/summary + + + + $ less plan1/plan """ if (make_plan is None and use_plan is None) or (make_plan is not None and use_plan is not None): print('Must specify one of --make-plan or --use-plan. 
See hailctl fs sync --help.') From 8fbc0f218a363faf9e4e3636ce51492385025458 Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 5 Feb 2024 12:46:40 -0500 Subject: [PATCH 03/62] fix bad help string --- hail/python/hailtop/hailctl/fs/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/hailctl/fs/cli.py b/hail/python/hailtop/hailctl/fs/cli.py index c30fa78ee07..4bc5b14325b 100644 --- a/hail/python/hailtop/hailctl/fs/cli.py +++ b/hail/python/hailtop/hailctl/fs/cli.py @@ -43,7 +43,7 @@ def callback(): @click.option( '-v', '--verbose', - help='The Google project to which to charge egress costs.', + help='Enable extra logging information including a progress bar.', is_flag=True, required=False, default=False, From 861ca137933e80900be656ec1ea0c6c539b2a82a Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 8 Feb 2024 16:19:10 -0500 Subject: [PATCH 04/62] use recursive=True for rapid listing --- hail/python/hailtop/aiotools/plan.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py index 1a9bdd6a64e..54abbb569f0 100644 --- a/hail/python/hailtop/aiotools/plan.py +++ b/hail/python/hailtop/aiotools/plan.py @@ -114,7 +114,7 @@ async def from_file_status(x: FileStatus) -> 'FileStat': async def listfiles(fs: AsyncFS, x: str) -> List[FileStat]: try: - it = await fs.listfiles(x) + it = await fs.listfiles(x, recursive=True) return [await FileStat.from_file_list_entry(x) async for x in it] except (FileNotFoundError, NotADirectoryError): return [] @@ -136,7 +136,7 @@ def relativize_url(folder: str, file: str) -> str: if folder[-1] != '/': folder = folder + '/' relative_path = file.removeprefix(folder) - assert relative_path[0] != '/' + assert relative_path[0] != '/', (relative_path, folder, file) return relative_path @@ -268,21 +268,15 @@ def background_process_child_dir(new_srcurl: str, new_dsturl: str): while srcidx < len(srcfiles) and dstidx < len(dstfiles): srcf = srcfiles[srcidx] dstf = dstfiles[dstidx] - if srcf.basename == dstf.basename: - if srcf.is_dir and dstf.is_dir: - background_process_child_dir(srcf.url, dstf.url) - elif srcf.is_dir and not dstf.is_dir: - await writeline(differs, srcf.url, dstf.url, 'dir', 'file') - elif not srcf.is_dir and dstf.is_dir: - await writeline(differs, srcf.url, dstf.url, 'file', 'dir') - elif srcf.size == dstf.size: + if srcf.url == dstf.url: + if srcf.size == dstf.size: await writeline(matches, srcf.url, dstf.url) else: await writeline(differs, srcf.url, dstf.url, str(srcf.size), str(dstf.size)) dstidx += 1 srcidx += 1 progress.update(tid, advance=2) - elif srcf.basename < dstf.basename: + elif srcf.url < dstf.url: if srcf.is_dir: background_process_child_dir( srcf.url, @@ -290,13 +284,13 @@ def background_process_child_dir(new_srcurl: str, new_dsturl: str): ) else: await writeline(srconly, srcf.url) - await writeline(plan, srcf.url, os.path.join(dst, srcf.basename)) + await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) n_files += 1 n_bytes += srcf.size srcidx += 1 progress.update(tid, advance=1) else: - assert srcf.basename >= dstf.basename + assert srcf.url >= dstf.url dstidx += 1 progress.update(tid, advance=1) await writeline(dstonly, dstf.url) @@ -311,7 +305,7 @@ def background_process_child_dir(new_srcurl: str, new_dsturl: str): ) else: await writeline(srconly, srcf.url) - await writeline(plan, srcf.url, os.path.join(dst, srcf.basename)) + await writeline(plan, 
srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) n_files += 1 n_bytes += srcf.size srcidx += 1 From 7660033e81bfb4e0d18619aecdc70fe91e3dd7f3 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 8 Feb 2024 16:31:40 -0500 Subject: [PATCH 05/62] fix --- hail/python/hailtop/aiotools/plan.py | 44 ++++++++++++---------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py index 54abbb569f0..384f3867248 100644 --- a/hail/python/hailtop/aiotools/plan.py +++ b/hail/python/hailtop/aiotools/plan.py @@ -226,8 +226,8 @@ async def find_all_copy_pairs( await writeline(plan, srcstat.url, dst) return 1, srcstat.size - srcfiles.sort(key=lambda x: x.basename) - dstfiles.sort(key=lambda x: x.basename) + srcfiles.sort(key=lambda x: x.url) + dstfiles.sort(key=lambda x: x.url) tid = progress.add_task(description=src, total=len(srcfiles) + len(dstfiles)) @@ -268,7 +268,13 @@ def background_process_child_dir(new_srcurl: str, new_dsturl: str): while srcidx < len(srcfiles) and dstidx < len(dstfiles): srcf = srcfiles[srcidx] dstf = dstfiles[dstidx] - if srcf.url == dstf.url: + relsrcf = relativize_url(folder=src, file=srcf.url) + reldstf = relativize_url(folder=dst, file=dstf.url) + print(srcf, relsrcf) + print(dstf, reldstf) + print(relsrcf == reldstf) + print(relsrcf < reldstf) + if relsrcf == reldstf: if srcf.size == dstf.size: await writeline(matches, srcf.url, dstf.url) else: @@ -276,21 +282,15 @@ def background_process_child_dir(new_srcurl: str, new_dsturl: str): dstidx += 1 srcidx += 1 progress.update(tid, advance=2) - elif srcf.url < dstf.url: - if srcf.is_dir: - background_process_child_dir( - srcf.url, - os.path.join(dst, relativize_url(folder=src, file=srcf.url)), - ) - else: - await writeline(srconly, srcf.url) - await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) - n_files += 1 - n_bytes += srcf.size + elif relsrcf < reldstf: + await writeline(srconly, srcf.url) + await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) + n_files += 1 + n_bytes += srcf.size srcidx += 1 progress.update(tid, advance=1) else: - assert srcf.url >= dstf.url + assert relsrcf >= reldstf dstidx += 1 progress.update(tid, advance=1) await writeline(dstonly, dstf.url) @@ -298,16 +298,10 @@ def background_process_child_dir(new_srcurl: str, new_dsturl: str): while srcidx < len(srcfiles): srcf = srcfiles[srcidx] - if srcf.is_dir: - background_process_child_dir( - srcf.url, - os.path.join(dst, relativize_url(folder=src, file=srcf.url)), - ) - else: - await writeline(srconly, srcf.url) - await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) - n_files += 1 - n_bytes += srcf.size + await writeline(srconly, srcf.url) + await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) + n_files += 1 + n_bytes += srcf.size srcidx += 1 progress.update(tid, advance=1) From f956f0a7de248a18dcc6ea29c5de998677a572c9 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 8 Feb 2024 16:32:35 -0500 Subject: [PATCH 06/62] no prints --- hail/python/hailtop/aiotools/plan.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py index 384f3867248..ae650057c35 100644 --- a/hail/python/hailtop/aiotools/plan.py +++ b/hail/python/hailtop/aiotools/plan.py @@ -270,10 +270,6 @@ def background_process_child_dir(new_srcurl: str, 
new_dsturl: str): dstf = dstfiles[dstidx] relsrcf = relativize_url(folder=src, file=srcf.url) reldstf = relativize_url(folder=dst, file=dstf.url) - print(srcf, relsrcf) - print(dstf, reldstf) - print(relsrcf == reldstf) - print(relsrcf < reldstf) if relsrcf == reldstf: if srcf.size == dstf.size: await writeline(matches, srcf.url, dstf.url) From 5b0c8a9f09ef51dc36d9f39d88c11fb9af2d27ed Mon Sep 17 00:00:00 2001 From: Dan King Date: Fri, 9 Feb 2024 10:57:16 -0500 Subject: [PATCH 07/62] allow isdir without trailing slash --- hail/python/hailtop/aiocloud/aioazure/fs.py | 9 ++++----- .../hailtop/aiocloud/aiogoogle/client/storage_client.py | 4 +++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aioazure/fs.py b/hail/python/hailtop/aiocloud/aioazure/fs.py index e9a61a55407..540f206ff3b 100644 --- a/hail/python/hailtop/aiocloud/aioazure/fs.py +++ b/hail/python/hailtop/aiocloud/aioazure/fs.py @@ -559,12 +559,11 @@ async def isfile(self, url: str) -> bool: @handle_public_access_error async def isdir(self, url: str) -> bool: fs_url = self.parse_url(url, error_if_bucket=True) - assert not fs_url.path or fs_url.path.endswith('/'), fs_url.path client = self.get_container_client(fs_url) - path = fs_url.path - if path[-1] != '/': - path = path + '/' - async for _ in client.walk_blobs(name_starts_with=path, include=['metadata'], delimiter='/'): + prefix = fs_url.path + if prefix[-1] != '/': + prefix = prefix + '/' + async for _ in client.walk_blobs(name_starts_with=prefix, include=['metadata'], delimiter='/'): return True return False diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index 224506e0419..112927507e1 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -830,7 +830,9 @@ async def isfile(self, url: str) -> bool: async def isdir(self, url: str) -> bool: fsurl = self.parse_url(url, error_if_bucket=True) - assert not fsurl._path or fsurl.path.endswith('/'), fsurl._path + prefix = fsurl._path + if len(prefix) > 0 and prefix[-1] != '/': + prefix += '/' params = {'prefix': fsurl._path, 'delimiter': '/', 'includeTrailingDelimiter': 'true', 'maxResults': 1} async for page in await self._storage_client.list_objects(fsurl._bucket, params=params): prefixes = page.get('prefixes') From 46b1b34514291ff42e82085f219060df3dc191b6 Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 12 Feb 2024 18:08:13 -0500 Subject: [PATCH 08/62] simplify sync.py dramatically --- hail/python/hailtop/aiotools/copy.py | 2 +- hail/python/hailtop/aiotools/sync.py | 141 ++++++++++++++++++++++++--- 2 files changed, 127 insertions(+), 16 deletions(-) diff --git a/hail/python/hailtop/aiotools/copy.py b/hail/python/hailtop/aiotools/copy.py index bb0c3eabf8b..547a43d85f2 100644 --- a/hail/python/hailtop/aiotools/copy.py +++ b/hail/python/hailtop/aiotools/copy.py @@ -95,7 +95,7 @@ async def copy( local_kwargs=local_kwargs, gcs_kwargs=gcs_kwargs, azure_kwargs=azure_kwargs, s3_kwargs=s3_kwargs ) as fs: with CopyToolProgressBar(transient=True, disable=not verbose) as progress: - initial_simultaneous_transfers = 10 + initial_simultaneous_transfers = min(10, max_simultaneous_transfers) parallelism_tid = progress.add_task( description='parallelism', completed=initial_simultaneous_transfers, diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 4a509a12b16..a34be09651b 100644 
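Both isdir implementations above reduce to the same rule: an object-store URL behaves as a directory exactly when at least one object exists under its slash-terminated prefix. A minimal sketch of that rule, written against a hypothetical async `list_keys(prefix)` iterator rather than the real Azure or Google clients:

```
from typing import AsyncIterator, Callable

async def isdir(path: str, list_keys: Callable[[str], AsyncIterator[str]]) -> bool:
    # Object stores have no real directories: a "directory" exists exactly when
    # at least one key starts with the slash-terminated prefix.
    prefix = path if path.endswith('/') else path + '/'
    async for _ in list_keys(prefix):
        return True
    return False
```

Listing with `maxResults: 1` (Google) or returning on the first blob yielded by `walk_blobs` (Azure), as the diffs above do, keeps the check independent of how many objects live under the prefix.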
--- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -2,10 +2,15 @@ import asyncio import os import sys +import functools -from .fs.copier import Transfer +from .copy import GrowingSempahore +from .fs.fs import MultiPartCreate +from .fs.copier import Copier +from .fs.exceptions import UnexpectedEOFError from .router_fs import RouterAsyncFS, AsyncFS -from .copy import copy +from ..utils import retry_transient_errors, bounded_gather2 +from ..utils.rich_progress_bar import make_listener, CopyToolProgressBar try: import uvloop @@ -23,6 +28,103 @@ class SyncError(ValueError): pass +async def _copy_file_one_part( + fs: RouterAsyncFS, + srcfile: str, + size: int, + destfile: str, + files_listener, + bytes_listener, +) -> None: + assert not destfile.endswith('/') + + total_written = 0 + async with await fs.open(srcfile) as srcf: + try: + dest_cm = await fs.create(destfile, retry_writes=False) + except FileNotFoundError: + await fs.makedirs(os.path.dirname(destfile), exist_ok=True) + dest_cm = await fs.create(destfile) + + async with dest_cm as destf: + while True: + b = await srcf.read(Copier.BUFFER_SIZE) + if not b: + return + written = await destf.write(b) + assert written == len(b) + total_written += written + files_listener(-1) + bytes_listener(-total_written) + + +async def _copy_part( + fs: RouterAsyncFS, + part_size: int, + srcfile: str, + part_number: int, + this_part_size: int, + part_creator: MultiPartCreate, + bytes_listener, +) -> None: + total_written = 0 + async with await fs.open_from(srcfile, part_number * part_size, length=this_part_size) as srcf: + async with await part_creator.create_part( + part_number, part_number * part_size, size_hint=this_part_size + ) as destf: + n = this_part_size + while n > 0: + b = await srcf.read(min(Copier.BUFFER_SIZE, n)) + if len(b) == 0: + raise UnexpectedEOFError() + written = await destf.write(b) + assert written == len(b) + total_written += written + + n -= len(b) + bytes_listener(-total_written) + + +async def _copy_file( + fs: RouterAsyncFS, + sema: asyncio.Semaphore, + srcfile: str, + destfile: str, + files_listener, + bytes_listener, +): + srcstat = await fs.statfile(srcfile) + + size = await srcstat.size() + part_size = fs.copy_part_size(destfile) + + if size <= part_size: + return await retry_transient_errors( + _copy_file_one_part, fs, srcfile, size, destfile, files_listener, bytes_listener + ) + + n_parts, rem = divmod(size, part_size) + if rem: + n_parts += 1 + + try: + part_creator = await fs.multi_part_create(sema, destfile, n_parts) + except FileNotFoundError: + await fs.makedirs(os.path.dirname(destfile), exist_ok=True) + part_creator = await fs.multi_part_create(sema, destfile, n_parts) + + async with part_creator: + + async def f(i): + this_part_size = rem if i == n_parts - 1 and rem else part_size + await retry_transient_errors( + _copy_part, fs, part_size, srcfile, i, this_part_size, part_creator, bytes_listener + ) + + await bounded_gather2(sema, *[functools.partial(f, i) for i in range(n_parts)]) + files_listener(-1) + + async def sync( plan_folder: str, gcs_requester_pays_project: Optional[str], @@ -44,19 +146,28 @@ async def sync( raise SyncError('Run hailctl fs sync --make-plan first.', 1) results = (await fs.read(os.path.join(plan_folder, 'summary'))).decode('utf-8') n_files, n_bytes = (int(x) for x in results.split('\t')) - await copy( - max_simultaneous_transfers=max_parallelism, - local_kwargs=None, - gcs_kwargs=gcs_kwargs, - azure_kwargs={}, - s3_kwargs=s3_kwargs, - transfers=[ - 
Transfer(src, dst, treat_dest_as=Transfer.DEST_IS_TARGET) - async for src, dst in iterate_plan_file(plan_folder, fs) - ], - verbose=verbose, - totals=(n_files, n_bytes), - ) + with CopyToolProgressBar(transient=True, disable=not verbose) as progress: + files_tid = progress.add_task(description='files', total=n_files, visible=verbose) + bytes_tid = progress.add_task(description='bytes', total=n_bytes, visible=verbose) + + files_listener = make_listener(progress, files_tid) + bytes_listener = make_listener(progress, bytes_tid) + + initial_parallelism = min(10, max_parallelism) + parallelism_tid = progress.add_task( + description='parallelism', + completed=initial_parallelism, + total=max_parallelism, + visible=verbose, + ) + async with GrowingSempahore(initial_parallelism, max_parallelism, (progress, parallelism_tid)) as sema: + await bounded_gather2( + sema, + *[ + functools.partial(_copy_file, fs, sema, src, dst, files_listener, bytes_listener) + async for src, dst in iterate_plan_file(plan_folder, fs) + ], + ) async def iterate_plan_file(plan_folder: str, fs: AsyncFS): From 55c83e0c19fc0748ae3aadbfba4efc5e5f4400eb Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 12 Feb 2024 18:10:20 -0500 Subject: [PATCH 09/62] update listeners before return --- hail/python/hailtop/aiotools/sync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index a34be09651b..b2a0d56c8ca 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -50,12 +50,12 @@ async def _copy_file_one_part( while True: b = await srcf.read(Copier.BUFFER_SIZE) if not b: + files_listener(-1) + bytes_listener(-total_written) return written = await destf.write(b) assert written == len(b) total_written += written - files_listener(-1) - bytes_listener(-total_written) async def _copy_part( From eac543cc126febe10709cb726b6791266d11da17 Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 12 Feb 2024 18:14:19 -0500 Subject: [PATCH 10/62] use uvloop --- hail/python/hailtop/aiotools/plan.py | 5 +---- hail/python/hailtop/aiotools/sync.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py index ae650057c35..b1d84f00cbc 100644 --- a/hail/python/hailtop/aiotools/plan.py +++ b/hail/python/hailtop/aiotools/plan.py @@ -12,14 +12,11 @@ try: import uvloop - uvloop_install = uvloop.install + uvloop.install() except ImportError as e: if not sys.platform.startswith('win32'): raise e - def uvloop_install(): - pass - class PlanError(ValueError): pass diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index b2a0d56c8ca..ad40e467a70 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -15,14 +15,11 @@ try: import uvloop - uvloop_install = uvloop.install + uvloop.install() except ImportError as e: if not sys.platform.startswith('win32'): raise e - def uvloop_install(): - pass - class SyncError(ValueError): pass From 2bc9d1df1a02d74d96cd3fd266d151647985ed41 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 10:01:34 -0500 Subject: [PATCH 11/62] uvloopx --- hail/python/hailtop/aiotools/plan.py | 9 --------- hail/python/hailtop/aiotools/sync.py | 9 --------- hail/python/hailtop/hailctl/fs/cli.py | 4 +++- 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py 
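The uvloop handling shuffled through these patches, installed inline in PATCH 10 and moved behind `uvloopx.install()` here in PATCH 11, amounts to a guarded install: prefer uvloop whenever it imports, and tolerate its absence only on Windows, where uvloop is unsupported. A sketch of that pattern, not the actual `hailtop.uvloopx` module:

```
import sys

def install_uvloop() -> None:
    # uvloop does not support Windows; anywhere else a failed import is an error.
    try:
        import uvloop
    except ImportError:
        if not sys.platform.startswith('win32'):
            raise
        return
    uvloop.install()
```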
index b1d84f00cbc..14d655527d8 100644 --- a/hail/python/hailtop/aiotools/plan.py +++ b/hail/python/hailtop/aiotools/plan.py @@ -1,7 +1,6 @@ from typing import List, Tuple, Optional import asyncio import os -import sys from contextlib import AsyncExitStack from hailtop.aiotools import FileAndDirectoryError @@ -9,14 +8,6 @@ from .fs import FileListEntry, FileStatus, AsyncFS, WritableStream from ..utils.rich_progress_bar import CopyToolProgressBar, Progress -try: - import uvloop - - uvloop.install() -except ImportError as e: - if not sys.platform.startswith('win32'): - raise e - class PlanError(ValueError): pass diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index ad40e467a70..a30d5f268ee 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -1,7 +1,6 @@ from typing import Optional import asyncio import os -import sys import functools from .copy import GrowingSempahore @@ -12,14 +11,6 @@ from ..utils import retry_transient_errors, bounded_gather2 from ..utils.rich_progress_bar import make_listener, CopyToolProgressBar -try: - import uvloop - - uvloop.install() -except ImportError as e: - if not sys.platform.startswith('win32'): - raise e - class SyncError(ValueError): pass diff --git a/hail/python/hailtop/hailctl/fs/cli.py b/hail/python/hailtop/hailctl/fs/cli.py index 4bc5b14325b..e91f5a4d85b 100644 --- a/hail/python/hailtop/hailctl/fs/cli.py +++ b/hail/python/hailtop/hailctl/fs/cli.py @@ -4,10 +4,10 @@ import sys import typer +from hailtop.utils import uvloopx from hailtop.aiotools.plan import plan, PlanError from hailtop.aiotools.sync import sync as aiotools_sync, SyncError - app_without_click = typer.Typer( name='fs', no_args_is_help=True, @@ -159,6 +159,8 @@ def sync( $ less plan1/plan """ + uvloopx.install() + if (make_plan is None and use_plan is None) or (make_plan is not None and use_plan is not None): print('Must specify one of --make-plan or --use-plan. 
See hailctl fs sync --help.') raise typer.Exit(1) From 9588f91a525ae4ebb0d626f28d693a35dd7f4002 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 10:39:08 -0500 Subject: [PATCH 12/62] import fix --- hail/python/hailtop/hailctl/fs/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/hailctl/fs/cli.py b/hail/python/hailtop/hailctl/fs/cli.py index e91f5a4d85b..dba1db5acea 100644 --- a/hail/python/hailtop/hailctl/fs/cli.py +++ b/hail/python/hailtop/hailctl/fs/cli.py @@ -4,7 +4,7 @@ import sys import typer -from hailtop.utils import uvloopx +from hailtop import uvloopx from hailtop.aiotools.plan import plan, PlanError from hailtop.aiotools.sync import sync as aiotools_sync, SyncError From b905f3017abeadeb906077b0e4cdbeae958414c8 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 16:43:56 -0500 Subject: [PATCH 13/62] maybe get InsertObjectStream right --- .../aiocloud/aiogoogle/client/storage_client.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index 112927507e1..04f7cc63b54 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -87,13 +87,10 @@ def __init__(self, it: FeedableAsyncIterable[bytes], request_task: asyncio.Task[ self._exit_stack = AsyncExitStack() async def cleanup_request_task(): - if not self._request_task.cancelled(): - try: - async with await self._request_task as response: - self._value = await response.json() - except AttributeError as err: - raise ValueError(repr(self._request_task)) from err - await _cleanup_future(self._request_task) + if self._request_task.done() and not self._request_task.cancelled(): + async with await self._request_task as response: + self._value = await response.json() + _cleanup_future(self._request_task) self._exit_stack.push_async_callback(cleanup_request_task) From c4162cb6c255b7da34414fb2dc0d37614c7fb39f Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 16:45:10 -0500 Subject: [PATCH 14/62] await _cleanup_future too --- hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index 04f7cc63b54..b0d28d26bf3 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -90,7 +90,7 @@ async def cleanup_request_task(): if self._request_task.done() and not self._request_task.cancelled(): async with await self._request_task as response: self._value = await response.json() - _cleanup_future(self._request_task) + await _cleanup_future(self._request_task) self._exit_stack.push_async_callback(cleanup_request_task) From c8388b1f6fa177be52155ed7fc8b09031e69631b Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 16:48:31 -0500 Subject: [PATCH 15/62] use async with instead of async with await --- .../hailtop/aiocloud/aiogoogle/client/storage_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index b0d28d26bf3..d5b7825539e 100644 --- 
a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -88,7 +88,7 @@ def __init__(self, it: FeedableAsyncIterable[bytes], request_task: asyncio.Task[ async def cleanup_request_task(): if self._request_task.done() and not self._request_task.cancelled(): - async with await self._request_task as response: + async with self._request_task as response: self._value = await response.json() await _cleanup_future(self._request_task) @@ -171,7 +171,7 @@ async def _write_chunk_1(self): # https://cloud.google.com/storage/docs/performing-resumable-uploads#status-check # note: this retries - async with await self._session.put( + async with self._session.put( self._session_url, headers={'Content-Length': '0', 'Content-Range': f'bytes */{total_size_str}'}, raise_for_status=False, @@ -367,7 +367,7 @@ async def insert_object(self, bucket: str, name: str, **kwargs) -> WritableStrea assert upload_type == 'resumable' chunk_size = kwargs.get('bufsize', 8 * 1024 * 1024) - async with await self._session.post( + async with self._session.post( f'https://storage.googleapis.com/upload/storage/v1/b/{bucket}/o', **kwargs ) as resp: session_url = resp.headers['Location'] From 527488f3970df85a9eb91fd20cab309ba25adf08 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:07:21 -0500 Subject: [PATCH 16/62] smaller part size maybe helps? --- hail/python/hailtop/aiotools/fs/fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/fs/fs.py b/hail/python/hailtop/aiotools/fs/fs.py index 09267afb556..f911d355d9e 100644 --- a/hail/python/hailtop/aiotools/fs/fs.py +++ b/hail/python/hailtop/aiotools/fs/fs.py @@ -259,7 +259,7 @@ def schemes() -> Set[str]: def copy_part_size(url: str) -> int: # pylint: disable=unused-argument """Part size when copying using multi-part uploads. 
The part size of the destination filesystem is used.""" - return 128 * 1024 * 1024 + return 16 * 1024 * 1024 @staticmethod @abc.abstractmethod From c8891ecd3178bf2a429852d941009e9baa13a5e0 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:12:26 -0500 Subject: [PATCH 17/62] prints --- hail/python/hailtop/aiotools/sync.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index a30d5f268ee..d3a29ae8018 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -44,6 +44,7 @@ async def _copy_file_one_part( written = await destf.write(b) assert written == len(b) total_written += written + print(f'{srcfile}, {written}') async def _copy_part( @@ -68,7 +69,7 @@ async def _copy_part( written = await destf.write(b) assert written == len(b) total_written += written - + print(f'{srcfile}, part {part_number}, {written}') n -= len(b) bytes_listener(-total_written) From 0e6ae95fb6dc99fa1c4e47be27127e679c7f85b7 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:25:13 -0500 Subject: [PATCH 18/62] use order of magnitude less file parallelism than partition parallelism --- hail/python/hailtop/aiotools/sync.py | 31 +++++++++++++++++++++------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index d3a29ae8018..d8a99d2329d 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -76,7 +76,7 @@ async def _copy_part( async def _copy_file( fs: RouterAsyncFS, - sema: asyncio.Semaphore, + transfer_sema: asyncio.Semaphore, srcfile: str, destfile: str, files_listener, @@ -97,10 +97,10 @@ async def _copy_file( n_parts += 1 try: - part_creator = await fs.multi_part_create(sema, destfile, n_parts) + part_creator = await fs.multi_part_create(transfer_sema, destfile, n_parts) except FileNotFoundError: await fs.makedirs(os.path.dirname(destfile), exist_ok=True) - part_creator = await fs.multi_part_create(sema, destfile, n_parts) + part_creator = await fs.multi_part_create(transfer_sema, destfile, n_parts) async with part_creator: @@ -110,7 +110,7 @@ async def f(i): _copy_part, fs, part_size, srcfile, i, this_part_size, part_creator, bytes_listener ) - await bounded_gather2(sema, *[functools.partial(f, i) for i in range(n_parts)]) + await bounded_gather2(transfer_sema, *[functools.partial(f, i) for i in range(n_parts)]) files_listener(-1) @@ -142,18 +142,33 @@ async def sync( files_listener = make_listener(progress, files_tid) bytes_listener = make_listener(progress, bytes_tid) + max_file_parallelism = max(1, max_parallelism // 10) + initial_parallelism = min(10, max_parallelism) + initial_file_parallelism = min(10, max_file_parallelism) + parallelism_tid = progress.add_task( - description='parallelism', + description='transfer parallelism', completed=initial_parallelism, total=max_parallelism, visible=verbose, ) - async with GrowingSempahore(initial_parallelism, max_parallelism, (progress, parallelism_tid)) as sema: + file_parallelism_tid = progress.add_task( + description='file parallelism', + completed=initial_parallelism, + total=max_parallelism, + visible=verbose, + ) + + async with GrowingSempahore( + initial_parallelism, max_parallelism, (progress, parallelism_tid) + ) as file_sema, GrowingSempahore( + initial_file_parallelism, max_file_parallelism, (progress, file_parallelism_tid) + ) as transfer_sema: await bounded_gather2( - 
sema, + file_sema, *[ - functools.partial(_copy_file, fs, sema, src, dst, files_listener, bytes_listener) + functools.partial(_copy_file, fs, transfer_sema, src, dst, files_listener, bytes_listener) async for src, dst in iterate_plan_file(plan_folder, fs) ], ) From 04448edd087e3508d56e1d8349f40185c281fbdb Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:27:37 -0500 Subject: [PATCH 19/62] files are 1 --- hail/python/hailtop/aiotools/sync.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index d8a99d2329d..8dc5839ffee 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -160,9 +160,7 @@ async def sync( visible=verbose, ) - async with GrowingSempahore( - initial_parallelism, max_parallelism, (progress, parallelism_tid) - ) as file_sema, GrowingSempahore( + async with GrowingSempahore(1, 1, (progress, parallelism_tid)) as file_sema, GrowingSempahore( initial_file_parallelism, max_file_parallelism, (progress, file_parallelism_tid) ) as transfer_sema: await bounded_gather2( From 22ef264b0e26f45c8b8bc8449228c63f005eb8bd Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:31:36 -0500 Subject: [PATCH 20/62] prints --- hail/python/hailtop/aiotools/sync.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 8dc5839ffee..0c1fdf31002 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -71,6 +71,9 @@ async def _copy_part( total_written += written print(f'{srcfile}, part {part_number}, {written}') n -= len(b) + print(f'{srcfile}, part {part_number}, complete') + print(f'{srcfile}, part {part_number}, complete 2') + print(f'{srcfile}, part {part_number}, complete 3') bytes_listener(-total_written) From 9a436271daedbcf41cee7bc7d2a2b3f28b4019b9 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:36:20 -0500 Subject: [PATCH 21/62] debugging --- .../hailtop/aiocloud/aiogoogle/client/storage_client.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index d5b7825539e..aab7a387a20 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -79,12 +79,13 @@ async def _cleanup_future(fut: asyncio.Future): class InsertObjectStream(WritableStream): - def __init__(self, it: FeedableAsyncIterable[bytes], request_task: asyncio.Task[aiohttp.ClientResponse]): + def __init__(self, it: FeedableAsyncIterable[bytes], request_task: asyncio.Task[aiohttp.ClientResponse], url: str): super().__init__() self._it = it self._request_task = request_task self._value = None self._exit_stack = AsyncExitStack() + self.url = url async def cleanup_request_task(): if self._request_task.done() and not self._request_task.cancelled(): @@ -108,9 +109,13 @@ async def write(self, b): async def _wait_closed(self): try: + print('_wait_closed', self.url, self._request_task) fut = asyncio.ensure_future(self._it.stop()) + print('_wait_closed ensure_future', self.url, fut, self._request_task) self._exit_stack.push_async_callback(_cleanup_future, fut) + print('_wait_closed async_callback', self.url, fut, self._request_task) await asyncio.wait([fut, self._request_task], 
return_when=asyncio.FIRST_COMPLETED) + print('_wait_closed async_callback', self.url, fut, self._request_task) finally: await self._exit_stack.aclose() @@ -360,7 +365,7 @@ async def insert_object(self, bucket: str, name: str, **kwargs) -> WritableStrea f'https://storage.googleapis.com/upload/storage/v1/b/{bucket}/o', retry=False, **kwargs ) ) - return InsertObjectStream(it, request_task) + return InsertObjectStream(it, request_task, 'gs://' + bucket + '/' + name) # Write using resumable uploads. See: # https://cloud.google.com/storage/docs/performing-resumable-uploads From f5a7af255de10ceb5962c965ba7a48d3037c8df7 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:41:18 -0500 Subject: [PATCH 22/62] debugging --- .../hailtop/aiocloud/aiogoogle/client/storage_client.py | 4 ++-- hail/python/hailtop/aiotools/utils.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index aab7a387a20..f03c075943b 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -220,7 +220,7 @@ async def _write_chunk_1(self): # Upload a single chunk. See: # https://cloud.google.com/storage/docs/performing-resumable-uploads#chunked-upload - it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable() + it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable(self._session_url) async with _TaskManager( self._session.put( self._session_url, @@ -358,7 +358,7 @@ async def insert_object(self, bucket: str, name: str, **kwargs) -> WritableStrea params['uploadType'] = upload_type if upload_type == 'media': - it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable() + it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable('gs://' + bucket + '/' + name) kwargs['data'] = aiohttp.AsyncIterablePayload(it) request_task = asyncio.create_task( self._session.post( diff --git a/hail/python/hailtop/aiotools/utils.py b/hail/python/hailtop/aiotools/utils.py index 0a7ae6d96cf..09f7655fb45 100644 --- a/hail/python/hailtop/aiotools/utils.py +++ b/hail/python/hailtop/aiotools/utils.py @@ -12,9 +12,10 @@ class _StopFeedableAsyncIterator: class FeedableAsyncIterable(AsyncIterator[_T]): - def __init__(self, maxsize: int = 1): + def __init__(self, url: str, maxsize: int = 1): self._queue: asyncio.Queue = asyncio.Queue(maxsize=maxsize) self._stopped = False + self.url = url def __aiter__(self) -> 'FeedableAsyncIterable[_T]': return self @@ -22,7 +23,9 @@ def __aiter__(self) -> 'FeedableAsyncIterable[_T]': async def __anext__(self) -> _T: next = await self._queue.get() if isinstance(next, _StopFeedableAsyncIterator): + print('FeedableAsyncIterable.__anext__', self.url, "STOP") raise StopAsyncIteration + print('FeedableAsyncIterable.__anext__', self.url, len(next)) return next async def feed(self, next: _T) -> None: From e6fffd75d31644347db10830c646e2f70e6e5413 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:47:41 -0500 Subject: [PATCH 23/62] fix --- hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index f03c075943b..0f0714e18b0 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ 
b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -88,7 +88,7 @@ def __init__(self, it: FeedableAsyncIterable[bytes], request_task: asyncio.Task[ self.url = url async def cleanup_request_task(): - if self._request_task.done() and not self._request_task.cancelled(): + if not self._request_task.cancelled(): async with self._request_task as response: self._value = await response.json() await _cleanup_future(self._request_task) From 470884d1b6d1d81cdc2e20b815ab77e6d85d561d Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:48:58 -0500 Subject: [PATCH 24/62] revert prints --- hail/python/hailtop/aiotools/sync.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 0c1fdf31002..d7c235ee47b 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -44,7 +44,6 @@ async def _copy_file_one_part( written = await destf.write(b) assert written == len(b) total_written += written - print(f'{srcfile}, {written}') async def _copy_part( @@ -69,7 +68,6 @@ async def _copy_part( written = await destf.write(b) assert written == len(b) total_written += written - print(f'{srcfile}, part {part_number}, {written}') n -= len(b) print(f'{srcfile}, part {part_number}, complete') print(f'{srcfile}, part {part_number}, complete 2') From 01e0759597c3cfecae6fc89a6c78873cb316cbfb Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:49:48 -0500 Subject: [PATCH 25/62] fewre prints --- hail/python/hailtop/aiotools/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/utils.py b/hail/python/hailtop/aiotools/utils.py index 09f7655fb45..68f06630637 100644 --- a/hail/python/hailtop/aiotools/utils.py +++ b/hail/python/hailtop/aiotools/utils.py @@ -25,7 +25,6 @@ async def __anext__(self) -> _T: if isinstance(next, _StopFeedableAsyncIterator): print('FeedableAsyncIterable.__anext__', self.url, "STOP") raise StopAsyncIteration - print('FeedableAsyncIterable.__anext__', self.url, len(next)) return next async def feed(self, next: _T) -> None: From 52fd7b65e0ed8c5133ceccbb23e5f19e8ada91eb Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:57:36 -0500 Subject: [PATCH 26/62] debug --- hail/python/hailtop/aiotools/sync.py | 40 ++++++++++++++++------------ 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index d7c235ee47b..43ebf97fb48 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -56,23 +56,29 @@ async def _copy_part( bytes_listener, ) -> None: total_written = 0 - async with await fs.open_from(srcfile, part_number * part_size, length=this_part_size) as srcf: - async with await part_creator.create_part( - part_number, part_number * part_size, size_hint=this_part_size - ) as destf: - n = this_part_size - while n > 0: - b = await srcf.read(min(Copier.BUFFER_SIZE, n)) - if len(b) == 0: - raise UnexpectedEOFError() - written = await destf.write(b) - assert written == len(b) - total_written += written - n -= len(b) - print(f'{srcfile}, part {part_number}, complete') - print(f'{srcfile}, part {part_number}, complete 2') - print(f'{srcfile}, part {part_number}, complete 3') - bytes_listener(-total_written) + try: + async with await fs.open_from(srcfile, part_number * part_size, length=this_part_size) as srcf: + async with await part_creator.create_part( + part_number, part_number * part_size, 
size_hint=this_part_size + ) as destf: + n = this_part_size + while n > 0: + b = await srcf.read(min(Copier.BUFFER_SIZE, n)) + if len(b) == 0: + raise UnexpectedEOFError() + written = await destf.write(b) + assert written == len(b) + total_written += written + n -= len(b) + print(f'{srcfile}, part {part_number}, complete') + print(f'{srcfile}, part {part_number}, complete 2') + print(f'{srcfile}, part {part_number}, complete 3') + bytes_listener(-total_written) + except Exception as exc: + import traceback + + traceback.format_exc() + print('error in copy part', exc) async def _copy_file( From e5ce4d6cd00a077bf2ffcf8f6661d8217622e4ca Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 17:58:25 -0500 Subject: [PATCH 27/62] wtf --- hail/python/hailtop/aiotools/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 43ebf97fb48..f64cf99b112 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -78,7 +78,7 @@ async def _copy_part( import traceback traceback.format_exc() - print('error in copy part', exc) + print('error in copy part', repr(exc)) async def _copy_file( From 279dcfac3a8fedad7c3a9fa7ee9e1059b35f9264 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:01:06 -0500 Subject: [PATCH 28/62] debug --- .../python/hailtop/aiocloud/aiogoogle/client/storage_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index 0f0714e18b0..113e52ee3dd 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -545,8 +545,11 @@ async def _compose(self, names: List[str], dest_name: str): async def __aexit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType] ) -> None: + print('aexit 1', repr(exc_val)) async with OnlineBoundedGather2(self._sema) as pool: + print('aexit 2', repr(exc_val)) try: + print('aexit 3', repr(exc_val)) if exc_val is not None: return From d1ef63cac74f398d6ddd8b77c6c24f3c70a6c2ca Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:03:09 -0500 Subject: [PATCH 29/62] debug --- hail/python/hailtop/aiotools/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index f64cf99b112..d4756644102 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -77,7 +77,7 @@ async def _copy_part( except Exception as exc: import traceback - traceback.format_exc() + traceback.print_exc() print('error in copy part', repr(exc)) From 41d9a9a3bf3517d16955f2a441ede7ad4e70a380 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:05:48 -0500 Subject: [PATCH 30/62] debug --- hail/python/hailtop/aiotools/sync.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index d4756644102..a0d2f5e6d5c 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -117,7 +117,10 @@ async def f(i): _copy_part, fs, part_size, srcfile, i, this_part_size, part_creator, bytes_listener ) - await bounded_gather2(transfer_sema, *[functools.partial(f, i) for i in range(n_parts)]) + try: + await 
bounded_gather2(transfer_sema, *[functools.partial(f, i) for i in range(n_parts)]) + except Exception as exc: + print('_copy_file saw error', repr(exc)) files_listener(-1) From 927ad35f95b0fa16291133598fd4fb949eeb7f58 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:06:25 -0500 Subject: [PATCH 31/62] debug --- hail/python/hailtop/aiotools/sync.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index a0d2f5e6d5c..9a5283179fa 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -79,6 +79,7 @@ async def _copy_part( traceback.print_exc() print('error in copy part', repr(exc)) + raise exc async def _copy_file( @@ -121,6 +122,7 @@ async def f(i): await bounded_gather2(transfer_sema, *[functools.partial(f, i) for i in range(n_parts)]) except Exception as exc: print('_copy_file saw error', repr(exc)) + raise exc files_listener(-1) From 8e0d44cb453a86db0a19120004695b131ddc8a1c Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:09:22 -0500 Subject: [PATCH 32/62] debug --- hail/python/hailtop/aiotools/sync.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 9a5283179fa..1876f9e8aa0 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -114,9 +114,19 @@ async def _copy_file( async def f(i): this_part_size = rem if i == n_parts - 1 and rem else part_size - await retry_transient_errors( - _copy_part, fs, part_size, srcfile, i, this_part_size, part_creator, bytes_listener - ) + + async def wee(): + try: + await _copy_part(fs, part_size, srcfile, i, this_part_size, part_creator, bytes_listener) + except Exception as exc: + print('wee saw error', repr(exc)) + raise exc + + try: + await retry_transient_errors(wee) + except Exception as exc: + print('f saw error', repr(exc)) + raise exc try: await bounded_gather2(transfer_sema, *[functools.partial(f, i) for i in range(n_parts)]) From cab8b8f07dcdfa4434bc0c1c392166ff63f17ba1 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:11:22 -0500 Subject: [PATCH 33/62] debug --- hail/python/hailtop/utils/utils.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/hail/python/hailtop/utils/utils.py b/hail/python/hailtop/utils/utils.py index 315e08e37b3..b4fe56341f8 100644 --- a/hail/python/hailtop/utils/utils.py +++ b/hail/python/hailtop/utils/utils.py @@ -485,14 +485,30 @@ async def bounded_gather2_raise_exceptions( """ async def run_with_sema(pf: Callable[[], Awaitable[T]]): - async with sema: - return await pf() + try: + async with sema: + try: + return await pf() + except Exception as exc: + print('run with sema inner saw error', repr(exc)) + raise exc + except Exception as exc: + print('run with sema outer saw error', repr(exc)) + raise exc tasks = [asyncio.create_task(run_with_sema(pf)) for pf in pfs] if not cancel_on_error: - async with WithoutSemaphore(sema): - return await asyncio.gather(*tasks) + try: + async with WithoutSemaphore(sema): + try: + return await asyncio.gather(*tasks) + except Exception as exc: + print('bounded gather inner saw error', repr(exc)) + raise exc + except Exception as exc: + print('bounded gather outer saw error', repr(exc)) + raise exc try: async with WithoutSemaphore(sema): From 4a848b428363760030c99452713d3d5c77b184c1 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 
13 Feb 2024 18:12:52 -0500 Subject: [PATCH 34/62] debug --- hail/python/hailtop/aiotools/sync.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 1876f9e8aa0..efd66927251 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -129,7 +129,11 @@ async def wee(): raise exc try: - await bounded_gather2(transfer_sema, *[functools.partial(f, i) for i in range(n_parts)]) + await bounded_gather2( + transfer_sema, + *[functools.partial(f, i) for i in range(n_parts)], + cancel_on_error=True, + ) except Exception as exc: print('_copy_file saw error', repr(exc)) raise exc @@ -191,6 +195,7 @@ async def sync( functools.partial(_copy_file, fs, transfer_sema, src, dst, files_listener, bytes_listener) async for src, dst in iterate_plan_file(plan_folder, fs) ], + cancel_on_error=True, ) From f89a9e5ce5e65396ad05fba913a13eabf7f7e077 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:14:51 -0500 Subject: [PATCH 35/62] debug --- hail/python/hailtop/utils/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hail/python/hailtop/utils/utils.py b/hail/python/hailtop/utils/utils.py index b4fe56341f8..74bd8132311 100644 --- a/hail/python/hailtop/utils/utils.py +++ b/hail/python/hailtop/utils/utils.py @@ -534,6 +534,8 @@ async def bounded_gather2( return_exceptions: bool = False, cancel_on_error: bool = False, ) -> List[T]: + if not cancel_on_error: + raise ValueError('nope') if return_exceptions: if cancel_on_error: raise ValueError('cannot request return_exceptions and cancel_on_error') From 188fff76bdf6b5f8f2733bd1eff08f6c9cfaaf8c Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:15:53 -0500 Subject: [PATCH 36/62] debug --- hail/python/hailtop/utils/utils.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/hail/python/hailtop/utils/utils.py b/hail/python/hailtop/utils/utils.py index 74bd8132311..bc9c9d0e554 100644 --- a/hail/python/hailtop/utils/utils.py +++ b/hail/python/hailtop/utils/utils.py @@ -511,8 +511,16 @@ async def run_with_sema(pf: Callable[[], Awaitable[T]]): raise exc try: - async with WithoutSemaphore(sema): - return await asyncio.gather(*tasks) + try: + async with WithoutSemaphore(sema): + try: + return await asyncio.gather(*tasks) + except Exception as exc: + print('bounded gather inner saw error', repr(exc)) + raise exc + except Exception as exc: + print('bounded gather outer saw error', repr(exc)) + raise exc finally: _, exc, _ = sys.exc_info() if exc is not None: @@ -534,8 +542,6 @@ async def bounded_gather2( return_exceptions: bool = False, cancel_on_error: bool = False, ) -> List[T]: - if not cancel_on_error: - raise ValueError('nope') if return_exceptions: if cancel_on_error: raise ValueError('cannot request return_exceptions and cancel_on_error') From 9ef4b3258fffc35a98727d2ccbb03b7e3a101642 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:17:17 -0500 Subject: [PATCH 37/62] debug --- hail/python/hailtop/utils/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hail/python/hailtop/utils/utils.py b/hail/python/hailtop/utils/utils.py index bc9c9d0e554..7a34f74f4bf 100644 --- a/hail/python/hailtop/utils/utils.py +++ b/hail/python/hailtop/utils/utils.py @@ -499,6 +499,7 @@ async def run_with_sema(pf: Callable[[], Awaitable[T]]): tasks = [asyncio.create_task(run_with_sema(pf)) for pf in pfs] if not cancel_on_error: + raise ValueError('nope') try: async with 
WithoutSemaphore(sema): try: From 70866483b49300f4d3407132244ea0a727a60805 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:21:13 -0500 Subject: [PATCH 38/62] debug --- .../aiogoogle/client/storage_client.py | 7 -- hail/python/hailtop/aiotools/sync.py | 68 ++++++------------- hail/python/hailtop/utils/utils.py | 37 ++-------- 3 files changed, 28 insertions(+), 84 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index 113e52ee3dd..fa45d1ecdab 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -109,13 +109,9 @@ async def write(self, b): async def _wait_closed(self): try: - print('_wait_closed', self.url, self._request_task) fut = asyncio.ensure_future(self._it.stop()) - print('_wait_closed ensure_future', self.url, fut, self._request_task) self._exit_stack.push_async_callback(_cleanup_future, fut) - print('_wait_closed async_callback', self.url, fut, self._request_task) await asyncio.wait([fut, self._request_task], return_when=asyncio.FIRST_COMPLETED) - print('_wait_closed async_callback', self.url, fut, self._request_task) finally: await self._exit_stack.aclose() @@ -545,11 +541,8 @@ async def _compose(self, names: List[str], dest_name: str): async def __aexit__( self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType] ) -> None: - print('aexit 1', repr(exc_val)) async with OnlineBoundedGather2(self._sema) as pool: - print('aexit 2', repr(exc_val)) try: - print('aexit 3', repr(exc_val)) if exc_val is not None: return diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index efd66927251..b216121ec29 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -56,30 +56,20 @@ async def _copy_part( bytes_listener, ) -> None: total_written = 0 - try: - async with await fs.open_from(srcfile, part_number * part_size, length=this_part_size) as srcf: - async with await part_creator.create_part( - part_number, part_number * part_size, size_hint=this_part_size - ) as destf: - n = this_part_size - while n > 0: - b = await srcf.read(min(Copier.BUFFER_SIZE, n)) - if len(b) == 0: - raise UnexpectedEOFError() - written = await destf.write(b) - assert written == len(b) - total_written += written - n -= len(b) - print(f'{srcfile}, part {part_number}, complete') - print(f'{srcfile}, part {part_number}, complete 2') - print(f'{srcfile}, part {part_number}, complete 3') - bytes_listener(-total_written) - except Exception as exc: - import traceback - - traceback.print_exc() - print('error in copy part', repr(exc)) - raise exc + async with await fs.open_from(srcfile, part_number * part_size, length=this_part_size) as srcf: + async with await part_creator.create_part( + part_number, part_number * part_size, size_hint=this_part_size + ) as destf: + n = this_part_size + while n > 0: + b = await srcf.read(min(Copier.BUFFER_SIZE, n)) + if len(b) == 0: + raise UnexpectedEOFError() + written = await destf.write(b) + assert written == len(b) + total_written += written + n -= len(b) + bytes_listener(-total_written) async def _copy_file( @@ -114,29 +104,15 @@ async def _copy_file( async def f(i): this_part_size = rem if i == n_parts - 1 and rem else part_size - - async def wee(): - try: - await _copy_part(fs, part_size, srcfile, i, this_part_size, part_creator, bytes_listener) 
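The `this_part_size = rem if i == n_parts - 1 and rem else part_size` expression restored here is easiest to read in isolation: every part is `part_size` bytes except possibly the last, which carries the remainder. A standalone illustration of that arithmetic (not code from the patch):

```
from typing import List

def part_sizes(size: int, part_size: int) -> List[int]:
    # Fixed-size parts; only the final part may be short.
    n_parts, rem = divmod(size, part_size)
    if rem:
        n_parts += 1
    return [rem if (i == n_parts - 1 and rem) else part_size for i in range(n_parts)]

assert part_sizes(10, 4) == [4, 4, 2]
assert part_sizes(8, 4) == [4, 4]
assert sum(part_sizes(100, 16)) == 100
```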
- except Exception as exc: - print('wee saw error', repr(exc)) - raise exc - - try: - await retry_transient_errors(wee) - except Exception as exc: - print('f saw error', repr(exc)) - raise exc - - try: - await bounded_gather2( - transfer_sema, - *[functools.partial(f, i) for i in range(n_parts)], - cancel_on_error=True, + await retry_transient_errors( + _copy_part, fs, part_size, srcfile, i, this_part_size, part_creator, bytes_listener ) - except Exception as exc: - print('_copy_file saw error', repr(exc)) - raise exc + + await bounded_gather2( + transfer_sema, + *[functools.partial(f, i) for i in range(n_parts)], + cancel_on_error=True, + ) files_listener(-1) diff --git a/hail/python/hailtop/utils/utils.py b/hail/python/hailtop/utils/utils.py index 7a34f74f4bf..315e08e37b3 100644 --- a/hail/python/hailtop/utils/utils.py +++ b/hail/python/hailtop/utils/utils.py @@ -485,43 +485,18 @@ async def bounded_gather2_raise_exceptions( """ async def run_with_sema(pf: Callable[[], Awaitable[T]]): - try: - async with sema: - try: - return await pf() - except Exception as exc: - print('run with sema inner saw error', repr(exc)) - raise exc - except Exception as exc: - print('run with sema outer saw error', repr(exc)) - raise exc + async with sema: + return await pf() tasks = [asyncio.create_task(run_with_sema(pf)) for pf in pfs] if not cancel_on_error: - raise ValueError('nope') - try: - async with WithoutSemaphore(sema): - try: - return await asyncio.gather(*tasks) - except Exception as exc: - print('bounded gather inner saw error', repr(exc)) - raise exc - except Exception as exc: - print('bounded gather outer saw error', repr(exc)) - raise exc + async with WithoutSemaphore(sema): + return await asyncio.gather(*tasks) try: - try: - async with WithoutSemaphore(sema): - try: - return await asyncio.gather(*tasks) - except Exception as exc: - print('bounded gather inner saw error', repr(exc)) - raise exc - except Exception as exc: - print('bounded gather outer saw error', repr(exc)) - raise exc + async with WithoutSemaphore(sema): + return await asyncio.gather(*tasks) finally: _, exc, _ = sys.exc_info() if exc is not None: From 2c9d79d5e365eb34c1713150e8c3e4043f330072 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:26:00 -0500 Subject: [PATCH 39/62] debug --- .../hailtop/aiocloud/aiogoogle/client/storage_client.py | 7 +++++-- hail/python/hailtop/aiotools/utils.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index fa45d1ecdab..a19700515e3 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -89,8 +89,11 @@ def __init__(self, it: FeedableAsyncIterable[bytes], request_task: asyncio.Task[ async def cleanup_request_task(): if not self._request_task.cancelled(): - async with self._request_task as response: - self._value = await response.json() + try: + async with self._request_task as response: + self._value = await response.json() + except AttributeError as err: + raise ValueError(repr(self._request_task)) from err await _cleanup_future(self._request_task) self._exit_stack.push_async_callback(cleanup_request_task) diff --git a/hail/python/hailtop/aiotools/utils.py b/hail/python/hailtop/aiotools/utils.py index 68f06630637..b28c488d39a 100644 --- a/hail/python/hailtop/aiotools/utils.py +++ b/hail/python/hailtop/aiotools/utils.py @@ -23,7 +23,6 @@ 
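The same patch restores `bounded_gather2_raise_exceptions` to its original shape: wrap each coroutine factory in a semaphore-guarded task and gather the results in order. Below is a simplified, self-contained sketch of that pattern; the real helper additionally releases the caller's semaphore hold via `WithoutSemaphore` and honors `cancel_on_error`, both omitted here.

```
# Simplified sketch of the semaphore-bounded gather pattern this patch restores.
import asyncio
import functools
from typing import Awaitable, Callable, List, TypeVar

T = TypeVar('T')


async def bounded_gather_sketch(sema: asyncio.Semaphore, *pfs: Callable[[], Awaitable[T]]) -> List[T]:
    async def run_with_sema(pf: Callable[[], Awaitable[T]]) -> T:
        async with sema:
            return await pf()

    tasks = [asyncio.create_task(run_with_sema(pf)) for pf in pfs]
    return await asyncio.gather(*tasks)


async def demo():
    async def work(i: int) -> int:
        await asyncio.sleep(0)
        return i * i

    sema = asyncio.Semaphore(3)  # at most 3 coroutines run concurrently
    results = await bounded_gather_sketch(sema, *[functools.partial(work, i) for i in range(10)])
    assert results == [i * i for i in range(10)]


asyncio.run(demo())
```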
def __aiter__(self) -> 'FeedableAsyncIterable[_T]': async def __anext__(self) -> _T: next = await self._queue.get() if isinstance(next, _StopFeedableAsyncIterator): - print('FeedableAsyncIterable.__anext__', self.url, "STOP") raise StopAsyncIteration return next From 6268348f72c6bd511dc55a26e849b36bbee7175d Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:26:50 -0500 Subject: [PATCH 40/62] debug --- hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index a19700515e3..69a3364a7bc 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -90,7 +90,7 @@ def __init__(self, it: FeedableAsyncIterable[bytes], request_task: asyncio.Task[ async def cleanup_request_task(): if not self._request_task.cancelled(): try: - async with self._request_task as response: + async with await self._request_task as response: self._value = await response.json() except AttributeError as err: raise ValueError(repr(self._request_task)) from err From edbf8cdaa9f418786aca06105b95ae2ced3d632c Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:34:51 -0500 Subject: [PATCH 41/62] debug --- hail/python/hailtop/aiotools/sync.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index b216121ec29..ca39efe98b3 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -113,7 +113,8 @@ async def f(i): *[functools.partial(f, i) for i in range(n_parts)], cancel_on_error=True, ) - files_listener(-1) + print(f'file complete: {srcfile}') + files_listener(-1) async def sync( From c9b07dee54d4f6334440babb376dd330968696fd Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:36:30 -0500 Subject: [PATCH 42/62] debug --- hail/python/hailtop/aiotools/sync.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index ca39efe98b3..2229f21c0be 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -113,7 +113,6 @@ async def f(i): *[functools.partial(f, i) for i in range(n_parts)], cancel_on_error=True, ) - print(f'file complete: {srcfile}') files_listener(-1) From 7aa4a5a8cd1927501cb4397176933a677e171a20 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:40:43 -0500 Subject: [PATCH 43/62] debug --- hail/python/hailtop/aiotools/sync.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 2229f21c0be..6a3946b8e98 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -162,17 +162,20 @@ async def sync( visible=verbose, ) - async with GrowingSempahore(1, 1, (progress, parallelism_tid)) as file_sema, GrowingSempahore( + async with GrowingSempahore( initial_file_parallelism, max_file_parallelism, (progress, file_parallelism_tid) - ) as transfer_sema: - await bounded_gather2( - file_sema, - *[ - functools.partial(_copy_file, fs, transfer_sema, src, dst, files_listener, bytes_listener) - async for src, dst in iterate_plan_file(plan_folder, fs) - ], - cancel_on_error=True, - ) + ) as file_sema: + 
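For context on the `FeedableAsyncIterable` lines touched above, here is a minimal queue-backed version of the same idea: a producer feeds items (and finally a stop sentinel) into an `asyncio.Queue`, and the async iterator drains it. The `feed`/`stop` method names and the demo are illustrative, not the exact API of `hailtop.aiotools.utils`.

```
# Illustrative queue-backed feedable async iterator, mirroring the pattern above.
import asyncio
from typing import AsyncIterator, TypeVar, Union

_T = TypeVar('_T')


class _Stop:
    pass


class FeedableSketch(AsyncIterator[_T]):
    def __init__(self, maxsize: int = 1):
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=maxsize)

    def __aiter__(self) -> 'FeedableSketch[_T]':
        return self

    async def __anext__(self) -> _T:
        item: Union[_T, _Stop] = await self._queue.get()
        if isinstance(item, _Stop):
            raise StopAsyncIteration
        return item

    async def feed(self, item: _T) -> None:
        await self._queue.put(item)

    async def stop(self) -> None:
        await self._queue.put(_Stop())


async def demo():
    it: FeedableSketch[bytes] = FeedableSketch()

    async def producer():
        await it.feed(b'hello ')
        await it.feed(b'world')
        await it.stop()

    prod = asyncio.create_task(producer())
    chunks = [chunk async for chunk in it]
    await prod
    assert b''.join(chunks) == b'hello world'


asyncio.run(demo())
```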
async with GrowingSempahore( + initial_parallelism, max_parallelism, (progress, parallelism_tid) + ) as transfer_sema: + await bounded_gather2( + file_sema, + *[ + functools.partial(_copy_file, fs, transfer_sema, src, dst, files_listener, bytes_listener) + async for src, dst in iterate_plan_file(plan_folder, fs) + ], + cancel_on_error=True, + ) async def iterate_plan_file(plan_folder: str, fs: AsyncFS): From 2a886aa357f1e0c4fb86dcda43aba3d08da0e936 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:41:44 -0500 Subject: [PATCH 44/62] debug --- hail/python/hailtop/aiotools/sync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 6a3946b8e98..70e1b145f7d 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -157,8 +157,8 @@ async def sync( ) file_parallelism_tid = progress.add_task( description='file parallelism', - completed=initial_parallelism, - total=max_parallelism, + completed=initial_file_parallelism, + total=max_file_parallelism, visible=verbose, ) From 7fc7003b3730cb22f95cccd8e9c20be9f87a4ca8 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 13 Feb 2024 18:48:21 -0500 Subject: [PATCH 45/62] debug --- hail/python/hailtop/aiotools/fs/fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiotools/fs/fs.py b/hail/python/hailtop/aiotools/fs/fs.py index f911d355d9e..09267afb556 100644 --- a/hail/python/hailtop/aiotools/fs/fs.py +++ b/hail/python/hailtop/aiotools/fs/fs.py @@ -259,7 +259,7 @@ def schemes() -> Set[str]: def copy_part_size(url: str) -> int: # pylint: disable=unused-argument """Part size when copying using multi-part uploads. The part size of the destination filesystem is used.""" - return 16 * 1024 * 1024 + return 128 * 1024 * 1024 @staticmethod @abc.abstractmethod From e564b5d2250028ffcbc7a348c310357f69552fcb Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 15 Feb 2024 10:29:14 -0500 Subject: [PATCH 46/62] async with await --- .../hailtop/aiocloud/aiogoogle/client/storage_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index 69a3364a7bc..ac72d39527d 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -175,7 +175,7 @@ async def _write_chunk_1(self): # https://cloud.google.com/storage/docs/performing-resumable-uploads#status-check # note: this retries - async with self._session.put( + async with await self._session.put( self._session_url, headers={'Content-Length': '0', 'Content-Range': f'bytes */{total_size_str}'}, raise_for_status=False, @@ -371,7 +371,7 @@ async def insert_object(self, bucket: str, name: str, **kwargs) -> WritableStrea assert upload_type == 'resumable' chunk_size = kwargs.get('bufsize', 8 * 1024 * 1024) - async with self._session.post( + async with await self._session.post( f'https://storage.googleapis.com/upload/storage/v1/b/{bucket}/o', **kwargs ) as resp: session_url = resp.headers['Location'] From 40f96c5bc774a790e2fc9711a0f282073a8ed14d Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 22 Feb 2024 16:27:46 -0500 Subject: [PATCH 47/62] pyright fixes --- hail/python/hailtop/aiotools/fs/fs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hail/python/hailtop/aiotools/fs/fs.py 
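Patch 45 raises the default `copy_part_size` to 128 MiB. The small worked example below spells out the part-size arithmetic `_copy_file` relies on (equal parts, with the remainder carried by the final part); the 300 MiB figure is purely illustrative.

```
# Worked example of the multi-part split: equal parts of copy_part_size bytes,
# with the remainder (if any) as the final part. Sizes are illustrative.
def part_sizes(size: int, part_size: int = 128 * 1024 * 1024):
    n_parts = (size + part_size - 1) // part_size  # ceiling division
    rem = size % part_size
    return [rem if i == n_parts - 1 and rem else part_size for i in range(n_parts)]


sizes = part_sizes(300 * 1024 * 1024)  # a 300 MiB file
assert sizes == [128 * 1024 * 1024, 128 * 1024 * 1024, 44 * 1024 * 1024]
assert sum(sizes) == 300 * 1024 * 1024
```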
b/hail/python/hailtop/aiotools/fs/fs.py index 09267afb556..d90df6d41da 100644 --- a/hail/python/hailtop/aiotools/fs/fs.py +++ b/hail/python/hailtop/aiotools/fs/fs.py @@ -18,6 +18,7 @@ import abc import asyncio import datetime +from typing_extensions import Self from hailtop.utils import retry_transient_errors, OnlineBoundedGather2 from .stream import EmptyReadableStream, ReadableStream, WritableStream from .exceptions import FileAndDirectoryError From 4c69ec68274dfa9e583a63e1436d60ec4cffab64 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 22 Feb 2024 16:37:39 -0500 Subject: [PATCH 48/62] remove uvloopx changes --- auth/auth/auth.py | 7 ++++--- batch/batch/driver/main.py | 3 ++- batch/batch/worker/worker.py | 5 +++++ ci/ci/ci.py | 7 ++++--- ci/wait-for.py | 8 ++++---- hail/python/hailtop/aiotools/copy.py | 14 ++++++++++++-- hail/python/hailtop/aiotools/diff.py | 14 ++++++++++++-- hail/python/hailtop/uvloopx.py | 13 ------------- 8 files changed, 43 insertions(+), 28 deletions(-) delete mode 100644 hail/python/hailtop/uvloopx.py diff --git a/auth/auth/auth.py b/auth/auth/auth.py index dbee3451def..dd5f2887c58 100644 --- a/auth/auth/auth.py +++ b/auth/auth/auth.py @@ -11,6 +11,7 @@ import kubernetes_asyncio.client import kubernetes_asyncio.client.rest import kubernetes_asyncio.config +import uvloop from aiohttp import web from prometheus_async.aio.web import server_stats # type: ignore @@ -32,7 +33,7 @@ from gear.auth import AIOHTTPHandler, get_session_id from gear.cloud_config import get_global_config from gear.profiling import install_profiler_if_requested -from hailtop import httpx, uvloopx +from hailtop import httpx from hailtop.auth import AzureFlow, Flow, GoogleFlow, IdentityProvider from hailtop.config import get_deploy_config from hailtop.hail_logging import AccessLogger @@ -55,6 +56,8 @@ log = logging.getLogger('auth') +uvloop.install() + CLOUD = get_global_config()['cloud'] DEFAULT_NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE'] @@ -884,8 +887,6 @@ async def auth_check_csrf_token(request: web.Request, handler: AIOHTTPHandler): def run(): - uvloopx.install() - install_profiler_if_requested('auth') app = web.Application(middlewares=[auth_check_csrf_token, monitor_endpoints_middleware]) diff --git a/batch/batch/driver/main.py b/batch/batch/driver/main.py index 8965f5c528c..69e693bf8d7 100644 --- a/batch/batch/driver/main.py +++ b/batch/batch/driver/main.py @@ -20,6 +20,7 @@ import plotly import plotly.graph_objects as go import prometheus_client as pc # type: ignore +import uvloop from aiohttp import web from plotly.subplots import make_subplots from prometheus_async.aio.web import server_stats @@ -80,7 +81,7 @@ from .instance_collection import InstanceCollectionManager, JobPrivateInstanceManager, Pool from .job import mark_job_complete, mark_job_started -uvloopx.install() +uvloop.install() log = logging.getLogger('batch') diff --git a/batch/batch/worker/worker.py b/batch/batch/worker/worker.py index 424ce57c5b3..5e62a92dc6b 100644 --- a/batch/batch/worker/worker.py +++ b/batch/batch/worker/worker.py @@ -86,6 +86,11 @@ from ..worker.worker_api import CloudDisk, CloudWorkerAPI, ContainerRegistryCredentials from .jvm_entryway_protocol import EndOfStream, read_bool, read_int, read_str, write_int, write_str +# import uvloop + + +# uvloop.install() + with open('/subdomains.txt', 'r', encoding='utf-8') as subdomains_file: HAIL_SERVICES = [line.rstrip() for line in subdomains_file.readlines()] diff --git a/ci/ci/ci.py b/ci/ci/ci.py index 513729733c0..fd708f35d97 100644 --- a/ci/ci/ci.py 
+++ b/ci/ci/ci.py @@ -11,6 +11,7 @@ import kubernetes_asyncio import kubernetes_asyncio.client import kubernetes_asyncio.config +import uvloop # type: ignore import yaml from aiohttp import web from gidgethub import aiohttp as gh_aiohttp @@ -30,7 +31,7 @@ setup_aiohttp_session, ) from gear.profiling import install_profiler_if_requested -from hailtop import aiotools, httpx, uvloopx +from hailtop import aiotools, httpx from hailtop.auth import hail_credentials from hailtop.batch_client.aioclient import Batch, BatchClient from hailtop.config import get_deploy_config @@ -50,6 +51,8 @@ log = logging.getLogger('ci') +uvloop.install() + deploy_config = get_deploy_config() watched_branches: List[WatchedBranch] = [] @@ -818,8 +821,6 @@ async def on_cleanup(app: web.Application): def run(): - uvloopx.install() - install_profiler_if_requested('ci') app = web.Application(middlewares=[check_csrf_token, monitor_endpoints_middleware]) diff --git a/ci/wait-for.py b/ci/wait-for.py index 6f8b54b88bf..22ed7967cd2 100644 --- a/ci/wait-for.py +++ b/ci/wait-for.py @@ -4,9 +4,10 @@ import sys import traceback +import uvloop from kubernetes_asyncio import client, config -from hailtop import uvloopx +uvloop.install() async def timeout(timeout_seconds): @@ -69,6 +70,5 @@ async def main(): await v1.api_client.rest_client.pool_manager.close() -if __name__ == '__main__': - uvloopx.install() - asyncio.run(main()) +loop = asyncio.get_event_loop() +loop.run_until_complete(main()) diff --git a/hail/python/hailtop/aiotools/copy.py b/hail/python/hailtop/aiotools/copy.py index 547a43d85f2..45a361d8fc3 100644 --- a/hail/python/hailtop/aiotools/copy.py +++ b/hail/python/hailtop/aiotools/copy.py @@ -8,12 +8,22 @@ from concurrent.futures import ThreadPoolExecutor from rich.progress import Progress, TaskID -from .. import uvloopx from ..utils.utils import sleep_before_try from ..utils.rich_progress_bar import CopyToolProgressBar, make_listener from . import Transfer, Copier from .router_fs import RouterAsyncFS +try: + import uvloop + + uvloop_install = uvloop.install +except ImportError as e: + if not sys.platform.startswith('win32'): + raise e + + def uvloop_install(): + pass + class GrowingSempahore(AsyncContextManager[asyncio.Semaphore]): def __init__(self, start_max: int, target_max: int, progress_and_tid: Optional[Tuple[Progress, TaskID]]): @@ -212,5 +222,5 @@ async def main() -> None: if __name__ == '__main__': - uvloopx.install() + uvloop_install() asyncio.run(main()) diff --git a/hail/python/hailtop/aiotools/diff.py b/hail/python/hailtop/aiotools/diff.py index bdc7942732f..a2208ecb6b6 100644 --- a/hail/python/hailtop/aiotools/diff.py +++ b/hail/python/hailtop/aiotools/diff.py @@ -7,11 +7,21 @@ from concurrent.futures import ThreadPoolExecutor -from .. 
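The guarded uvloop import that patch 48 inlines into `copy.py` and `diff.py` (and that patch 54 later re-centralizes as `hailtop.uvloopx`) looks like this as a standalone script. On non-Windows platforms a missing uvloop is treated as an error, matching the behavior in the diff.

```
# Guarded uvloop import: install the faster event loop policy where available,
# fall back to a no-op only on Windows (elsewhere uvloop is expected to exist).
import asyncio
import sys

try:
    import uvloop

    def install():
        return uvloop.install()
except ImportError as e:
    if not sys.platform.startswith('win32'):
        raise e

    def install():
        pass


async def main():
    print('running on', type(asyncio.get_running_loop()).__name__)


if __name__ == '__main__':
    install()  # set the event loop policy before asyncio.run creates the loop
    asyncio.run(main())
```

Installing the policy before `asyncio.run` is the key ordering constraint: once a loop has been created, switching policies no longer affects it.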
import uvloopx from ..utils.rich_progress_bar import SimpleCopyToolProgressBar, SimpleCopyToolProgressBarTask from .router_fs import RouterAsyncFS from .fs import AsyncFS, FileStatus +try: + import uvloop + + uvloop_install = uvloop.install +except ImportError as e: + if not sys.platform.startswith('win32'): + raise e + + def uvloop_install(): + pass + async def diff( *, @@ -175,5 +185,5 @@ async def main() -> None: if __name__ == '__main__': - uvloopx.install() + uvloop_install() asyncio.run(main()) diff --git a/hail/python/hailtop/uvloopx.py b/hail/python/hailtop/uvloopx.py deleted file mode 100644 index bac9f2542f6..00000000000 --- a/hail/python/hailtop/uvloopx.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - -try: - import uvloop - - def install(): - return uvloop.install() -except ImportError as e: - if not sys.platform.startswith('win32'): - raise e - - def install(): - pass From e3425e128c36bfcdf76f80d6403cf93ae8280382 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 22 Feb 2024 16:39:02 -0500 Subject: [PATCH 49/62] Revert "also front_end.py" This reverts commit 618e9606afa8614d276f2b6679a7dc02c86ca6fa. --- batch/batch/front_end/front_end.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/batch/batch/front_end/front_end.py b/batch/batch/front_end/front_end.py index 56a6bcb9b5f..874e272efc8 100644 --- a/batch/batch/front_end/front_end.py +++ b/batch/batch/front_end/front_end.py @@ -23,6 +23,7 @@ import plotly.express as px import plotly.graph_objects as go import pymysql +import uvloop from aiohttp import web from plotly.subplots import make_subplots from prometheus_async.aio.web import server_stats # type: ignore @@ -45,7 +46,7 @@ from gear.clients import get_cloud_async_fs from gear.database import CallError from gear.profiling import install_profiler_if_requested -from hailtop import aiotools, dictfix, httpx, uvloopx, version +from hailtop import aiotools, dictfix, httpx, version from hailtop.auth import hail_credentials from hailtop.batch_client.globals import MAX_JOB_GROUPS_DEPTH, ROOT_JOB_GROUP_ID from hailtop.batch_client.parse import parse_cpu_in_mcpu, parse_memory_in_bytes, parse_storage_in_bytes @@ -124,7 +125,7 @@ validate_job_groups, ) -uvloopx.install() +uvloop.install() log = logging.getLogger('batch.front_end') From eeb861fc31310f806fadaa99d7f49669f3fe1c11 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 22 Feb 2024 17:12:53 -0500 Subject: [PATCH 50/62] remove cruft --- hail/python/hailtop/aiotools/plan.py | 94 ++++++++++------------------ 1 file changed, 33 insertions(+), 61 deletions(-) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py index 14d655527d8..1d2575c021f 100644 --- a/hail/python/hailtop/aiotools/plan.py +++ b/hail/python/hailtop/aiotools/plan.py @@ -225,76 +225,48 @@ async def find_all_copy_pairs( n_files = 0 n_bytes = 0 - async def process_child_directory(new_srcurl: str, new_dsturl: str) -> Tuple[int, int]: - return await find_all_copy_pairs( - fs, - matches, - differs, - srconly, - dstonly, - plan, - new_srcurl, - new_dsturl, - progress, - sema, - source_must_exist=False, - ) - - async def retrieve_child_directory_results(child_directory_task): - nonlocal n_files - nonlocal n_bytes - dir_n_files, dir_n_bytes = await child_directory_task - n_files += dir_n_files - n_bytes += dir_n_bytes - - async with AsyncExitStack() as child_directory_callbacks: - - def background_process_child_dir(new_srcurl: str, new_dsturl: str): - t = asyncio.create_task(process_child_directory(new_srcurl, 
new_dsturl)) - child_directory_callbacks.push_async_callback(retrieve_child_directory_results, t) - - while srcidx < len(srcfiles) and dstidx < len(dstfiles): - srcf = srcfiles[srcidx] - dstf = dstfiles[dstidx] - relsrcf = relativize_url(folder=src, file=srcf.url) - reldstf = relativize_url(folder=dst, file=dstf.url) - if relsrcf == reldstf: - if srcf.size == dstf.size: - await writeline(matches, srcf.url, dstf.url) - else: - await writeline(differs, srcf.url, dstf.url, str(srcf.size), str(dstf.size)) - dstidx += 1 - srcidx += 1 - progress.update(tid, advance=2) - elif relsrcf < reldstf: - await writeline(srconly, srcf.url) - await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) - n_files += 1 - n_bytes += srcf.size - srcidx += 1 - progress.update(tid, advance=1) + while srcidx < len(srcfiles) and dstidx < len(dstfiles): + srcf = srcfiles[srcidx] + dstf = dstfiles[dstidx] + relsrcf = relativize_url(folder=src, file=srcf.url) + reldstf = relativize_url(folder=dst, file=dstf.url) + if relsrcf == reldstf: + if srcf.size == dstf.size: + await writeline(matches, srcf.url, dstf.url) else: - assert relsrcf >= reldstf - dstidx += 1 - progress.update(tid, advance=1) - await writeline(dstonly, dstf.url) - - while srcidx < len(srcfiles): - srcf = srcfiles[srcidx] - + await writeline(differs, srcf.url, dstf.url, str(srcf.size), str(dstf.size)) + dstidx += 1 + srcidx += 1 + progress.update(tid, advance=2) + elif relsrcf < reldstf: await writeline(srconly, srcf.url) await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) n_files += 1 n_bytes += srcf.size srcidx += 1 progress.update(tid, advance=1) - - while dstidx < len(dstfiles): - dstf = dstfiles[dstidx] - - await writeline(dstonly, dstf.url) + else: + assert relsrcf >= reldstf dstidx += 1 progress.update(tid, advance=1) + await writeline(dstonly, dstf.url) + + while srcidx < len(srcfiles): + srcf = srcfiles[srcidx] + + await writeline(srconly, srcf.url) + await writeline(plan, srcf.url, os.path.join(dst, relativize_url(folder=src, file=srcf.url))) + n_files += 1 + n_bytes += srcf.size + srcidx += 1 + progress.update(tid, advance=1) + + while dstidx < len(dstfiles): + dstf = dstfiles[dstidx] + + await writeline(dstonly, dstf.url) + dstidx += 1 + progress.update(tid, advance=1) # a short sleep ensures the progress bar is visible for a moment to the user await asyncio.sleep(0.150) From ba77330b7d5862e4ab1279e4e3d4cde350d72c49 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 22 Feb 2024 17:14:14 -0500 Subject: [PATCH 51/62] remove debug cruft --- .../hailtop/aiocloud/aiogoogle/client/storage_client.py | 4 ++-- hail/python/hailtop/aiotools/utils.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index ac72d39527d..94e34a1275f 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -219,7 +219,7 @@ async def _write_chunk_1(self): # Upload a single chunk. 
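Patch 50 keeps the straight-line merge in `plan.py`: source and destination listings, already sorted by relative path, are walked in lockstep and each entry is binned into matches, differs, srconly, or dstonly (srconly entries also become plan lines). Below is a simplified in-memory sketch of that merge, using (relative path, size) pairs instead of `FileListEntry` objects and plain lists instead of the plan-folder files.

```
# Simplified two-pointer merge over sorted source/destination listings.
from typing import Dict, List, Tuple


def merge_listings(src: List[Tuple[str, int]], dst: List[Tuple[str, int]]) -> Dict[str, list]:
    out: Dict[str, list] = {'matches': [], 'differs': [], 'srconly': [], 'dstonly': []}
    i = j = 0
    while i < len(src) and j < len(dst):
        (sname, ssize), (dname, dsize) = src[i], dst[j]
        if sname == dname:
            (out['matches'] if ssize == dsize else out['differs']).append(sname)
            i += 1
            j += 1
        elif sname < dname:
            out['srconly'].append(sname)  # in the real code this also emits a plan line
            i += 1
        else:
            out['dstonly'].append(dname)
            j += 1
    out['srconly'] += [name for name, _ in src[i:]]
    out['dstonly'] += [name for name, _ in dst[j:]]
    return out


result = merge_listings(
    src=[('a', 10), ('b', 20), ('d', 5)],
    dst=[('a', 10), ('b', 21), ('c', 7)],
)
assert result == {'matches': ['a'], 'differs': ['b'], 'srconly': ['d'], 'dstonly': ['c']}
```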
See: # https://cloud.google.com/storage/docs/performing-resumable-uploads#chunked-upload - it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable(self._session_url) + it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable() async with _TaskManager( self._session.put( self._session_url, @@ -357,7 +357,7 @@ async def insert_object(self, bucket: str, name: str, **kwargs) -> WritableStrea params['uploadType'] = upload_type if upload_type == 'media': - it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable('gs://' + bucket + '/' + name) + it: FeedableAsyncIterable[bytes] = FeedableAsyncIterable() kwargs['data'] = aiohttp.AsyncIterablePayload(it) request_task = asyncio.create_task( self._session.post( diff --git a/hail/python/hailtop/aiotools/utils.py b/hail/python/hailtop/aiotools/utils.py index b28c488d39a..0a7ae6d96cf 100644 --- a/hail/python/hailtop/aiotools/utils.py +++ b/hail/python/hailtop/aiotools/utils.py @@ -12,10 +12,9 @@ class _StopFeedableAsyncIterator: class FeedableAsyncIterable(AsyncIterator[_T]): - def __init__(self, url: str, maxsize: int = 1): + def __init__(self, maxsize: int = 1): self._queue: asyncio.Queue = asyncio.Queue(maxsize=maxsize) self._stopped = False - self.url = url def __aiter__(self) -> 'FeedableAsyncIterable[_T]': return self From 44c93641b7b1c89f0c3a8e56886e6df8ed02bb99 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 27 Feb 2024 17:26:48 -0500 Subject: [PATCH 52/62] fix Self improt --- batch/batch/driver/main.py | 2 +- hail/python/hailtop/aiotools/fs/fs.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/batch/batch/driver/main.py b/batch/batch/driver/main.py index 69e693bf8d7..f05f1d297fa 100644 --- a/batch/batch/driver/main.py +++ b/batch/batch/driver/main.py @@ -41,7 +41,7 @@ from gear.auth import AIOHTTPHandler, UserData from gear.clients import get_cloud_async_fs from gear.profiling import install_profiler_if_requested -from hailtop import aiotools, httpx, uvloopx +from hailtop import aiotools, httpx from hailtop.batch_client.globals import ROOT_JOB_GROUP_ID from hailtop.config import get_deploy_config from hailtop.hail_logging import AccessLogger diff --git a/hail/python/hailtop/aiotools/fs/fs.py b/hail/python/hailtop/aiotools/fs/fs.py index d90df6d41da..09267afb556 100644 --- a/hail/python/hailtop/aiotools/fs/fs.py +++ b/hail/python/hailtop/aiotools/fs/fs.py @@ -18,7 +18,6 @@ import abc import asyncio import datetime -from typing_extensions import Self from hailtop.utils import retry_transient_errors, OnlineBoundedGather2 from .stream import EmptyReadableStream, ReadableStream, WritableStream from .exceptions import FileAndDirectoryError From f6eacea9cde92a5074ad3f872040a1787aed75d7 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 27 Feb 2024 22:48:34 -0500 Subject: [PATCH 53/62] fix bad imports --- hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py index e3bdd041aa6..8761ddc05b6 100644 --- a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py +++ b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py @@ -10,7 +10,8 @@ from .generate_copy_test_specs import run_test_spec, create_test_file, create_test_dir -from .test_copy import cloud_scheme, router_filesystem, fresh_dir +from .test_copy import cloud_scheme +from .utils import fresh_dir @pytest.mark.asyncio From 2bd85ecc63ada3d6aad7ca94a14294cfa0f65ca0 Mon Sep 
17 00:00:00 2001 From: Dan King Date: Thu, 22 Feb 2024 16:34:09 -0500 Subject: [PATCH 54/62] [uvloopx] consolidate uvloop initialization code to one place --- auth/auth/auth.py | 7 +++---- batch/batch/driver/main.py | 5 ++--- batch/batch/worker/worker.py | 5 ----- ci/ci/ci.py | 7 +++---- ci/wait-for.py | 8 ++++---- hail/python/hailtop/aiotools/copy.py | 14 ++------------ hail/python/hailtop/aiotools/diff.py | 14 ++------------ hail/python/hailtop/uvloopx.py | 13 +++++++++++++ 8 files changed, 29 insertions(+), 44 deletions(-) create mode 100644 hail/python/hailtop/uvloopx.py diff --git a/auth/auth/auth.py b/auth/auth/auth.py index dd5f2887c58..dbee3451def 100644 --- a/auth/auth/auth.py +++ b/auth/auth/auth.py @@ -11,7 +11,6 @@ import kubernetes_asyncio.client import kubernetes_asyncio.client.rest import kubernetes_asyncio.config -import uvloop from aiohttp import web from prometheus_async.aio.web import server_stats # type: ignore @@ -33,7 +32,7 @@ from gear.auth import AIOHTTPHandler, get_session_id from gear.cloud_config import get_global_config from gear.profiling import install_profiler_if_requested -from hailtop import httpx +from hailtop import httpx, uvloopx from hailtop.auth import AzureFlow, Flow, GoogleFlow, IdentityProvider from hailtop.config import get_deploy_config from hailtop.hail_logging import AccessLogger @@ -56,8 +55,6 @@ log = logging.getLogger('auth') -uvloop.install() - CLOUD = get_global_config()['cloud'] DEFAULT_NAMESPACE = os.environ['HAIL_DEFAULT_NAMESPACE'] @@ -887,6 +884,8 @@ async def auth_check_csrf_token(request: web.Request, handler: AIOHTTPHandler): def run(): + uvloopx.install() + install_profiler_if_requested('auth') app = web.Application(middlewares=[auth_check_csrf_token, monitor_endpoints_middleware]) diff --git a/batch/batch/driver/main.py b/batch/batch/driver/main.py index 8db8a6bc5c2..5499ac861d6 100644 --- a/batch/batch/driver/main.py +++ b/batch/batch/driver/main.py @@ -20,7 +20,6 @@ import plotly import plotly.graph_objects as go import prometheus_client as pc # type: ignore -import uvloop from aiohttp import web from plotly.subplots import make_subplots from prometheus_async.aio.web import server_stats @@ -41,7 +40,7 @@ from gear.auth import AIOHTTPHandler, UserData from gear.clients import get_cloud_async_fs from gear.profiling import install_profiler_if_requested -from hailtop import aiotools, httpx +from hailtop import aiotools, httpx, uvloopx from hailtop.batch_client.globals import ROOT_JOB_GROUP_ID from hailtop.config import get_deploy_config from hailtop.hail_logging import AccessLogger @@ -81,7 +80,7 @@ from .instance_collection import InstanceCollectionManager, JobPrivateInstanceManager, Pool from .job import mark_job_complete, mark_job_started -uvloop.install() +uvloopx.install() log = logging.getLogger('batch') diff --git a/batch/batch/worker/worker.py b/batch/batch/worker/worker.py index 5e62a92dc6b..424ce57c5b3 100644 --- a/batch/batch/worker/worker.py +++ b/batch/batch/worker/worker.py @@ -86,11 +86,6 @@ from ..worker.worker_api import CloudDisk, CloudWorkerAPI, ContainerRegistryCredentials from .jvm_entryway_protocol import EndOfStream, read_bool, read_int, read_str, write_int, write_str -# import uvloop - - -# uvloop.install() - with open('/subdomains.txt', 'r', encoding='utf-8') as subdomains_file: HAIL_SERVICES = [line.rstrip() for line in subdomains_file.readlines()] diff --git a/ci/ci/ci.py b/ci/ci/ci.py index fd708f35d97..513729733c0 100644 --- a/ci/ci/ci.py +++ b/ci/ci/ci.py @@ -11,7 +11,6 @@ import kubernetes_asyncio 
import kubernetes_asyncio.client import kubernetes_asyncio.config -import uvloop # type: ignore import yaml from aiohttp import web from gidgethub import aiohttp as gh_aiohttp @@ -31,7 +30,7 @@ setup_aiohttp_session, ) from gear.profiling import install_profiler_if_requested -from hailtop import aiotools, httpx +from hailtop import aiotools, httpx, uvloopx from hailtop.auth import hail_credentials from hailtop.batch_client.aioclient import Batch, BatchClient from hailtop.config import get_deploy_config @@ -51,8 +50,6 @@ log = logging.getLogger('ci') -uvloop.install() - deploy_config = get_deploy_config() watched_branches: List[WatchedBranch] = [] @@ -821,6 +818,8 @@ async def on_cleanup(app: web.Application): def run(): + uvloopx.install() + install_profiler_if_requested('ci') app = web.Application(middlewares=[check_csrf_token, monitor_endpoints_middleware]) diff --git a/ci/wait-for.py b/ci/wait-for.py index 22ed7967cd2..6f8b54b88bf 100644 --- a/ci/wait-for.py +++ b/ci/wait-for.py @@ -4,10 +4,9 @@ import sys import traceback -import uvloop from kubernetes_asyncio import client, config -uvloop.install() +from hailtop import uvloopx async def timeout(timeout_seconds): @@ -70,5 +69,6 @@ async def main(): await v1.api_client.rest_client.pool_manager.close() -loop = asyncio.get_event_loop() -loop.run_until_complete(main()) +if __name__ == '__main__': + uvloopx.install() + asyncio.run(main()) diff --git a/hail/python/hailtop/aiotools/copy.py b/hail/python/hailtop/aiotools/copy.py index 45a361d8fc3..547a43d85f2 100644 --- a/hail/python/hailtop/aiotools/copy.py +++ b/hail/python/hailtop/aiotools/copy.py @@ -8,22 +8,12 @@ from concurrent.futures import ThreadPoolExecutor from rich.progress import Progress, TaskID +from .. import uvloopx from ..utils.utils import sleep_before_try from ..utils.rich_progress_bar import CopyToolProgressBar, make_listener from . import Transfer, Copier from .router_fs import RouterAsyncFS -try: - import uvloop - - uvloop_install = uvloop.install -except ImportError as e: - if not sys.platform.startswith('win32'): - raise e - - def uvloop_install(): - pass - class GrowingSempahore(AsyncContextManager[asyncio.Semaphore]): def __init__(self, start_max: int, target_max: int, progress_and_tid: Optional[Tuple[Progress, TaskID]]): @@ -222,5 +212,5 @@ async def main() -> None: if __name__ == '__main__': - uvloop_install() + uvloopx.install() asyncio.run(main()) diff --git a/hail/python/hailtop/aiotools/diff.py b/hail/python/hailtop/aiotools/diff.py index a2208ecb6b6..bdc7942732f 100644 --- a/hail/python/hailtop/aiotools/diff.py +++ b/hail/python/hailtop/aiotools/diff.py @@ -7,21 +7,11 @@ from concurrent.futures import ThreadPoolExecutor +from .. 
import uvloopx from ..utils.rich_progress_bar import SimpleCopyToolProgressBar, SimpleCopyToolProgressBarTask from .router_fs import RouterAsyncFS from .fs import AsyncFS, FileStatus -try: - import uvloop - - uvloop_install = uvloop.install -except ImportError as e: - if not sys.platform.startswith('win32'): - raise e - - def uvloop_install(): - pass - async def diff( *, @@ -185,5 +175,5 @@ async def main() -> None: if __name__ == '__main__': - uvloop_install() + uvloopx.install() asyncio.run(main()) diff --git a/hail/python/hailtop/uvloopx.py b/hail/python/hailtop/uvloopx.py new file mode 100644 index 00000000000..bac9f2542f6 --- /dev/null +++ b/hail/python/hailtop/uvloopx.py @@ -0,0 +1,13 @@ +import sys + +try: + import uvloop + + def install(): + return uvloop.install() +except ImportError as e: + if not sys.platform.startswith('win32'): + raise e + + def install(): + pass From 4c054716d8b4a6b09657e6917547256b49ffed45 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 22 Feb 2024 16:38:38 -0500 Subject: [PATCH 55/62] also front_end.py --- batch/batch/front_end/front_end.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/batch/batch/front_end/front_end.py b/batch/batch/front_end/front_end.py index eafbcf518d1..48aa7aa5206 100644 --- a/batch/batch/front_end/front_end.py +++ b/batch/batch/front_end/front_end.py @@ -23,7 +23,6 @@ import plotly.express as px import plotly.graph_objects as go import pymysql -import uvloop from aiohttp import web from plotly.subplots import make_subplots from prometheus_async.aio.web import server_stats # type: ignore @@ -46,7 +45,7 @@ from gear.clients import get_cloud_async_fs from gear.database import CallError from gear.profiling import install_profiler_if_requested -from hailtop import aiotools, dictfix, httpx, version +from hailtop import aiotools, dictfix, httpx, uvloopx, version from hailtop.auth import hail_credentials from hailtop.batch_client.globals import MAX_JOB_GROUPS_DEPTH, ROOT_JOB_GROUP_ID from hailtop.batch_client.parse import parse_cpu_in_mcpu, parse_memory_in_bytes, parse_storage_in_bytes @@ -125,7 +124,7 @@ validate_job_groups, ) -uvloop.install() +uvloopx.install() log = logging.getLogger('batch.front_end') From 4bba249e2739290ef6295906783910917fb97247 Mon Sep 17 00:00:00 2001 From: Dan King Date: Wed, 28 Feb 2024 17:12:45 -0500 Subject: [PATCH 56/62] revert unnecsesary changes to copy and copier --- hail/python/hailtop/aiotools/copy.py | 32 ++++++----------------- hail/python/hailtop/aiotools/fs/copier.py | 13 ++++----- 2 files changed, 13 insertions(+), 32 deletions(-) diff --git a/hail/python/hailtop/aiotools/copy.py b/hail/python/hailtop/aiotools/copy.py index 547a43d85f2..26c438be6dc 100644 --- a/hail/python/hailtop/aiotools/copy.py +++ b/hail/python/hailtop/aiotools/copy.py @@ -57,14 +57,6 @@ async def __aexit__(self, exc_type, exc, tb): self.task.cancel() -def only_update_completions(progress: Progress, tid): - def listen(delta: int): - if delta < 0: - progress.update(tid, advance=-delta) - - return listen - - async def copy( *, max_simultaneous_transfers: Optional[int] = None, @@ -74,7 +66,6 @@ async def copy( s3_kwargs: Optional[dict] = None, transfers: List[Transfer], verbose: bool = False, - totals: Optional[Tuple[int, int]] = None, ) -> None: with ThreadPoolExecutor() as thread_pool: if max_simultaneous_transfers is None: @@ -95,7 +86,7 @@ async def copy( local_kwargs=local_kwargs, gcs_kwargs=gcs_kwargs, azure_kwargs=azure_kwargs, s3_kwargs=s3_kwargs ) as fs: with CopyToolProgressBar(transient=True, 
disable=not verbose) as progress: - initial_simultaneous_transfers = min(10, max_simultaneous_transfers) + initial_simultaneous_transfers = 10 parallelism_tid = progress.add_task( description='parallelism', completed=initial_simultaneous_transfers, @@ -107,22 +98,15 @@ async def copy( ) as sema: file_tid = progress.add_task(description='files', total=0, visible=verbose) bytes_tid = progress.add_task(description='bytes', total=0, visible=verbose) - - if totals: - n_files, n_bytes = totals - progress.update(file_tid, total=n_files) - progress.update(bytes_tid, total=n_bytes) - file_listener = only_update_completions(progress, file_tid) - bytes_listener = only_update_completions(progress, bytes_tid) - else: - file_listener = make_listener(progress, file_tid) - bytes_listener = make_listener(progress, bytes_tid) - copy_report = await Copier.copy( - fs, sema, transfers, files_listener=file_listener, bytes_listener=bytes_listener + fs, + sema, + transfers, + files_listener=make_listener(progress, file_tid), + bytes_listener=make_listener(progress, bytes_tid), ) if verbose: - copy_report.summarize(include_sources=totals is None) + copy_report.summarize() def make_transfer(json_object: Dict[str, str]) -> Transfer: @@ -175,7 +159,7 @@ async def main() -> None: parser.add_argument( '-v', '--verbose', action='store_const', const=True, default=False, help='show logging information' ) - parser.add_argument('--timeout', type=str, default=None, help='Set the total timeout for HTTP requests.') + parser.add_argument('--timeout', type=str, default=None, help='show logging information') args = parser.parse_args() if args.verbose: diff --git a/hail/python/hailtop/aiotools/fs/copier.py b/hail/python/hailtop/aiotools/fs/copier.py index b934675f08f..def7933a115 100644 --- a/hail/python/hailtop/aiotools/fs/copier.py +++ b/hail/python/hailtop/aiotools/fs/copier.py @@ -143,7 +143,7 @@ def mark_done(self): self._end_time = time_msecs() self._duration = self._end_time - self._start_time - def summarize(self, include_sources: bool = True): + def summarize(self): source_reports = [] def add_source_reports(transfer_report): @@ -177,10 +177,9 @@ def add_source_reports(transfer_report): file_rate = total_files / (self._duration / 1000) print(f' Average file rate: {file_rate:,.1f}/s') - if include_sources: - print('Sources:') - for sr in source_reports: - print(f' {sr._source}: {sr._files} files, {humanize.naturalsize(sr._bytes)}') + print('Sources:') + for sr in source_reports: + print(f' {sr._source}: {sr._files} files, {humanize.naturalsize(sr._bytes)}') class SourceCopier: @@ -240,7 +239,6 @@ async def _copy_part( part_creator: MultiPartCreate, return_exceptions: bool, ) -> None: - total_written = 0 try: async with self.xfer_sema.acquire_manager(min(Copier.BUFFER_SIZE, this_part_size)): async with await self.router_fs.open_from( @@ -256,9 +254,8 @@ async def _copy_part( raise UnexpectedEOFError() written = await destf.write(b) assert written == len(b) - total_written += written + source_report.finish_bytes(written) n -= len(b) - source_report.finish_bytes(total_written) except Exception as e: if return_exceptions: source_report.set_exception(e) From 7ae6107f429d1eb0317af0894512da5d026a457e Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 29 Feb 2024 11:53:04 -0500 Subject: [PATCH 57/62] add assertion about total size and also fix lint about unused variable --- hail/python/hailtop/aiotools/sync.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hail/python/hailtop/aiotools/sync.py 
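Patch 56 reverts `copy.py` to the plain `make_listener` progress wiring. The helper it deletes, `only_update_completions`, illustrates the listener contract: callbacks receive a signed byte/file delta and advance the Rich progress task when work completes (negative deltas). A runnable excerpt follows, with the assumption, not confirmed by the diff, that `make_listener` additionally grows the task total on positive deltas.

```
# Listener callbacks for the Rich progress bars: negative deltas mean completed work.
from rich.progress import Progress


def only_update_completions(progress: Progress, tid):
    def listen(delta: int):
        if delta < 0:
            progress.update(tid, advance=-delta)

    return listen


with Progress(transient=True) as progress:
    bytes_tid = progress.add_task(description='bytes', total=1024)
    listener = only_update_completions(progress, bytes_tid)
    listener(-512)  # first 512 bytes copied
    listener(-512)  # remaining 512 bytes copied
```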
b/hail/python/hailtop/aiotools/sync.py index 70e1b145f7d..0deebcb9a49 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -44,6 +44,7 @@ async def _copy_file_one_part( written = await destf.write(b) assert written == len(b) total_written += written + assert total_written == size async def _copy_part( From 9c25bc03bf0d086d7d40326c9495506e3cbe8237 Mon Sep 17 00:00:00 2001 From: Chris Vittal Date: Tue, 25 Jun 2024 13:09:00 -0400 Subject: [PATCH 58/62] fix --- hail/python/hailtop/aiocloud/aioazure/fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/hailtop/aiocloud/aioazure/fs.py b/hail/python/hailtop/aiocloud/aioazure/fs.py index a8489b84cdc..8acdc0c7257 100644 --- a/hail/python/hailtop/aiocloud/aioazure/fs.py +++ b/hail/python/hailtop/aiocloud/aioazure/fs.py @@ -559,7 +559,7 @@ async def isfile(self, url: str) -> bool: @handle_public_access_error async def isdir(self, url: str) -> bool: fs_url = self.parse_url(url, error_if_bucket=True) - client = self.get_container_client(fs_url) + client = await self.get_container_client(fs_url) prefix = fs_url.path if prefix[-1] != '/': prefix = prefix + '/' From 44ef79472c280345907d55f5fc96b2801f96086d Mon Sep 17 00:00:00 2001 From: Christopher Vittal Date: Wed, 7 Aug 2024 12:32:09 -0400 Subject: [PATCH 59/62] lint fixes --- hail/python/hailtop/aiotools/plan.py | 7 ++++--- hail/python/hailtop/aiotools/sync.py | 12 ++++++------ hail/python/hailtop/hailctl/__main__.py | 3 +-- hail/python/hailtop/hailctl/fs/cli.py | 10 ++++++---- .../python/test/hailtop/inter_cloud/test_copy.py | 16 ++++++++-------- .../hailtop/inter_cloud/test_hailctl_fs_sync.py | 2 ++ 6 files changed, 27 insertions(+), 23 deletions(-) diff --git a/hail/python/hailtop/aiotools/plan.py b/hail/python/hailtop/aiotools/plan.py index 1d2575c021f..6545c3791fb 100644 --- a/hail/python/hailtop/aiotools/plan.py +++ b/hail/python/hailtop/aiotools/plan.py @@ -1,12 +1,13 @@ -from typing import List, Tuple, Optional import asyncio import os from contextlib import AsyncExitStack +from typing import List, Optional, Tuple + from hailtop.aiotools import FileAndDirectoryError -from .router_fs import RouterAsyncFS -from .fs import FileListEntry, FileStatus, AsyncFS, WritableStream from ..utils.rich_progress_bar import CopyToolProgressBar, Progress +from .fs import AsyncFS, FileListEntry, FileStatus, WritableStream +from .router_fs import RouterAsyncFS class PlanError(ValueError): diff --git a/hail/python/hailtop/aiotools/sync.py b/hail/python/hailtop/aiotools/sync.py index 0deebcb9a49..d98fd43c3f5 100644 --- a/hail/python/hailtop/aiotools/sync.py +++ b/hail/python/hailtop/aiotools/sync.py @@ -1,15 +1,15 @@ -from typing import Optional import asyncio -import os import functools +import os +from typing import Optional +from ..utils import bounded_gather2, retry_transient_errors +from ..utils.rich_progress_bar import CopyToolProgressBar, make_listener from .copy import GrowingSempahore -from .fs.fs import MultiPartCreate from .fs.copier import Copier from .fs.exceptions import UnexpectedEOFError -from .router_fs import RouterAsyncFS, AsyncFS -from ..utils import retry_transient_errors, bounded_gather2 -from ..utils.rich_progress_bar import make_listener, CopyToolProgressBar +from .fs.fs import MultiPartCreate +from .router_fs import AsyncFS, RouterAsyncFS class SyncError(ValueError): diff --git a/hail/python/hailtop/hailctl/__main__.py b/hail/python/hailtop/hailctl/__main__.py index 36fc0154d83..3f514d2b224 100644 --- 
a/hail/python/hailtop/hailctl/__main__.py +++ b/hail/python/hailtop/hailctl/__main__.py @@ -1,5 +1,4 @@ import os - from typing import cast import click @@ -11,8 +10,8 @@ from .dataproc import cli as dataproc_cli from .describe import describe from .dev import cli as dev_cli -from .hdinsight import cli as hdinsight_cli from .fs import cli as fs_cli +from .hdinsight import cli as hdinsight_cli app = typer.Typer( help='Manage and monitor hail deployments.', diff --git a/hail/python/hailtop/hailctl/fs/cli.py b/hail/python/hailtop/hailctl/fs/cli.py index dba1db5acea..f27a7e57526 100644 --- a/hail/python/hailtop/hailctl/fs/cli.py +++ b/hail/python/hailtop/hailctl/fs/cli.py @@ -1,12 +1,14 @@ -from typing import Optional, List, Tuple, cast import asyncio -import click import sys +from typing import List, Optional, Tuple, cast + +import click import typer from hailtop import uvloopx -from hailtop.aiotools.plan import plan, PlanError -from hailtop.aiotools.sync import sync as aiotools_sync, SyncError +from hailtop.aiotools.plan import PlanError, plan +from hailtop.aiotools.sync import SyncError +from hailtop.aiotools.sync import sync as aiotools_sync app_without_click = typer.Typer( name='fs', diff --git a/hail/python/test/hailtop/inter_cloud/test_copy.py b/hail/python/test/hailtop/inter_cloud/test_copy.py index 8ab4e4790ed..7e8f8c22083 100644 --- a/hail/python/test/hailtop/inter_cloud/test_copy.py +++ b/hail/python/test/hailtop/inter_cloud/test_copy.py @@ -2,21 +2,21 @@ import os import secrets import tempfile - from typing import AsyncIterator, Dict, List, Tuple import pytest -from hailtop.utils import url_scheme -from hailtop.aiotools.plan import plan, PlanError -from hailtop.aiotools.sync import sync, SyncError -from hailtop.aiotools.router_fs import RouterAsyncFS + from hailtop.aiotools import ( - Transfer, - FileAndDirectoryError, - Copier, AsyncFS, + Copier, + FileAndDirectoryError, FileListEntry, + Transfer, ) +from hailtop.aiotools.plan import PlanError, plan +from hailtop.aiotools.router_fs import RouterAsyncFS +from hailtop.aiotools.sync import SyncError, sync +from hailtop.utils import url_scheme from .copy_test_specs import COPY_TEST_SPECS from .generate_copy_test_specs import create_test_dir, create_test_file, run_test_spec diff --git a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py index a0f2f80efc0..119623e9b21 100644 --- a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py +++ b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py @@ -1,6 +1,8 @@ import pytest + from hailtop.utils import check_exec_output +from .test_copy import cloud_scheme from .utils import fresh_dir From 7087fc0601db3c05df103eb6215516ca8ece180c Mon Sep 17 00:00:00 2001 From: Christopher Vittal Date: Thu, 8 Aug 2024 11:41:20 -0400 Subject: [PATCH 60/62] test fixes --- hail/python/test/hailtop/inter_cloud/conftest.py | 5 +++++ hail/python/test/hailtop/inter_cloud/test_copy.py | 5 ----- .../test/hailtop/inter_cloud/test_hailctl_fs_sync.py | 11 +++++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/hail/python/test/hailtop/inter_cloud/conftest.py b/hail/python/test/hailtop/inter_cloud/conftest.py index 841650d9b6e..65caca0053a 100644 --- a/hail/python/test/hailtop/inter_cloud/conftest.py +++ b/hail/python/test/hailtop/inter_cloud/conftest.py @@ -45,3 +45,8 @@ async def router_filesystem() -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, assert not await fs.isdir(gs_base) assert not await 
fs.isdir(s3_base) assert not await fs.isdir(azure_base) + + +@pytest.fixture(params=['gs', 's3', 'azure-https'], scope='module') +async def cloud_scheme(request): + yield request.param diff --git a/hail/python/test/hailtop/inter_cloud/test_copy.py b/hail/python/test/hailtop/inter_cloud/test_copy.py index 7e8f8c22083..49d6ac29b98 100644 --- a/hail/python/test/hailtop/inter_cloud/test_copy.py +++ b/hail/python/test/hailtop/inter_cloud/test_copy.py @@ -35,11 +35,6 @@ async def test_spec(request): return request.param -@pytest.fixture(params=['gs', 's3', 'azure-https']) -async def cloud_scheme(request): - yield request.param - - @pytest.fixture( params=[ 'file/file', diff --git a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py index 119623e9b21..55c68700b46 100644 --- a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py +++ b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py @@ -1,13 +1,16 @@ +import asyncio +from typing import Dict, Tuple + import pytest +from hailtop.aiotools import AsyncFS from hailtop.utils import check_exec_output -from .test_copy import cloud_scheme from .utils import fresh_dir @pytest.mark.asyncio -async def test_cli_file_and_dir(router_filesystem, cloud_scheme): +async def test_cli_file_and_dir(router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme): sema, fs, bases = router_filesystem test_dir = await fresh_dir(fs, bases, cloud_scheme) @@ -61,7 +64,7 @@ async def test_cli_file_and_dir(router_filesystem, cloud_scheme): @pytest.mark.asyncio -async def test_cli_subdir(router_filesystem, cloud_scheme): +async def test_cli_subdir(router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme): sema, fs, bases = router_filesystem test_dir = await fresh_dir(fs, bases, cloud_scheme) @@ -99,7 +102,7 @@ async def test_cli_subdir(router_filesystem, cloud_scheme): @pytest.mark.asyncio -async def test_cli_already_synced(router_filesystem, cloud_scheme): +async def test_cli_already_synced(router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme): sema, fs, bases = router_filesystem test_dir = await fresh_dir(fs, bases, cloud_scheme) From 5a32d23b8e2a647602d87119bf882f0932a09b0b Mon Sep 17 00:00:00 2001 From: Christopher Vittal Date: Mon, 12 Aug 2024 13:41:41 -0400 Subject: [PATCH 61/62] fix? 
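Patch 60 moves the `cloud_scheme` fixture into `conftest.py` as a module-scoped, parametrized fixture so test modules stop importing it from `test_copy.py`. Below is a minimal synchronous version of that fixture style (the real one is an async generator fixture shared by the inter-cloud tests).

```
# Module-scoped, parametrized fixture: each test module runs once per scheme.
import pytest


@pytest.fixture(params=['gs', 's3', 'azure-https'], scope='module')
def cloud_scheme(request):
    return request.param


def test_scheme_is_known(cloud_scheme):
    assert cloud_scheme in {'gs', 's3', 'azure-https'}
```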
--- hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py index 55c68700b46..b332bd8c4ee 100644 --- a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py +++ b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py @@ -9,7 +9,6 @@ from .utils import fresh_dir -@pytest.mark.asyncio async def test_cli_file_and_dir(router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme): sema, fs, bases = router_filesystem @@ -63,7 +62,6 @@ async def test_cli_file_and_dir(router_filesystem: Tuple[asyncio.Semaphore, Asyn assert await fs.read(plandir2 + 'srconly') == b'' -@pytest.mark.asyncio async def test_cli_subdir(router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme): sema, fs, bases = router_filesystem @@ -101,7 +99,6 @@ async def test_cli_subdir(router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, D assert await fs.read(plandir2 + 'matches') == f'{test_dir}dir/subdir/file1\t{test_dir}dir2/subdir/file1\n'.encode() -@pytest.mark.asyncio async def test_cli_already_synced(router_filesystem: Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]], cloud_scheme): sema, fs, bases = router_filesystem From c66533f03f33c2f32d6276481161698b5188f071 Mon Sep 17 00:00:00 2001 From: Christopher Vittal Date: Tue, 13 Aug 2024 15:19:57 -0400 Subject: [PATCH 62/62] lint --- hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py index b332bd8c4ee..1b6701aba4f 100644 --- a/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py +++ b/hail/python/test/hailtop/inter_cloud/test_hailctl_fs_sync.py @@ -1,8 +1,6 @@ import asyncio from typing import Dict, Tuple -import pytest - from hailtop.aiotools import AsyncFS from hailtop.utils import check_exec_output
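Patches 61 and 62 drop the explicit `@pytest.mark.asyncio` marks and the now-unused `pytest` import. That only works if coroutine tests are collected and run automatically; one way to get that, stated here as an assumption rather than a fact about this repository's configuration, is pytest-asyncio's auto mode.

```
# Hypothetical configuration enabling unmarked async tests (pytest-asyncio auto mode):
#
#   [tool.pytest.ini_options]
#   asyncio_mode = "auto"
#
# With auto mode, a plain coroutine test is collected without any decorator:
import asyncio


async def test_async_without_mark():
    await asyncio.sleep(0)
    assert True
```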