-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
542 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from .httpclient import HttpClient | ||
from .httppath import HttpPath | ||
|
||
# Public API of the HTTP sub-package.
__all__ = ["HttpClient", "HttpPath"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
import http.client
import mimetypes
import os
import re
import shutil
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
from datetime import datetime
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple, Union
|
||
import pytz | ||
|
||
from cloudpathlib.client import Client, register_client_class | ||
from cloudpathlib.enums import FileCacheMode | ||
|
||
from .httppath import HttpPath | ||
|
||
|
||
@register_client_class("http")
class HttpClient(Client):
    """Client that talks to an HTTP server through a ``urllib`` opener.

    Reads use GET/HEAD, writes use PUT and removals use DELETE. Directory
    listings are obtained by fetching a URL and extracting link targets from
    the returned page.
    """

    def __init__(
        self,
        file_cache_mode: Optional[Union[str, FileCacheMode]] = None,
        local_cache_dir: Optional[Union[str, os.PathLike]] = None,
        content_type_method: Optional[Callable] = mimetypes.guess_type,
        auth: Optional[urllib.request.BaseHandler] = None,
        custom_list_page_parser: Optional[Callable[[str], Iterable[str]]] = None,
    ):
        """Create the client and the ``urllib`` opener used for every request.

        Args:
            file_cache_mode: Forwarded unchanged to ``Client.__init__``.
            local_cache_dir: Forwarded unchanged to ``Client.__init__``.
            content_type_method: Forwarded unchanged to ``Client.__init__``;
                called in ``_upload_file`` as ``content_type_method(local_path)``
                and expected to return a ``(content_type, encoding)`` pair.
            auth: Optional ``urllib`` handler installed into the opener.
            custom_list_page_parser: Optional callable that receives the text
                of a listing page and returns the link targets found in it.
                When ``None``, a simple ``<a href="...">`` regex is used.
        """
        super().__init__(file_cache_mode, local_cache_dir, content_type_method)
        self.auth = auth

        # Both branches must bind `self.opener`: every request method uses it.
        if self.auth is None:
            self.opener = urllib.request.build_opener()
        else:
            self.opener = urllib.request.build_opener(self.auth)

        self.custom_list_page_parser = custom_list_page_parser

    def _get_metadata(self, cloud_path: HttpPath) -> dict:
        """Return size, last-modified time and content type from response headers.

        Returns:
            A dict with keys ``"size"`` (int, 0 when ``Content-Length`` is
            absent), ``"last_modified"`` (UTC-aware ``datetime`` or ``None``)
            and ``"content_type"`` (str or ``None``).
        """
        with self.opener.open(cloud_path.as_url()) as response:
            last_modified = response.headers.get("Last-Modified", None)

            if last_modified is not None:
                # per https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified
                last_modified = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")

                # should always be utc https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#gmt
                last_modified = last_modified.replace(tzinfo=pytz.UTC)

            return {
                "size": int(response.headers.get("Content-Length", 0)),
                "last_modified": last_modified,
                "content_type": response.headers.get("Content-Type", None),
            }

    def _download_file(self, cloud_path: HttpPath, local_path: Union[str, os.PathLike]) -> Path:
        """Stream the response body for ``cloud_path`` into ``local_path``.

        Returns:
            ``local_path`` as a ``Path``.
        """
        local_path = Path(local_path)
        with self.opener.open(cloud_path.as_url()) as response:
            with open(local_path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
        return local_path

    def _exists(self, cloud_path: HttpPath) -> bool:
        """Return ``True`` when a HEAD request for ``cloud_path`` answers 200.

        Any ``urllib.error.URLError`` (which includes every ``HTTPError``)
        is reported as ``False``.
        """
        request = urllib.request.Request(cloud_path.as_url(), method="HEAD")
        try:
            with self.opener.open(request) as response:
                return response.status == 200
        except urllib.error.URLError:
            # HTTPError is a subclass of URLError, so this single clause covers
            # both connection failures and every HTTP error status.
            # NOTE(review): the previous code tried to re-raise non-404 HTTP
            # errors, but that branch was unreachable; confirm whether
            # statuses such as 403/500 should propagate instead.
            return False

    def _move_file(self, src: HttpPath, dst: HttpPath, remove_src: bool = True) -> HttpPath:
        """Upload ``src`` to ``dst`` and, if ``remove_src``, delete ``src``.

        Returns:
            ``dst``.
        """
        # `_upload_file` wraps its first argument in `Path(...)`.
        # NOTE(review): presumably relies on HttpPath being os.PathLike - confirm.
        self._upload_file(src, dst)
        if remove_src:
            self._remove(src)
        return dst

    def _remove(self, cloud_path: HttpPath, missing_ok: bool = True) -> None:
        """Issue a DELETE request for ``cloud_path``.

        Raises:
            Exception: If the server answers with a non-error status other
                than 200, 202 or 204.
            FileNotFoundError: If the server answers with an HTTP error,
                except for a 404 when ``missing_ok`` is true.
        """
        request = urllib.request.Request(cloud_path.as_url(), method="DELETE")
        try:
            with self.opener.open(request) as response:
                # 200, 202 and 204 are all valid success answers to DELETE.
                if response.status not in (200, 202, 204):
                    raise Exception(f"Failed to delete {cloud_path}.")
        except urllib.error.HTTPError as e:
            if e.code == 404 and missing_ok:
                return
            # Chain the HTTP error so the real status code is not lost.
            raise FileNotFoundError(f"Failed to delete {cloud_path}.") from e

    def _list_dir(self, cloud_path: HttpPath, recursive: bool) -> Iterable[Tuple[HttpPath, bool]]:
        """Yield ``(path, is_dir)`` for each link on the page at ``cloud_path``.

        When ``recursive`` is true, entries flagged as directories are listed
        in turn, depth first.

        Raises:
            NotImplementedError: If fetching or parsing the listing fails.
        """
        try:
            with self.opener.open(cloud_path.as_url()) as response:
                # Parse the directory listing
                for path, is_dir in self._parse_list_dir_response(
                    response.read().decode(), base_url=str(cloud_path)
                ):
                    yield path, is_dir

                    # If it's a directory and recursive is True, list the contents of the directory
                    if recursive and is_dir:
                        yield from self._list_dir(path, recursive=True)

        # `Exception` rather than a bare `except`, so that closing the
        # generator early (GeneratorExit) or Ctrl-C is not turned into
        # NotImplementedError.
        except Exception as e:
            raise NotImplementedError(
                "Unable to parse response as a listing of files; please provide a custom parser as `custom_list_page_parser`."
            ) from e

    def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: HttpPath) -> HttpPath:
        """PUT the contents of ``local_path`` to ``cloud_path``.

        The ``Content-Type`` header comes from ``content_type_method`` when
        one is configured and it returns a type; otherwise
        ``application/octet-stream`` is sent.

        Returns:
            ``cloud_path``.

        Raises:
            Exception: If the server answers with a status other than 200 or 201.
        """
        local_path = Path(local_path)

        # Bind a default so a client without a content_type_method still works.
        content_type = None
        if self.content_type_method is not None:
            content_type, _ = self.content_type_method(local_path)

        headers = {"Content-Type": content_type or "application/octet-stream"}

        with open(local_path, "rb") as file_data:
            request = urllib.request.Request(
                cloud_path.as_url(), data=file_data.read(), method="PUT", headers=headers
            )
            with self.opener.open(request) as response:
                if response.status != 201 and response.status != 200:
                    raise Exception(f"Failed to upload {local_path} to {cloud_path}.")
        return cloud_path

    def _get_public_url(self, cloud_path: HttpPath) -> str:
        """Return the plain URL of ``cloud_path``."""
        return cloud_path.as_url()

    def _generate_presigned_url(self, cloud_path: HttpPath, expire_seconds: int = 60 * 60) -> str:
        """Always raise ``NotImplementedError``; presigning is not supported."""
        raise NotImplementedError("Presigned URLs are not supported using urllib.")

    def _parse_list_dir_response(
        self, response: str, base_url: str
    ) -> Iterable[Tuple[HttpPath, bool]]:
        """Turn the text of a listing page into ``(path, is_dir)`` pairs.

        Each link target is resolved against ``base_url``. An entry is flagged
        as a directory when the link target has no file suffix.
        """
        # Ensure base_url ends with a trailing slash so joining works
        if not base_url.endswith("/"):
            base_url += "/"

        def _simple_links(html: str) -> Iterable[str]:
            return re.findall(r'<a\s+href="([^"]+)"', html)

        parser: Callable[[str], Iterable[str]] = (
            self.custom_list_page_parser
            if self.custom_list_page_parser is not None
            else _simple_links
        )

        yield from (
            (self.CloudPath((urllib.parse.urljoin(base_url, match))), Path(match).suffix == "")
            for match in parser(response)
        )

    def request(self, url: HttpPath, method: str, **kwargs) -> http.client.HTTPResponse:
        """Send an arbitrary HTTP request to ``url`` and return the open response.

        Args:
            url: Path whose ``as_url()`` is the request target.
            method: HTTP method name, e.g. ``"GET"``.
            **kwargs: Passed through to ``urllib.request.Request``.

        Returns:
            The response object produced by the opener. It is returned still
            open so the body can be read; the caller is responsible for
            closing it (it can be used as a context manager).
        """
        request = urllib.request.Request(url.as_url(), method=method, **kwargs)
        # Not wrapped in `with`: that would close the response before the
        # caller could read its body.
        return self.opener.open(request)


HttpClient.HttpPath = HttpClient.CloudPath  # type: ignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
from pathlib import PurePosixPath | ||
from typing import Tuple, Union, Optional | ||
|
||
import os | ||
from pathlib import Path | ||
from tempfile import TemporaryDirectory | ||
from typing import TYPE_CHECKING | ||
|
||
from ..cloudpath import CloudPath, NoStatError, register_path_class | ||
|
||
|
||
if TYPE_CHECKING: | ||
from .httpclient import HttpClient | ||
|
||
|
||
@register_path_class("http")
class HttpPath(CloudPath):
    """Path-like object for a resource addressed by an ``http://`` URL.

    Whether a path is a file or a directory is decided heuristically from
    the presence of a file suffix.
    """

    cloud_prefix = "http://"
    client: "HttpClient"

    def __init__(
        self,
        cloud_path: Union[str, "HttpPath"],
        client: Optional["HttpClient"] = None,
    ) -> None:
        """Initialise the path and store the URL's path component as ``_path``."""
        super().__init__(cloud_path, client)

        # Keep the stored path absolute even when the URL has an empty or
        # relative path component.
        self._path = (
            PurePosixPath(self._url.path)
            if self._url.path.startswith("/")
            else PurePosixPath(f"/{self._url.path}")
        )

    @property
    def drive(self) -> str:
        """The URL's network location (netloc)."""
        # `.anchor` gives scheme + netloc.
        return self._url.netloc

    @property
    def anchor(self) -> str:
        """Scheme and netloc with a trailing slash, e.g. ``http://host/``."""
        return f"{self._url.scheme}://{self._url.netloc}/"

    @property
    def _no_prefix_no_drive(self) -> str:
        """The string form of the path with the anchor removed, keeping the leading slash."""
        # netloc appears in anchor and drive for httppath; so don't double count
        return self._str[len(self.anchor) - 1 :]

    def is_dir(self) -> bool:
        """Return ``True`` if the path exists and has no file suffix."""
        if not self.exists():
            return False

        # HTTP doesn't really have directories, but some servers might list files if treated as such
        # Here we'll assume paths without a suffix are dirs
        return self._path.suffix == ""

    def is_file(self) -> bool:
        """Return ``True`` if the path exists and has a file suffix."""
        if not self.exists():
            return False

        # HTTP doesn't have a direct file check, but we assume if it has a suffix, it's a file
        return self._path.suffix != ""

    def mkdir(self, parents: bool = False, exist_ok: bool = False) -> None:
        """Do nothing; both arguments are ignored."""
        pass  # no-op for HTTP Paths

    def touch(self, exist_ok: bool = True) -> None:
        """Create an empty remote file if the path does not exist yet.

        Raises:
            FileExistsError: If the path exists and ``exist_ok`` is false.
            NotImplementedError: If the path exists and ``exist_ok`` is true,
                because the modified time cannot be updated.
        """
        if self.exists():
            if not exist_ok:
                raise FileExistsError(f"File already exists: {self}")

            raise NotImplementedError(
                "Touch not implemented for existing HTTP files since we can't update the modified time."
            )

        # The context manager keeps the directory alive for the upload and
        # removes it afterwards.
        with TemporaryDirectory() as tmp_dir:
            empty_file = Path(tmp_dir) / "empty_file.txt"
            empty_file.write_text("")
            self.client._upload_file(empty_file, self)

    def stat(self, follow_symlinks: bool = True) -> os.stat_result:
        """Build an ``os.stat_result`` from the client's metadata for this path.

        Only size and mtime are populated (plus ``cloud_prefix`` in the dev
        slot); mtime is 0 when the metadata has no last-modified time.
        ``follow_symlinks`` is not used.

        Raises:
            NoStatError: If the metadata cannot be retrieved.
        """
        try:
            meta = self.client._get_metadata(self)
        except Exception as e:
            raise NoStatError(f"Could not get metadata for {self}") from e

        # `last_modified` may be missing or None; guard before `.timestamp()`.
        last_modified = meta.get("last_modified")
        mtime = last_modified.timestamp() if last_modified is not None else 0

        return os.stat_result(
            (  # type: ignore
                None,  # mode
                None,  # ino
                self.cloud_prefix,  # dev,
                None,  # nlink,
                None,  # uid,
                None,  # gid,
                meta.get("size", 0),  # size,
                None,  # atime,
                mtime,  # mtime,
                None,  # ctime,
            )
        )

    def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60) -> str:
        """Return the URL this path was created from.

        Raises:
            NotImplementedError: If ``presign`` is true.
        """
        if presign:
            raise NotImplementedError("Presigning not supported for HTTP paths")

        return (
            self._url.geturl()
        )  # recreate from what was initialized so we have the same query params, etc.

    @property
    def name(self) -> str:
        """Final component of the URL's path."""
        return self._path.name

    @property
    def parents(self) -> Tuple["HttpPath", ...]:
        """The base class's parents followed by ``self._new_cloudpath("")``."""
        return super().parents + (self._new_cloudpath(""),)

    def get(self, **kwargs):
        """Send a GET request for this path via ``client.request``."""
        return self.client.request(self, "GET", **kwargs)

    def put(self, **kwargs):
        """Send a PUT request for this path via ``client.request``."""
        return self.client.request(self, "PUT", **kwargs)

    def post(self, **kwargs):
        """Send a POST request for this path via ``client.request``."""
        return self.client.request(self, "POST", **kwargs)

    def delete(self, **kwargs):
        """Send a DELETE request for this path via ``client.request``."""
        return self.client.request(self, "DELETE", **kwargs)

    def head(self, **kwargs):
        """Send a HEAD request for this path via ``client.request``."""
        return self.client.request(self, "HEAD", **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,4 +33,5 @@ tabulate | |
tenacity | ||
tqdm | ||
typer | ||
types-pytz | ||
wheel |
Oops, something went wrong.