initial implementation of http

drivendataorg · Sep 1, 2024 · 1e54b21 · 1e54b21
1 parent b776bee
commit 1e54b21
Show file tree

Hide file tree

Showing 13 changed files with 542 additions and 53 deletions.
diff --git a/cloudpathlib/__init__.py b/cloudpathlib/__init__.py
@@ -4,9 +4,11 @@
 from .azure.azblobclient import AzureBlobClient
 from .azure.azblobpath import AzureBlobPath
 from .cloudpath import CloudPath, implementation_registry
-from .s3.s3client import S3Client
-from .gs.gspath import GSPath
 from .gs.gsclient import GSClient
+from .gs.gspath import GSPath
+from .http.httpclient import HttpClient
+from .http.httppath import HttpPath
+from .s3.s3client import S3Client
 from .s3.s3path import S3Path
 
 
@@ -27,6 +29,8 @@
     "implementation_registry",
     "GSClient",
     "GSPath",
+    "HttpClient",
+    "HttpPath",
     "S3Client",
     "S3Path",
 ]
diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py
@@ -27,7 +27,6 @@
     Generator,
     List,
     Optional,
-    Sequence,
     Tuple,
     Type,
     TYPE_CHECKING,
@@ -286,11 +285,11 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
 
     @property
     def _no_prefix(self) -> str:
-        return self._str[len(self.cloud_prefix) :]
+        return self._str[len(self.anchor) :]
 
     @property
     def _no_prefix_no_drive(self) -> str:
-        return self._str[len(self.cloud_prefix) + len(self.drive) :]
+        return self._str[len(self.anchor) + len(self.drive) :]
 
     @overload
     @classmethod
@@ -881,9 +880,9 @@ def relative_to(self, other: Self, walk_up: bool = False) -> PurePosixPath:
         # absolute)
         if not isinstance(other, CloudPath):
             raise ValueError(f"{self} is a cloud path, but {other} is not")
-        if self.cloud_prefix != other.cloud_prefix:
+        if self.anchor != other.anchor:
             raise ValueError(
-                f"{self} is a {self.cloud_prefix} path, but {other} is a {other.cloud_prefix} path"
+                f"{self} is a {self.anchor} path, but {other} is a {other.anchor} path"
             )
 
         kwargs = dict(walk_up=walk_up)
@@ -921,7 +920,7 @@ def parent(self) -> Self:
         return self._dispatch_to_path("parent")
 
     @property
-    def parents(self) -> Sequence[Self]:
+    def parents(self) -> Tuple[Self, ...]:
         return self._dispatch_to_path("parents")
 
     @property
@@ -1210,8 +1209,8 @@ def _new_cloudpath(self, path: Union[str, os.PathLike]) -> Self:
             path = path[1:]
 
         # add prefix/anchor if it is not already
-        if not path.startswith(self.cloud_prefix):
-            path = f"{self.cloud_prefix}{path}"
+        if not path.startswith(self.anchor):
+            path = f"{self.anchor}{path}"
 
         return self.client.CloudPath(path)
 

diff --git a/cloudpathlib/http/__init__.py b/cloudpathlib/http/__init__.py
@@ -0,0 +1,7 @@
+from .httpclient import HttpClient
+from .httppath import HttpPath
+
+__all__ = [
+    "HttpClient",
+    "HttpPath",
+]
diff --git a/cloudpathlib/http/httpclient.py b/cloudpathlib/http/httpclient.py
@@ -0,0 +1,160 @@
+from datetime import datetime
+import os
+import re
+import urllib.request
+import urllib.parse
+import urllib.error
+from pathlib import Path
+from typing import Iterable, Optional, Tuple, Union, Callable
+import shutil
+import mimetypes
+import urllib.response
+
+import pytz
+
+from cloudpathlib.client import Client, register_client_class
+from cloudpathlib.enums import FileCacheMode
+
+from .httppath import HttpPath
+
+
+@register_client_class("http")
+class HttpClient(Client):
+    """Client that talks to an HTTP server through a ``urllib`` opener.
+
+    Reads are issued as GET/HEAD requests, uploads as PUT and removals as DELETE.
+    Directory listings are produced by extracting links from the page served for a URL.
+    """
+
+    def __init__(
+        self,
+        file_cache_mode: Optional[Union[str, FileCacheMode]] = None,
+        local_cache_dir: Optional[Union[str, os.PathLike]] = None,
+        content_type_method: Optional[Callable] = mimetypes.guess_type,
+        auth: Optional[urllib.request.BaseHandler] = None,
+        custom_list_page_parser: Optional[Callable[[str], Iterable[str]]] = None,
+    ):
+        """Build the opener used for every request made by this client.
+
+        Args:
+            file_cache_mode: Passed through to ``Client.__init__``.
+            local_cache_dir: Passed through to ``Client.__init__``.
+            content_type_method: Passed through to ``Client.__init__``; when uploading it is
+                called with the local path and must return a ``(content_type, encoding)`` pair.
+            auth: Optional ``urllib`` handler added to the opener (e.g., for authentication).
+            custom_list_page_parser: Optional callable that receives the text of a listing
+                page and returns the links on it. Defaults to a simple ``<a href="...">`` regex.
+        """
+        super().__init__(file_cache_mode, local_cache_dir, content_type_method)
+        self.auth = auth
+
+        # Both branches must bind `self.opener`; every request method below relies on it.
+        if self.auth is None:
+            self.opener = urllib.request.build_opener()
+        else:
+            self.opener = urllib.request.build_opener(self.auth)
+
+        self.custom_list_page_parser = custom_list_page_parser
+
+    def _get_metadata(self, cloud_path: HttpPath) -> dict:
+        """Return size, modification time and content type read from the response headers.
+
+        Returns:
+            A dict with ``size`` (``Content-Length`` as int, 0 if absent), ``last_modified``
+            (timezone-aware UTC datetime, or None if the header is absent) and
+            ``content_type`` (the ``Content-Type`` header, or None).
+        """
+        with self.opener.open(cloud_path.as_url()) as response:
+            last_modified = response.headers.get("Last-Modified", None)
+
+            if last_modified is not None:
+                # per https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified
+                last_modified = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
+
+                # should always be utc https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#gmt
+                last_modified = last_modified.replace(tzinfo=pytz.UTC)
+
+            return {
+                "size": int(response.headers.get("Content-Length", 0)),
+                "last_modified": last_modified,
+                "content_type": response.headers.get("Content-Type", None),
+            }
+
+    def _download_file(self, cloud_path: HttpPath, local_path: Union[str, os.PathLike]) -> Path:
+        """Stream the response body for ``cloud_path`` into ``local_path`` and return that path."""
+        local_path = Path(local_path)
+        with self.opener.open(cloud_path.as_url()) as response:
+            with open(local_path, "wb") as out_file:
+                shutil.copyfileobj(response, out_file)
+        return local_path
+
+    def _exists(self, cloud_path: HttpPath) -> bool:
+        """Return True if a HEAD request for ``cloud_path`` answers with status 200.
+
+        Returns False for a 404 response or when no HTTP response is obtained at all.
+
+        Raises:
+            urllib.error.HTTPError: For HTTP error responses other than 404.
+        """
+        request = urllib.request.Request(cloud_path.as_url(), method="HEAD")
+        try:
+            with self.opener.open(request) as response:
+                return response.status == 200
+        except urllib.error.HTTPError as e:
+            # HTTPError is a subclass of URLError, so it has to be handled first; a combined
+            # isinstance(e, URLError) check would match every error and never re-raise.
+            if e.code == 404:
+                return False
+            raise
+        except urllib.error.URLError:
+            # No HTTP response was received (e.g., the host could not be reached).
+            return False
+
+    def _move_file(self, src: HttpPath, dst: HttpPath, remove_src: bool = True) -> HttpPath:
+        """Copy ``src`` to ``dst`` and, if ``remove_src`` is True, delete ``src`` afterwards.
+
+        There is no server-side copy here, so the content is downloaded to a temporary
+        local file and uploaded again with PUT.
+        """
+        import tempfile  # function-scope import; only this method needs it
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Keep the source file name so the content type guessed on upload matches.
+            local_copy = self._download_file(src, Path(tmp_dir) / (src.name or "download"))
+            self._upload_file(local_copy, dst)
+
+        if remove_src:
+            self._remove(src)
+        return dst
+
+    def _remove(self, cloud_path: HttpPath, missing_ok: bool = True) -> None:
+        """Delete ``cloud_path`` with an HTTP DELETE request.
+
+        Raises:
+            FileNotFoundError: If the server answers with an HTTP error, unless that error
+                is a 404 and ``missing_ok`` is True.
+            Exception: If the server answers with an unexpected non-error status.
+        """
+        request = urllib.request.Request(cloud_path.as_url(), method="DELETE")
+        try:
+            with self.opener.open(request) as response:
+                # 200, 202 and 204 are the success statuses defined for DELETE.
+                if response.status not in (200, 202, 204):
+                    raise Exception(f"Failed to delete {cloud_path}.")
+        except urllib.error.HTTPError as e:
+            if e.code == 404 and missing_ok:
+                return
+            # Exception type kept for existing callers; the HTTP error is chained as the cause.
+            raise FileNotFoundError(f"Failed to delete {cloud_path}.") from e
+
+    def _list_dir(self, cloud_path: HttpPath, recursive: bool) -> Iterable[Tuple[HttpPath, bool]]:
+        """Yield ``(path, is_dir)`` pairs for the links found on the page at ``cloud_path``.
+
+        When ``recursive`` is True, entries flagged as directories are listed in turn.
+
+        Raises:
+            NotImplementedError: If fetching or parsing the listing page fails.
+        """
+        try:
+            with self.opener.open(cloud_path.as_url()) as response:
+                # Parse the directory listing
+                for path, is_dir in self._parse_list_dir_response(
+                    response.read().decode(), base_url=str(cloud_path)
+                ):
+                    yield path, is_dir
+
+                    # If it's a directory and recursive is True, list the contents of the directory
+                    if recursive and is_dir:
+                        yield from self._list_dir(path, recursive=True)
+
+        # `Exception` rather than a bare except: this is a generator, and a bare except would
+        # also intercept GeneratorExit (consumer stopped early) and KeyboardInterrupt.
+        except Exception as e:
+            raise NotImplementedError(
+                "Unable to parse response as a listing of files; please provide a custom parser as `custom_list_page_parser`."
+            ) from e
+
+    def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: HttpPath) -> HttpPath:
+        """PUT the content of ``local_path`` to ``cloud_path`` and return ``cloud_path``.
+
+        The ``Content-Type`` header comes from ``self.content_type_method`` when it is set and
+        returns a value; otherwise ``application/octet-stream`` is sent.
+
+        Raises:
+            Exception: If the server answers with a status other than 200 or 201.
+        """
+        local_path = Path(local_path)
+
+        # Must be bound even when no content_type_method is configured.
+        content_type = None
+        if self.content_type_method is not None:
+            content_type, _ = self.content_type_method(local_path)
+
+        headers = {"Content-Type": content_type or "application/octet-stream"}
+
+        with open(local_path, "rb") as file_data:
+            request = urllib.request.Request(
+                cloud_path.as_url(), data=file_data.read(), method="PUT", headers=headers
+            )
+            with self.opener.open(request) as response:
+                if response.status not in (200, 201):
+                    raise Exception(f"Failed to upload {local_path} to {cloud_path}.")
+        return cloud_path
+
+    def _get_public_url(self, cloud_path: HttpPath) -> str:
+        """Return the URL of ``cloud_path``."""
+        return cloud_path.as_url()
+
+    def _generate_presigned_url(self, cloud_path: HttpPath, expire_seconds: int = 60 * 60) -> str:
+        """Always raise NotImplementedError; presigned URLs are not supported by this client."""
+        raise NotImplementedError("Presigned URLs are not supported using urllib.")
+
+    def _parse_list_dir_response(
+        self, response: str, base_url: str
+    ) -> Iterable[Tuple[HttpPath, bool]]:
+        """Turn the text of a listing page into ``(path, is_dir)`` pairs.
+
+        Links come from ``self.custom_list_page_parser`` when set, otherwise from a regex
+        matching ``<a href="...">``. Each link is resolved against ``base_url``; a link with
+        no file suffix is reported as a directory.
+        """
+        # Ensure base_url ends with a trailing slash so joining works
+        if not base_url.endswith("/"):
+            base_url += "/"
+
+        def _simple_links(html: str) -> Iterable[str]:
+            return re.findall(r'<a\s+href="([^"]+)"', html)
+
+        parser: Callable[[str], Iterable[str]] = (
+            self.custom_list_page_parser
+            if self.custom_list_page_parser is not None
+            else _simple_links
+        )
+
+        yield from (
+            (self.CloudPath((urllib.parse.urljoin(base_url, match))), Path(match).suffix == "")
+            for match in parser(response)
+        )
+
+    def request(self, url: HttpPath, method: str, **kwargs):
+        """Send an arbitrary HTTP request to ``url`` and return the response object.
+
+        Extra keyword arguments are forwarded to ``urllib.request.Request``. The response is
+        returned still open so its body can be read; the caller should close it, for
+        example by using it as a context manager.
+        """
+        request = urllib.request.Request(url.as_url(), method=method, **kwargs)
+        # Not wrapped in `with`: that would close the response before the caller can read it.
+        return self.opener.open(request)
+
+
+# Alias the client's path class under a client-specific name. `CloudPath` is not assigned
+# anywhere in this file -- presumably `register_client_class` attaches it; TODO confirm.
+HttpClient.HttpPath = HttpClient.CloudPath  # type: ignore
diff --git a/cloudpathlib/http/httppath.py b/cloudpathlib/http/httppath.py
@@ -0,0 +1,130 @@
+from pathlib import PurePosixPath
+from typing import Tuple, Union, Optional
+
+import os
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import TYPE_CHECKING
+
+from ..cloudpath import CloudPath, NoStatError, register_path_class
+
+
+if TYPE_CHECKING:
+    from .httpclient import HttpClient
+
+
+@register_path_class("http")
+class HttpPath(CloudPath):
+    """Path-like object for a resource addressed by an ``http://`` URL."""
+
+    cloud_prefix = "http://"
+    client: "HttpClient"
+
+    def __init__(
+        self,
+        cloud_path: Union[str, "HttpPath"],
+        client: Optional["HttpClient"] = None,
+    ) -> None:
+        """Initialize the path and derive an absolute POSIX path from the URL's path part.
+
+        NOTE(review): ``self._url`` is not set in this class; presumably
+        ``CloudPath.__init__`` populates it with the parsed URL -- confirm.
+        """
+        super().__init__(cloud_path, client)
+
+        # Always keep a leading slash so the path component is absolute.
+        self._path = (
+            PurePosixPath(self._url.path)
+            if self._url.path.startswith("/")
+            else PurePosixPath(f"/{self._url.path}")
+        )
+
+    @property
+    def drive(self) -> str:
+        """The network location (host and optional port) of the URL."""
+        # HTTP has no real drive; the netloc is reported here and is also part of .anchor
+        return self._url.netloc
+
+    @property
+    def anchor(self) -> str:
+        """Scheme and network location with a trailing slash, e.g. ``http://host/``."""
+        return f"{self._url.scheme}://{self._url.netloc}/"
+
+    @property
+    def _no_prefix_no_drive(self) -> str:
+        """The string form of the path with scheme and netloc removed, keeping the leading slash."""
+        # netloc appears in anchor and drive for httppath; so don't double count
+        return self._str[len(self.anchor) - 1 :]
+
+    def is_dir(self) -> bool:
+        """Return True if the path exists and has no file suffix."""
+        if not self.exists():
+            return False
+
+        # HTTP doesn't really have directories, but some servers might list files if treated as such
+        # Here we'll assume paths without a suffix are dirs
+        return self._path.suffix == ""
+
+    def is_file(self) -> bool:
+        """Return True if the path exists and has a file suffix."""
+        if not self.exists():
+            return False
+
+        # HTTP doesn't have a direct file check, but we assume if it has a suffix, it's a file
+        return self._path.suffix != ""
+
+    def mkdir(self, parents: bool = False, exist_ok: bool = False) -> None:
+        """Do nothing; directories are not created for HTTP paths."""
+        pass  # no-op for HTTP Paths
+
+    def touch(self, exist_ok: bool = True) -> None:
+        """Create an empty resource at this path by uploading an empty file.
+
+        Raises:
+            FileExistsError: If the path exists and ``exist_ok`` is False.
+            NotImplementedError: If the path exists and ``exist_ok`` is True, because the
+                modified time of an existing resource cannot be updated.
+        """
+        if self.exists():
+            if not exist_ok:
+                raise FileExistsError(f"File already exists: {self}")
+
+            raise NotImplementedError(
+                "Touch not implemented for existing HTTP files since we can't update the modified time."
+            )
+        else:
+            # The context manager keeps the directory alive for the upload and removes it
+            # afterwards, so no temporary files are left behind.
+            with TemporaryDirectory() as tmp_dir:
+                empty_file = Path(tmp_dir) / "empty_file.txt"
+                empty_file.write_text("")
+                self.client._upload_file(empty_file, self)
+
+    def stat(self, follow_symlinks: bool = True) -> os.stat_result:
+        """Return an ``os.stat_result`` populated from the client's metadata for this path.
+
+        Only ``st_dev`` (the cloud prefix), ``st_size`` and ``st_mtime`` are filled in; the
+        other fields are None. ``st_mtime`` is 0 when no modification time is available.
+
+        Raises:
+            NoStatError: If the metadata cannot be retrieved.
+        """
+        try:
+            meta = self.client._get_metadata(self)
+        except Exception as e:
+            raise NoStatError(f"Could not get metadata for {self}") from e
+
+        # The value may be missing or None (no Last-Modified header); fall back to 0 then.
+        last_modified = meta.get("last_modified")
+        mtime = last_modified.timestamp() if last_modified is not None else 0
+
+        return os.stat_result(
+            (  # type: ignore
+                None,  # mode
+                None,  # ino
+                self.cloud_prefix,  # dev,
+                None,  # nlink,
+                None,  # uid,
+                None,  # gid,
+                meta.get("size", 0),  # size,
+                None,  # atime,
+                mtime,  # mtime,
+                None,  # ctime,
+            )
+        )
+
+    def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60) -> str:
+        """Return the full URL of this path.
+
+        Raises:
+            NotImplementedError: If ``presign`` is True.
+        """
+        if presign:
+            raise NotImplementedError("Presigning not supported for HTTP paths")
+
+        return (
+            self._url.geturl()
+        )  # recreate from what was initialized so we have the same query params, etc.
+
+    @property
+    def name(self) -> str:
+        """The final component of the URL's path."""
+        return self._path.name
+
+    @property
+    def parents(self) -> Tuple["HttpPath", ...]:
+        """The parents from ``CloudPath.parents`` followed by the path built from an empty string."""
+        return super().parents + (self._new_cloudpath(""),)
+
+    def get(self, **kwargs):
+        """Send a GET request for this path via ``client.request``; kwargs are forwarded."""
+        return self.client.request(self, "GET", **kwargs)
+
+    def put(self, **kwargs):
+        """Send a PUT request for this path via ``client.request``; kwargs are forwarded."""
+        return self.client.request(self, "PUT", **kwargs)
+
+    def post(self, **kwargs):
+        """Send a POST request for this path via ``client.request``; kwargs are forwarded."""
+        return self.client.request(self, "POST", **kwargs)
+
+    def delete(self, **kwargs):
+        """Send a DELETE request for this path via ``client.request``; kwargs are forwarded."""
+        return self.client.request(self, "DELETE", **kwargs)
+
+    def head(self, **kwargs):
+        """Send a HEAD request for this path via ``client.request``; kwargs are forwarded."""
+        return self.client.request(self, "HEAD", **kwargs)
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -33,4 +33,5 @@ tabulate
 tenacity
 tqdm
 typer
+types-pytz
 wheel