From e6c88587255a7c742ed8310a741888cc6a34caa1 Mon Sep 17 00:00:00 2001 From: atripathy86 Date: Tue, 18 Jun 2024 11:53:39 -0500 Subject: [PATCH] OSDF Origin Push/Pull Support added (#182) * Enable Artifact Pushing/Pull from OSDF Origin - This functions with dvc[http] plugin for push except that it passes a bearer token in the header before any push/pull to a OSDF HTTP Remote. - Introduces new command `cmf init osdfremote` with new cmflib/commands/init/osdfremote.py - Handles artifact push when `core.remote==osdf` through dvc_push except that it updates `core.remote.password` in .dvc/config before doing so - Required OSDF Token is generated in function new generate_osdf_token() in utils/helper_functions.py. This uses scitokens and cryptography packages. This reads supplied private key, gets token from issuer and formats it as "Bearer + token". - Handles artifact pull through a new OSDFRemoteArtifacts Class in `storage_backends/osdf_artifacts.py`. This works with requests package - `commands/artifacts/pull.py` has new handling for `core.remote==osdf` which updates `core.remote.password` in .dvc/config and passes the newly generated dynamic password to download_artifacts to use. - Also updated dvc version to 3.50.2 which is needed for dvc[ttp] to function. Also bumped ml-metadata to 1.15.0 which is needed for compatibility. External packages scitokens and cryptography also added as dependencies. * Added init doc for OSDF Remote in CMF-client --- cmflib/cli/utils.py | 3 - cmflib/cmf_commands_wrapper.py | 31 ++++ cmflib/commands/artifact/pull.py | 61 ++++++- cmflib/commands/artifact/push.py | 17 +- cmflib/commands/init/__init__.py | 4 +- cmflib/commands/init/osdfremote.py | 192 ++++++++++++++++++++++ cmflib/storage_backends/osdf_artifacts.py | 68 ++++++++ cmflib/utils/helper_functions.py | 52 ++++++ docs/cmf_client/cmf_client.md | 34 +++- pyproject.toml | 2 + setup.py | 2 +- 11 files changed, 457 insertions(+), 9 deletions(-) create mode 100644 cmflib/commands/init/osdfremote.py create mode 100644 cmflib/storage_backends/osdf_artifacts.py diff --git a/cmflib/cli/utils.py b/cmflib/cli/utils.py index 0b24fb8e..e8dc861b 100644 --- a/cmflib/cli/utils.py +++ b/cmflib/cli/utils.py @@ -18,7 +18,6 @@ import os import sys - def fix_subparsers(subparsers): subparsers.required = True subparsers.dest = "cmd" @@ -74,5 +73,3 @@ def check_minio_server(dvc_config_op): return exception except S3Error as exception: return exception - - diff --git a/cmflib/cmf_commands_wrapper.py b/cmflib/cmf_commands_wrapper.py index 8d558857..870dfb45 100644 --- a/cmflib/cmf_commands_wrapper.py +++ b/cmflib/cmf_commands_wrapper.py @@ -229,3 +229,34 @@ def _init_sshremote(path,user,port,password,git_remote_url,cmf_server_url,neo4j_ msg = cmd.do_run() print(msg) return msg + +def _init_osdfremote(path,key_id,key_path,key_issuer,git_remote_url,cmf_server_url,neo4j_user,neo4j_password,neo4j_uri): + cli_args = cli.parse_args( + [ + "init", + "osdf", + "--path", + path, + "--key-id", + key_id, + "--key-path", + key_path, + "--key-issuer", + key_issuer, + "--git-remote-url", + git_remote_url, + "--cmf-server-url", + cmf_server_url, + "--neo4j-user", + neo4j_user, + "--neo4j-password", + neo4j_password, + "--neo4j-uri", + neo4j_uri + ] + ) + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + return msg + \ No newline at end of file diff --git a/cmflib/commands/artifact/pull.py b/cmflib/commands/artifact/pull.py index dda0da8c..a58898a1 100644 --- a/cmflib/commands/artifact/pull.py +++ b/cmflib/commands/artifact/pull.py @@ -24,6 +24,7 @@ local_artifacts, amazonS3_artifacts, sshremote_artifacts, + osdf_artifacts, ) from cmflib.cli.command import CmdBase from cmflib.utils.dvc_config import DvcConfig @@ -120,7 +121,12 @@ def extract_repo_args(self, type: str, name: str, url: str, current_directory: s current_loc_1 = "/".join(token) current_loc = f"/{current_loc_1}" return host, current_loc, name - + elif type == "osdf": + token_length = len(token) + download_loc = current_directory + "/" + name if current_directory != "" else name + #current_dvc_loc = (token[(token_length - 2)] + "/" + token[(token_length - 1)]) + #return FQDNL of where to download from, where to download to, what the artifact will be named + return s_url, download_loc, name else: # sometimes s_url is empty - this shouldn't happen technically # sometimes s_url is not starting with s3:// - technically this shouldn't happen @@ -294,6 +300,59 @@ def run(self): ) print(stmt) return "Done" + elif dvc_config_op["core.remote"] == "osdf": + #Regenerate Token for OSDF + from cmflib.utils.helper_functions import generate_osdf_token + from cmflib.utils.helper_functions import is_url + from cmflib.dvc_wrapper import dvc_add_attribute + from cmflib.utils.cmf_config import CmfConfig + #Fetch Config from CMF_Config_File + cmf_config_file = os.environ.get("CONFIG_FILE", ".cmfconfig") + cmf_config={} + cmf_config=CmfConfig.read_config(cmf_config_file) + #Regenerate password + dynamic_password = generate_osdf_token(cmf_config["osdf-key_id"],cmf_config["osdf-key_path"],cmf_config["osdf-key_issuer"]) + #cmf_config["password"]=dynamic_password + #Update Password in .dvc/config for future use + dvc_add_attribute(dvc_config_op["core.remote"],"password",dynamic_password) + #Updating dvc_config_op data structure with new password as well since this is used in download_artifacts() below + dvc_config_op["remote.osdf.password"]=dynamic_password + #Need to write to cmfconfig with new credentials + #CmfConfig.write_config(cmf_config, "osdf", attr_dict, True) + #Now Ready to do dvc pull + + osdfremote_class_obj = osdf_artifacts.OSDFremoteArtifacts() + if self.args.artifact_name: + output = self.search_artifact(name_url_dict) + # output[0] = name + # output[1] = url + if output is None: + print(f"{self.args.artifact_name} doesn't exist.") + else: + args = self.extract_repo_args("osdf", output[0], output[1], current_directory) + stmt = osdfremote_class_obj.download_artifacts( + dvc_config_op, + args[0], # s_url of the artifact + current_directory, + args[1], # download_loc of the artifact + args[2] # name of the artifact + ) + print(stmt) + else: + for name, url in name_url_dict.items(): + #print(name, url) + if not isinstance(url, str): + continue + args = self.extract_repo_args("osdf", name, url, current_directory) + stmt = osdfremote_class_obj.download_artifacts( + dvc_config_op, + args[0], # host, + current_directory, + args[1], # remote_loc of the artifact + args[2] # name + ) + print(stmt) + return "Done" elif dvc_config_op["core.remote"] == "amazons3": amazonS3_class_obj = amazonS3_artifacts.AmazonS3Artifacts() #print(self.args.artifact_name,"artifact name") diff --git a/cmflib/commands/artifact/push.py b/cmflib/commands/artifact/push.py index 40a57c41..d1884cb0 100644 --- a/cmflib/commands/artifact/push.py +++ b/cmflib/commands/artifact/push.py @@ -21,17 +21,32 @@ from cmflib.cli.command import CmdBase from cmflib.cli.utils import check_minio_server +from cmflib.utils.helper_functions import generate_osdf_token +from cmflib.utils.helper_functions import is_url from cmflib.utils.dvc_config import DvcConfig from cmflib.dvc_wrapper import dvc_push - +from cmflib.dvc_wrapper import dvc_add_attribute +from cmflib.utils.cmf_config import CmfConfig class CmdArtifactPush(CmdBase): def run(self): result = "" dvc_config_op = DvcConfig.get_dvc_config() + cmf_config_file = os.environ.get("CONFIG_FILE", ".cmfconfig") + cmf_config={} + cmf_config=CmfConfig.read_config(cmf_config_file) out_msg = check_minio_server(dvc_config_op) if dvc_config_op["core.remote"] == "minio" and out_msg != "SUCCESS": return out_msg + elif dvc_config_op["core.remote"] == "osdf": + #print("key_id="+cmf_config["osdf-key_id"]) + dynamic_password = generate_osdf_token(cmf_config["osdf-key_id"],cmf_config["osdf-key_path"],cmf_config["osdf-key_issuer"]) + #print("Dynamic Password"+dynamic_password) + dvc_add_attribute(dvc_config_op["core.remote"],"password",dynamic_password) + #The Push URL will be something like: https:///files/md5/[First Two of MD5 Hash] + result = dvc_push() + #print(result) + return result else: result = dvc_push() return result diff --git a/cmflib/commands/init/__init__.py b/cmflib/commands/init/__init__.py index 5095dd30..437ecb33 100644 --- a/cmflib/commands/init/__init__.py +++ b/cmflib/commands/init/__init__.py @@ -16,10 +16,10 @@ import argparse -from cmflib.commands.init import minioS3, amazonS3, local, sshremote, show +from cmflib.commands.init import minioS3, amazonS3, local, sshremote, osdfremote, show from cmflib.cli.utils import * -SUB_COMMANDS = [minioS3, amazonS3, local, sshremote, show] +SUB_COMMANDS = [minioS3, amazonS3, local, sshremote, osdfremote, show] # This parser adds positional arguments to the main parser def add_parser(subparsers, parent_parser): diff --git a/cmflib/commands/init/osdfremote.py b/cmflib/commands/init/osdfremote.py new file mode 100644 index 00000000..d63f4ff8 --- /dev/null +++ b/cmflib/commands/init/osdfremote.py @@ -0,0 +1,192 @@ +### +# Copyright (2024) Hewlett Packard Enterprise Development LP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### + + +#!/usr/bin/env python3 +import argparse +import os +import subprocess +import sys + +from cmflib.cli.command import CmdBase +from cmflib.dvc_wrapper import ( + git_quiet_init, + git_checkout_new_branch, + git_initial_commit, + git_add_remote, + check_git_repo, + dvc_quiet_init, + dvc_add_remote_repo, + dvc_add_attribute, +) +from cmflib.utils.cmf_config import CmfConfig +from cmflib.utils.helper_functions import is_git_repo +from cmflib.utils.helper_functions import generate_osdf_token +from cmflib.utils.helper_functions import is_url + +class CmdInitOSDFRemote(CmdBase): + def run(self): + # Reading CONFIG_FILE variable + cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") + # checking if config file exists + if not os.path.exists(cmf_config): + # writing default value to config file + attr_dict = {} + attr_dict["server-ip"] = "http://127.0.0.1:80" + CmfConfig.write_config(cmf_config, "cmf", attr_dict) + + # if user gave --cmf-server-url, override the config file + if self.args.cmf_server_url: + attr_dict = {} + attr_dict["server-ip"] = self.args.cmf_server_url + CmfConfig.write_config(cmf_config, "cmf", attr_dict, True) + + # read --neo4j details and add to the exsting file + if self.args.neo4j_user and self.args.neo4j_password and self.args.neo4j_uri: + attr_dict = {} + attr_dict["user"] = self.args.neo4j_user + attr_dict["password"] = self.args.neo4j_password + attr_dict["uri"] = self.args.neo4j_uri + CmfConfig.write_config(cmf_config, "neo4j", attr_dict, True) + elif ( + not self.args.neo4j_user + and not self.args.neo4j_password + and not self.args.neo4j_uri + ): + pass + else: + return "ERROR: Provide user, password and uri for neo4j initialization." + output = is_git_repo() + if not output: + branch_name = "master" + print("Starting git init.") + git_quiet_init() + git_checkout_new_branch(branch_name) + git_initial_commit() + git_add_remote(self.args.git_remote_url) + print("git init complete.") + + print("Starting cmf init.") + repo_type = "osdf" + dvc_quiet_init() + output = dvc_add_remote_repo(repo_type, self.args.path) + if not output: + return "cmf init failed." + print(output) + #dvc_add_attribute(repo_type, "key_id", self.args.key_id) + #dvc_add_attribute(repo_type, "key_path", self.args.key_path) + #dvc_add_attribute(repo_type, "key_issuer", self.args.key_issuer) + #Writing to an OSDF Remote is based on SSH Remote. With few additions + #In addition to URL (including FQDN, port, path), we need to provide + #method=PUT, ssl_verify=false, ask_password=false, auth=custom, custom_auth-header='Authorization' + #password='Bearer + dynamically generated scitoken' (This token has a timeout of 15 mins so must be generated right before any push/pull) + dvc_add_attribute(repo_type,"method", "PUT") + dvc_add_attribute(repo_type,"ssl_verify", "false") + dvc_add_attribute(repo_type,"ask_password", "false") + dvc_add_attribute(repo_type,"auth", "custom") + dvc_add_attribute(repo_type,"custom_auth_header", "Authorization") + dynamic_password = generate_osdf_token(self.args.key_id,self.args.key_path,self.args.key_issuer) + dvc_add_attribute(repo_type,"password",dynamic_password) + + attr_dict = {} + attr_dict["path"] = self.args.path + attr_dict["key_id"] = self.args.key_id + attr_dict["key_path"] = self.args.key_path + attr_dict["key_issuer"] = self.args.key_issuer + CmfConfig.write_config(cmf_config, "osdf", attr_dict, True) + + return "cmf init complete." + + +def add_parser(subparsers, parent_parser): + HELP = "Initialises remote OSDF directory as artifact repository." + + parser = subparsers.add_parser( + "osdfremote", + parents=[parent_parser], + description="This command initialises remote OSDF directory as artifact repository for CMF.", + help=HELP, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + required_arguments = parser.add_argument_group("required arguments") + + required_arguments.add_argument( + "--path", + required=True, + help="Specify FQDN for OSDF directory path including port and path", + metavar="", + default=argparse.SUPPRESS, + ) + + required_arguments.add_argument( + "--key-id", + required=True, + help="Specify key_id for provided private key. eg. b2d3", + metavar="", + default=argparse.SUPPRESS, + ) + + required_arguments.add_argument( + "--key-path", + required=True, + help="Specify path for private key on local filesystem. eg. ~/.ssh/XXX.pem", + metavar="", + default=argparse.SUPPRESS, + ) + + required_arguments.add_argument( + "--key-issuer", + required=True, + help="Specify URL for Key Issuer. eg. https://t.nationalresearchplatform.org/XXX", + metavar="", + default=argparse.SUPPRESS, + ) + + required_arguments.add_argument( + "--git-remote-url", + required=True, + help="Specify git repo url. eg: https://github.com/XXX/example.git", + metavar="", + default=argparse.SUPPRESS, + ) + + parser.add_argument( + "--cmf-server-url", + help="Specify cmf-server URL.", + metavar="", + default="http://127.0.0.1:80", + ) + + parser.add_argument( + "--neo4j-user", + help="Specify neo4j user.", + metavar="", + # default=argparse.SUPPRESS, + ) + parser.add_argument( + "--neo4j-password", + help="Specify neo4j password.", + metavar="", + # default=argparse.SUPPRESS, + ) + parser.add_argument( + "--neo4j-uri", + help="Specify neo4j uri.eg bolt://localhost:7687", + metavar="", + # default=argparse.SUPPRESS, + ) + + parser.set_defaults(func=CmdInitOSDFRemote) diff --git a/cmflib/storage_backends/osdf_artifacts.py b/cmflib/storage_backends/osdf_artifacts.py new file mode 100644 index 00000000..ff14e51c --- /dev/null +++ b/cmflib/storage_backends/osdf_artifacts.py @@ -0,0 +1,68 @@ +### +# Copyright (2023) Hewlett Packard Enterprise Development LP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### + +import os +import requests +#import urllib3 +#urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +class OSDFremoteArtifacts: + def download_artifacts( + self, + dvc_config_op, + host: str, #s_url + current_directory: str, #current_directory where cmf artifact pull is executed + remote_file_path: str, # download_loc of the artifact + local_path: str, #name of the artifact + ): + output = "" + remote_repo = dvc_config_op["remote.osdf.url"] + user = "nobody" + dynamic_password = dvc_config_op["remote.osdf.password"] + custom_auth_header = dvc_config_op["remote.osdf.custom_auth_header"] + #print(f"dynamic password from download_artifacts={dynamic_password}") + #print(f"Fetching artifact={local_path}, surl={host} to {remote_file_path} when this has been called at {current_directory}") + + try: + headers={dvc_config_op["remote.osdf.custom_auth_header"]: dvc_config_op["remote.osdf.password"]} + temp = local_path.split("/") + temp.pop() + dir_path = "/".join(temp) + dir_to_create = os.path.join(current_directory, dir_path) + os.makedirs( + dir_to_create, mode=0o777, exist_ok=True + ) # creates subfolders needed as per artifacts folder structure + local_file_path = os.path.join(current_directory, local_path) + local_file_path = os.path.abspath(local_file_path) + + response = requests.get(host, headers=headers, verify=True) #This should be made True. otherwise this will produce Insecure SSL Warning + if response.status_code == 200 and response.content: + data = response.content + else: + return "No data received from the server." + + except Exception as exception: + return exception + + try: + with open(remote_file_path, 'wb') as file: + file.write(data) + if os.path.exists(remote_file_path) and os.path.getsize(remote_file_path) > 0: + #print(f"object {local_path} downloaded at {remote_file_path}") + stmt = f"object {local_path} downloaded at {remote_file_path}." + return stmt + except Exception as e: + print(f"An error occurred while writing to the file: {e}") diff --git a/cmflib/utils/helper_functions.py b/cmflib/utils/helper_functions.py index 16872721..9dc12ea3 100644 --- a/cmflib/utils/helper_functions.py +++ b/cmflib/utils/helper_functions.py @@ -19,6 +19,14 @@ import subprocess import json +def is_url(url)-> bool: + from urllib.parse import urlparse + try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except ValueError: + return False + def is_git_repo(): git_dir = os.path.join(os.getcwd(), '.git') print("git_dir", git_dir) @@ -80,3 +88,47 @@ def list_conda_packages_json(): return f"Error: {e.stderr}" +# Generate SciToken dynamically +def generate_osdf_token(key_id, key_path, key_issuer) -> str: + + #for SciToken Generation & Validation + import scitokens + from cryptography.hazmat.primitives import serialization + from cryptography.hazmat.backends import default_backend + + dynamic_pass="" #Initialize Blank dynamic Password + + #Read Private Key using load_pem_private_key() method + if not os.path.exists(key_path): + print(f"File {key_path} does not exist.") + return dynamic_pass + + try: + with open(key_path, "r") as file_pointer: + private_key_contents = file_pointer.read() + + loaded_private_key = serialization.load_pem_private_key( + private_key_contents.encode(), + password=None, # Assumes Private key. Update this if password is used for Private Key + backend=default_backend() + ) + + if is_url(key_issuer): + token = scitokens.SciToken(key=loaded_private_key, key_id=key_id) #Generate SciToken + #token.update_claims({"iss": key_issuer, "scope": "write:/ read:/", "aud": "NRP", "sub": "NRP"}) + token.update_claims({"scope": "write:/ read:/", "aud": "NRP", "sub": "NRP"}) #TODO: Figure out how to supply these as input params + + # Serialize the token to a string + token_ser = token.serialize(issuer=key_issuer) + #Key_issuer is something like ""https://t.nationalresearchplatform.org/fdp" + + #Stringify token_str + token_str=token_ser.decode() + dynamic_pass="Bearer "+ token_str + else: + print(f"{key_issuer} is not a valid URL.") + + except Exception as err: + print(f"Unexpected {err}, {type(err)}") + + return dynamic_pass \ No newline at end of file diff --git a/docs/cmf_client/cmf_client.md b/docs/cmf_client/cmf_client.md index 25ec83e9..2d74ef37 100644 --- a/docs/cmf_client/cmf_client.md +++ b/docs/cmf_client/cmf_client.md @@ -177,7 +177,39 @@ Optional Arguments --neo4j-password [neo4j_password] Specify neo4j password. (default: None) --neo4j-uri Specify neo4j uri. Eg bolt://localhost:7687 (default: None) ``` - +### cmf init osdfremote +``` +Usage: cmf init osdfremote [-h] --path [path] + --endpoint-url [endpoint_url] + --access-key-id [access_key_id] + --secret-key [secret_key] + --git-remote-url[git_remote_url] + --cmf-server-url [cmf_server_url] + --neo4j-user [neo4j_user] + --neo4j-password [neo4j_password] + --neo4j-uri [neo4j_uri] +``` +`cmf init osdfremote` configures a OSDF Origin as a cmf artifact repository. +``` +cmf init osdfremote --path https://[Some Origin]:8443/nrp/fdp/ --key-id c2a5 --key-path ~/.ssh/fdp.pem --key-issuer https://[Token Issuer]] --git-remote-url https://github.com/user/experiment-repo.git --git-remote-url https://github.com/user/experiment-repo.git --cmf-server-url http://127.0.0.1:80 --neo4j-user neo4j --neo4j-password password --neo4j-uri bolt://localhost:7687 +``` +Required Arguments +``` + --path [path] Specify FQDN for OSDF origin including including port and directory path + --key-id [key_id] Specify key_id for provided private key. eg. b2d3 + --key-path [key_path] Specify path for private key on local filesystem. eg. ~/.ssh/XXX.pem + --key-issuer [key_issuer] Specify URL for Key Issuer. eg. https://t.nationalresearchplatform.org/XXX + --git-remote-url [git_remote_url] Specify git repo url. eg: https://github.com/XXX/example.git +``` +Optional Arguments +``` + -h, --help show this help message and exit + --cmf-server-url [cmf_server_url] Specify cmf-server url. (default: http://127.0.0.1:80) + --neo4j-user [neo4j_user] Specify neo4j user. (default: None) + --neo4j-password [neo4j_password] Specify neo4j password. (default: None) + --neo4j-uri Specify neo4j uri. Eg bolt://localhost:7687 (default: None) + +``` ## cmf artifact ``` Usage: cmf artifact [-h] {pull,push} diff --git a/pyproject.toml b/pyproject.toml index 2c783331..06132ae3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "click", "minio", "paramiko", + "scitokens", + "cryptography", ] authors = [ { name="Hewlett Packard Enterprise"}, diff --git a/setup.py b/setup.py index 116f9a50..53884306 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ packages=find_packages(), install_requires=["ml-metadata==1.15.0", "dvc[ssh,s3]==3.51.1", "pandas", "retrying", "pyarrow", "neo4j", \ - "scikit-learn", "tabulate", "click", "minio", "paramiko"], # add any additional packages that + "scikit-learn", "tabulate", "click", "minio", "paramiko", "scitokens", "cryptography"], # add any additional packages that # needs to be installed along with your package. Eg: 'caer' keywords=['python', 'first package'],