From 5cc16a8ce8ac9857a55cf4c58b8cfb6493a05ce0 Mon Sep 17 00:00:00 2001 From: huangxu17 Date: Thu, 11 Apr 2024 23:00:53 +0800 Subject: [PATCH] 1. FedLearner Framework and Core Dependency RA-TLS Configuration To ensure code consistency and implement hash signature-based remote attestation for the FedLearner framework and its core dependencies, an environment variable has been introduced in Gramine that prevents the generation of .pyc files. 2. Gramine Template Configuration for FedLearner The Gramine template configuration has been updated to include the code locations for the FedLearner framework and some essential dependencies. This addition facilitates bidirectional remote attestation between parties. 3. Meituan HDFS File Path Management Optimization The code responsible for handling file paths in Meituan's Hadoop Distributed File System (HDFS) has been migrated from the main entry point to be processed by the master node. This change aims to prevent file read and write conflicts that could arise from multiple workers operating simultaneously. 
--- fedlearner-sgx-dev.dockerfile | 31 +++++--- fedlearner/trainer/trainer_worker.py | 54 ++++++------- .../generate-token/python.manifest.template | 76 +++++++++++++++++-- 3 files changed, 117 insertions(+), 44 deletions(-) diff --git a/fedlearner-sgx-dev.dockerfile b/fedlearner-sgx-dev.dockerfile index 56e6a9d8e..915c70c09 100644 --- a/fedlearner-sgx-dev.dockerfile +++ b/fedlearner-sgx-dev.dockerfile @@ -6,6 +6,8 @@ ENV DEBIAN_FRONTEND=noninteractive ENV INSTALL_PREFIX=/usr/local ENV LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:${INSTALL_PREFIX}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH} ENV PATH=${INSTALL_PREFIX}/bin:${LD_LIBRARY_PATH}:${PATH} +# For Gramine RA-TLS +ENV PYTHONDONTWRITEBYTECODE=1 # Add steps here to set up common dependencies RUN apt-get update \ @@ -60,8 +62,8 @@ RUN apt-get install -y libcurl4-openssl-dev libprotobuf-c-dev python3-protobuf p RUN apt-get install -y libgmp-dev libmpfr-dev libmpc-dev libisl-dev nasm protobuf-compiler RUN ln -s /usr/bin/python3 /usr/bin/python \ - && pip3 install --upgrade pip \ - && pip3 install toml meson pyelftools + && pip3 install --no-compile --upgrade pip -i https://mirrors.aliyun.com/pypi/simple/ \ + && pip3 install --no-compile toml meson pyelftools -i https://mirrors.aliyun.com/pypi/simple/ RUN git clone https://github.com/analytics-zoo/gramine ${GRAMINEDIR} \ && cd ${GRAMINEDIR} \ @@ -101,8 +103,8 @@ ENV GRPC_VERSION=v1.38.1 RUN git clone --recurse-submodules -b ${GRPC_VERSION} https://github.com/grpc/grpc ${GRPC_PATH} -RUN pip3 install --upgrade pip \ - && pip3 install -r ${GRPC_PATH}/requirements.txt +RUN pip3 install --no-compile --upgrade pip -i https://mirrors.aliyun.com/pypi/simple/ \ + && pip3 install --no-compile -r ${GRPC_PATH}/requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ # Tensorflow dependencies ENV BAZEL_VERSION=3.1.0 @@ -110,8 +112,8 @@ ENV TF_VERSION=v2.4.2 ENV TF_BUILD_PATH=/tf/src ENV TF_BUILD_OUTPUT=/tf/output -RUN pip3 install --upgrade pip \ - && pip3 install numpy 
keras_preprocessing +RUN pip3 install --no-compile --upgrade pip -i https://mirrors.aliyun.com/pypi/simple/ \ + && pip3 install --no-compile numpy keras_preprocessing -i https://mirrors.aliyun.com/pypi/simple/ RUN wget "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel_${BAZEL_VERSION}-linux-x86_64.deb" \ && dpkg -i bazel_*.deb @@ -127,7 +129,7 @@ RUN apt-get install -y libmysqlclient-dev COPY sgx/grpc/common ${GRPC_PATH} COPY sgx/grpc/v1.38.1 ${GRPC_PATH} -RUN pip3 install 'cython==0.29.36' +RUN pip3 install --no-compile 'cython==0.29.36' -i https://mirrors.aliyun.com/pypi/simple/ RUN ${GRPC_PATH}/build_python.sh # Build tensorflow @@ -150,17 +152,17 @@ RUN if [ -f ${FEDLEARNER_PATH}/docker/hadoop-mt-2.7.0.tar.gz ]; then mkdir -p /o # For meituan hadoop auth RUN apt-get install -y libkrb5-dev openjdk-8-jdk -RUN pip3 install --upgrade pip \ - && pip3 install -r ${FEDLEARNER_PATH}/requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ +RUN pip3 install --no-compile --upgrade pip -i https://mirrors.aliyun.com/pypi/simple/ \ + && pip3 install --no-compile -r ${FEDLEARNER_PATH}/requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ RUN cd ${FEDLEARNER_PATH} \ && make protobuf \ && python3 setup.py bdist_wheel \ - && pip3 install ./dist/*.whl + && pip3 install --no-compile ./dist/*.whl # Re-install tensorflow, uninstall tensorflow_io, mock it RUN pip3 uninstall -y tensorflow tensorflow-io \ - && pip3 install ${TF_BUILD_OUTPUT}/*.whl + && pip3 install --no-compile ${TF_BUILD_OUTPUT}/*.whl # Re-install fedlearner plugin RUN cd ${FEDLEARNER_PATH} \ @@ -170,7 +172,7 @@ RUN cd ${FEDLEARNER_PATH} \ # Re-install grpcio RUN pip3 uninstall -y grpcio \ - && pip3 install ${GRPC_PATH}/dist/grpcio*.whl + && pip3 install --no-compile ${GRPC_PATH}/dist/grpcio*.whl # For debug RUN apt-get install -y strace gdb ctags vim @@ -182,6 +184,11 @@ COPY sgx/configs / RUN echo "enabled=0" > /etc/default/apport RUN echo "exit 0" > /usr/sbin/policy-rc.d +# For 
gramine ra-tls +RUN dpkg --remove --force-depends libgtk2.0-0 \ + && pip3 uninstall -y numpy keras_preprocessing protobuf \ + && pip3 install --no-compile numpy keras_preprocessing protobuf -i https://mirrors.aliyun.com/pypi/simple/ + # Clean tmp files RUN apt-get clean all \ && rm -rf /var/lib/apt/lists/* \ diff --git a/fedlearner/trainer/trainer_worker.py b/fedlearner/trainer/trainer_worker.py index 4827272b6..caa5bf292 100644 --- a/fedlearner/trainer/trainer_worker.py +++ b/fedlearner/trainer/trainer_worker.py @@ -232,6 +232,33 @@ def _run_master(role, cluster_server = ClusterServer(cluster_spec, "master", server_port=args.server_port) + # use Meituan hadoop + # first: convert the Meituan HDFS path to a local storage path; if the PSI result file already exists locally, use the local file + # second: if the PSI result file does not exist locally, download it from Meituan HDFS to local storage + if args.using_mt_hadoop: + data_path = args.data_path + if data_path: + local_data_path = get_local_temp_path(data_path) + if not exists(local_data_path): + data_path = mt_hadoop_download(data_path) + else: + data_path = local_data_path + args.data_path = data_path + + checkpoint_path = args.checkpoint_path + if checkpoint_path: + args.checkpoint_path = get_local_temp_path(checkpoint_path) + + load_checkpoint_path = args.load_checkpoint_path + if load_checkpoint_path: + args.load_checkpoint_path = get_local_temp_path(load_checkpoint_path) + if not exists(args.load_checkpoint_path): + mt_hadoop_download(load_checkpoint_path) + + export_path = args.export_path + if export_path: + args.export_path = get_local_temp_path(export_path) + checkpoint_filename_with_path = _get_checkpoint_filename_with_path(args) data_visitor = _create_data_visitor(args) master_factory = LeaderTrainerMaster \ @@ -479,33 +506,6 @@ def train(role, if not isinstance(role, str) or role.lower() not in (LEADER, FOLLOER): raise ValueError("--role must set one of %s or %s"%(LEADER, FOLLOER)) - # use Meituan hadoop - # first:convert Meituan HDFS path to local
storage path, if local exit psi result file, user local file - # second:if local not exit psi result file,from Meituan HDFS download to local - if args.using_mt_hadoop: - data_path = args.data_path - if data_path: - local_data_path = get_local_temp_path(data_path) - if not exists(local_data_path): - data_path = mt_hadoop_download(data_path) - else: - data_path = local_data_path - args.data_path = data_path - - checkpoint_path = args.checkpoint_path - if checkpoint_path: - args.checkpoint_path = get_local_temp_path(checkpoint_path) - - load_checkpoint_path = args.load_checkpoint_path - if load_checkpoint_path: - args.load_checkpoint_path = get_local_temp_path(load_checkpoint_path) - if not exists(args.load_checkpoint_path): - mt_hadoop_download(load_checkpoint_path) - - export_path = args.export_path - if export_path: - args.export_path = get_local_temp_path(export_path) - if args.loglevel: fl_logging.set_level(args.loglevel) diff --git a/sgx/gramine/CI-Examples/generate-token/python.manifest.template b/sgx/gramine/CI-Examples/generate-token/python.manifest.template index 573bbd4c0..185e75f55 100644 --- a/sgx/gramine/CI-Examples/generate-token/python.manifest.template +++ b/sgx/gramine/CI-Examples/generate-token/python.manifest.template @@ -67,12 +67,75 @@ sgx.trusted_files = [ "file:/usr/{{ arch_libdir }}/", "file:/etc/ssl/certs/ca-certificates.crt", "file:/etc/default/apport", - "file:/usr/local/lib/", - "file:{{ python.stdlib }}/", - "file:{{ python.distlib }}/", "file:/etc/mime.types", "file:/gramine/leader/", - "file:/gramine/follower/" + "file:/gramine/follower/", + "file:/usr/local/lib/x86_64-linux-gnu/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/libtensorflow_framework.so.2", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/__init__.py", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/__internal__/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/__operators__/", + 
"file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/audio/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/autodiff/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/autograph/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/bitwise/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/compat/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/config/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/data/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/debugging/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/distribute/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/dtypes/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/errors/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/experimental/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/feature_column/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/graph_util/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/image/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/io/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/linalg/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/lite/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/lookup/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/math/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/mixed_precision/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/mlir/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/nest/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/nn/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/profiler/", + 
"file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/quantization/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/queue/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/ragged/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/random/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/raw_ops/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/saved_model/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/sets/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/signal/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/sparse/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/strings/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/summary/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/sysconfig/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/test/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/tpu/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/train/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/types/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/version/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/xla/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/compiler/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/core/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/include/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/keras/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/python/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/tools/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/lite/experimental/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow/lite/toco/", + 
"file:/usr/local/lib/python3.6/dist-packages/tensorflow/xla_aot_runtime_src/", + "file:/usr/local/lib/python3.6/dist-packages/pyspark/", + "file:/usr/local/lib/python3.6/dist-packages/Cython/", + "file:/usr/local/lib/python3.6/dist-packages/numpy/", + "file:/usr/local/lib/python3.6/dist-packages/grpc/", + "file:/usr/local/lib/python3.6/dist-packages/google/", + "file:/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/", + "file:/usr/local/lib/python3.6/dist-packages/sklearn/", + "file:/usr/local/lib/python3.6/dist-packages/fedlearner/", + "file:/usr/local/lib/python3.6/dist-packages/pandas/" ] sgx.allowed_files = [ @@ -81,6 +144,9 @@ sgx.allowed_files = [ "file:/opt/meituan/", "file:/usr/lib/ssl/openssl.cnf", "file:/usr/lib/gcc", + "file:/usr/local/lib/", + "file:{{ python.stdlib }}/", + "file:{{ python.distlib }}/", "file:/etc/ethers", "file:/etc/hosts", "file:/etc/group", @@ -103,4 +169,4 @@ sgx.allowed_files = [ "file:/lib/", "file:/bin/", "file:/data/", -] \ No newline at end of file +]