Skip to content

Commit

Permalink
Merge branch 'fix_dev_sgx' into add_meituan_hdfs_to_sgx
Browse files Browse the repository at this point in the history
  • Loading branch information
gejielun authored Mar 20, 2024
2 parents b0ef68b + 561e259 commit 9260efa
Show file tree
Hide file tree
Showing 9 changed files with 83 additions and 36 deletions.
57 changes: 45 additions & 12 deletions deploy/scripts/sgx/enclave_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

EXEC_DIR=/app/exec_dir

# Extract one field's value from the SGX token metadata printed by
# gramine-sgx-get-token for python.sig.
# Arguments:
#   $1 - field name to look up (used as a grep pattern, e.g. mr_enclave)
# Outputs:
#   the field's value, whitespace-trimmed, on stdout
function get_env() {
  # Quote "$1" so a pattern containing spaces or glob characters is passed
  # to grep as a single argument; '--' stops option parsing.
  gramine-sgx-get-token -s python.sig -o /dev/null | grep -- "$1" | awk -F ":" '{print $2}' | xargs
}

function make_custom_env() {
cd $EXEC_DIR

export DEBUG=0
export CUDA_VISIBLE_DEVICES=""
export DNNL_VERBOSE=0
Expand Down Expand Up @@ -59,17 +63,20 @@ function make_custom_env() {
# Embed the peer's (Meituan) MR_ENCLAVE/MR_SIGNER measurements into the
# RA-TLS dynamic_config.json used for remote attestation.
jq --arg mr_enclave "$PEER_MR_ENCLAVE" --arg mr_signer "$PEER_MR_SIGNER" \
'.sgx_mrs[0].mr_enclave = $mr_enclave | .sgx_mrs[0].mr_signer = $mr_signer' \
$GRPC_PATH/examples/dynamic_config.json > ./dynamic_config.json
$GRPC_PATH/examples/dynamic_config.json > $EXEC_DIR/dynamic_config.json

cd -
}

# Generate the Gramine SGX signature/token artifacts and stage them,
# together with tensorflow_io.py, into $EXEC_DIR for later gramine-sgx runs.
# Globals:
#   EXEC_DIR (read) - runtime execution directory populated by this function
# Returns:
#   non-zero if the generate-token directory cannot be entered
function generate_token() {
  # Fail early instead of running generate.sh in the wrong directory.
  cd /gramine/CI-Examples/generate-token/ || return 1
  ./generate.sh
  mkdir -p "$EXEC_DIR"
  cp /app/sgx/gramine/CI-Examples/tensorflow_io.py "$EXEC_DIR"
  # Stage every artifact produced by generate.sh in one copy.
  cp python.sig python.manifest.sgx python.token python.manifest "$EXEC_DIR"
  cd - || return 1
}

Expand All @@ -81,18 +88,44 @@ elif [ -n "$PCCS_URL" ]; then
sed -i "s|PCCS_URL=[^ ]*|PCCS_URL=$PCCS_URL|" /etc/sgx_default_qcnl.conf
fi

TEMPLATE_PATH="/gramine/CI-Examples/generate-token/python.manifest.template"
if [ -n "$GRAMINE_LOG_LEVEL" ]; then
FILE="/gramine/CI-Examples/generate-token/python.manifest.template"
sed -i "/loader.log_level/ s/\"[^\"]*\"/\"$GRAMINE_LOG_LEVEL\"/" "$FILE"
# Check whether the sed command succeeded
sed -i "/loader.log_level/ s/\"[^\"]*\"/\"$GRAMINE_LOG_LEVEL\"/" "$TEMPLATE_PATH"
if [ $? -eq 0 ]; then
echo "Log level changed to $GRAMINE_LOG_LEVEL in $FILE"
echo "Log level changed to $GRAMINE_LOG_LEVEL in $TEMPLATE_PATH"
else
echo "Failed to change log level in $FILE"
echo "Failed to change log level in $TEMPLATE_PATH"
fi
fi

# Optional overrides of Gramine manifest settings via environment variables.
# Each block rewrites one setting in the manifest template at $TEMPLATE_PATH
# (defined above) when its corresponding variable is set, and reports the
# outcome. Success messages go to stdout; failures go to stderr.

if [ -n "$GRAMINE_ENCLAVE_SIZE" ]; then
  # Test the sed exit status directly instead of the $? anti-pattern.
  if sed -i "/sgx.enclave_size/ s/\"[^\"]*\"/\"$GRAMINE_ENCLAVE_SIZE\"/" "$TEMPLATE_PATH"; then
    echo "Enclave size changed to $GRAMINE_ENCLAVE_SIZE in $TEMPLATE_PATH"
  else
    echo "Failed to change enclave size in $TEMPLATE_PATH" >&2
  fi
fi

if [ -n "$GRAMINE_THREAD_NUM" ]; then
  # sgx.thread_num is an unquoted integer in the template, hence the
  # numeric match rather than the quoted-string substitution above.
  if sed -i "s/sgx.thread_num = [0-9]\+/sgx.thread_num = $GRAMINE_THREAD_NUM/" "$TEMPLATE_PATH"; then
    echo "Thread number changed to $GRAMINE_THREAD_NUM in $TEMPLATE_PATH"
  else
    echo "Failed to change thread number in $TEMPLATE_PATH" >&2
  fi
fi

if [ -n "$GRAMINE_STACK_SIZE" ]; then
  if sed -i "/sys.stack.size/ s/\"[^\"]*\"/\"$GRAMINE_STACK_SIZE\"/" "$TEMPLATE_PATH"; then
    echo "Stack size changed to $GRAMINE_STACK_SIZE in $TEMPLATE_PATH"
  else
    echo "Failed to change stack size in $TEMPLATE_PATH" >&2
  fi
fi

sed -i 's/USE_SECURE_CERT=TRUE/USE_SECURE_CERT=FALSE/' /etc/sgx_default_qcnl.conf
mkdir -p /data

generate_token
generate_token
1 change: 0 additions & 1 deletion deploy/scripts/sgx/run_data_join_master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ kvstore_type=$(normalize_env_to_args '--kvstore_type' $KVSTORE_TYPE)

source /app/deploy/scripts/sgx/enclave_env.sh
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
cp /app/sgx/token/* ./
unset HTTPS_PROXY https_proxy http_proxy ftp_proxy

make_custom_env 4
Expand Down
1 change: 0 additions & 1 deletion deploy/scripts/sgx/run_data_join_worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ fi

source /app/deploy/scripts/sgx/enclave_env.sh
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
cp /app/sgx/token/* ./
unset HTTPS_PROXY https_proxy http_proxy ftp_proxy

make_custom_env 4
Expand Down
1 change: 0 additions & 1 deletion deploy/scripts/sgx/run_data_portal_master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ files_per_job_limit=$(normalize_env_to_args '--files_per_job_limit' $FILES_PER_J

source /app/deploy/scripts/sgx/enclave_env.sh
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
cp /app/sgx/token/* ./
unset HTTPS_PROXY https_proxy http_proxy ftp_proxy

make_custom_env 4
Expand Down
1 change: 0 additions & 1 deletion deploy/scripts/sgx/run_data_portal_worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ input_data_validation_ratio=$(normalize_env_to_args '--input_data_validation_rat

source /app/deploy/scripts/sgx/enclave_env.sh
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
cp /app/sgx/token/* ./
unset HTTPS_PROXY https_proxy http_proxy ftp_proxy

make_custom_env 4
Expand Down
11 changes: 7 additions & 4 deletions deploy/scripts/sgx/run_trainer_master_sgx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,12 @@ if [[ -n "${CODE_KEY}" ]]; then
else
pull_code ${CODE_TAR} $PWD
fi
cd ${ROLE}
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./

cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/follower/
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/leader/
source /app/deploy/scripts/sgx/enclave_env.sh
cp /app/sgx/token/* ./

unset HTTPS_PROXY https_proxy http_proxy ftp_proxy

make_custom_env 4
source /root/start_aesm_service.sh
Expand All @@ -104,14 +106,15 @@ fi

server_port=$(normalize_env_to_args "--server-port" "$PORT1")

cd $EXEC_DIR
if [[ -z "${START_CPU_SN}" ]]; then
START_CPU_SN=0
fi
if [[ -z "${END_CPU_SN}" ]]; then
END_CPU_SN=3
fi

taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python main.py --master \
taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python /gramine/$ROLE/main.py --master \
--application-id=$APPLICATION_ID \
--data-source=$DATA_SOURCE \
--data-path=$DATA_PATH \
Expand Down
15 changes: 11 additions & 4 deletions deploy/scripts/sgx/run_trainer_ps_sgx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,32 @@ export CUDA_VISIBLE_DEVICES=
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
source /app/deploy/scripts/hdfs_common.sh || true
source /app/deploy/scripts/pre_start_hook.sh || true
source /app/deploy/scripts/env_to_args.sh

LISTEN_PORT=50052
if [[ -n "${PORT1}" ]]; then
LISTEN_PORT=${PORT1}
fi

if [[ -n "${CODE_KEY}" ]]; then
pull_code ${CODE_KEY} $PWD
else
pull_code ${CODE_TAR} $PWD
fi

cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/leader
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/follower
source /app/deploy/scripts/sgx/enclave_env.sh
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
cp /app/sgx/token/* ./
unset HTTPS_PROXY https_proxy http_proxy ftp_proxy

make_custom_env 4
source /root/start_aesm_service.sh

cd $EXEC_DIR
if [[ -z "${START_CPU_SN}" ]]; then
START_CPU_SN=0
fi
if [[ -z "${END_CPU_SN}" ]]; then
END_CPU_SN=3
fi

taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python -m fedlearner.trainer.parameter_server $POD_IP:${LISTEN_PORT}
taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python -m fedlearner.trainer.parameter_server $POD_IP:${LISTEN_PORT}
18 changes: 12 additions & 6 deletions deploy/scripts/sgx/run_trainer_worker_sgx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,14 @@ LISTEN_PORT=50051
if [[ -n "${PORT0}" ]]; then
LISTEN_PORT=${PORT0}
fi

echo $LISTEN_PORT > /pod-data/listen_port

unset HTTPS_PROXY https_proxy http_proxy ftp_proxy
PROXY_LOCAL_PORT=50053
if [[ -n "${PORT2}" ]]; then
PROXY_LOCAL_PORT=${PORT2}
fi
echo $PROXY_LOCAL_PORT > /pod-data/proxy_local_port

cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
source /app/deploy/scripts/hdfs_common.sh || true
source /app/deploy/scripts/pre_start_hook.sh || true
Expand All @@ -40,10 +44,11 @@ else
pull_code ${CODE_TAR} $PWD
fi

cd ${ROLE}
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/follower/
cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/leader/
source /app/deploy/scripts/sgx/enclave_env.sh
cp /app/sgx/token/* ./

unset HTTPS_PROXY https_proxy http_proxy ftp_proxy

mode=$(normalize_env_to_args "--mode" "$MODE")
sparse_estimator=$(normalize_env_to_args "--sparse-estimator" "$SPARSE_ESTIMATOR")
Expand Down Expand Up @@ -100,14 +105,15 @@ source /root/start_aesm_service.sh

server_port=$(normalize_env_to_args "--server-port" "$PORT1")

cd $EXEC_DIR
if [[ -z "${START_CPU_SN}" ]]; then
START_CPU_SN=0
fi
if [[ -z "${END_CPU_SN}" ]]; then
END_CPU_SN=3
fi

taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python main.py --worker \
taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python /gramine/$ROLE/main.py --worker \
--application-id="$APPLICATION_ID" \
--master-addr="$MASTER_HOST:50051" \
--cluster-spec="$CLUSTER_SPEC" \
Expand Down
14 changes: 8 additions & 6 deletions sgx/gramine/CI-Examples/generate-token/python.manifest.template
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ loader.env.SECRET_PROVISION_SET_PF_KEY = "1"

sys.enable_sigterm_injection = true
sys.enable_extra_runtime_domain_names_conf = true
sys.stack.size = "2M"
sys.stack.size = "1M"

fs.mounts = [
{ path = "/lib", uri = "file:{{ gramine.runtimedir() }}" },
Expand All @@ -26,6 +26,8 @@ fs.mounts = [
{ path = "{{ python.stdlib }}", uri = "file:{{ python.stdlib }}" },
{ path = "{{ python.distlib }}", uri = "file:{{ python.distlib }}" },
{ path = "/opt", uri = "file:/opt" },
{ path = "/gramine/leader", uri = "file:/gramine/leader"},
{ path = "/gramine/follower", uri = "file:/gramine/follower"},
{ path = "/etc", uri = "file:/etc" },
{ path = "/tmp", uri = "file:/tmp" },
{ path = "/bin", uri = "file:/bin" },
Expand All @@ -47,7 +49,7 @@ sgx.preheat_enclave = false
sgx.nonpie_binary = true
sgx.enable_stats = false
sgx.enclave_size = "16G"
sgx.thread_num = 512
sgx.thread_num = 256
sgx.rpc_thread_num = 0
sgx.protected_files_key = "ffeeddccbbaa99887766554433221100"

Expand All @@ -58,13 +60,11 @@ sgx.trusted_files = [
"file:/usr/{{ arch_libdir }}/",
"file:/etc/ssl/certs/ca-certificates.crt",
"file:/etc/default/apport",
"file:/etc/mime.types",
"file:/etc/mime.types"
]

sgx.allowed_files = [
"file:tensorflow_io.py",
"file:main.py",
"file:config.py",
"file:/opt/tiger/",
"file:/opt/meituan/",
"file:{{ python.stdlib }}/",
Expand Down Expand Up @@ -93,6 +93,8 @@ sgx.allowed_files = [
"file:/usr/local/lib",
"file:/lib/",
"file:/bin/",
"file:/data/"
"file:/data/",
"file:/gramine/leader/",
"file:/gramine/follower/"
]

0 comments on commit 9260efa

Please sign in to comment.