From 1ae95e943ce4cdf448ff777170d53b8892365501 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 23 Mar 2022 15:42:02 -0500 Subject: [PATCH 01/18] Update RELEASE_NOTES.txt --- RELEASE_NOTES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 3e5ea29..d74dea5 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -3,7 +3,7 @@ * Add GHA Actions * Move from quay to GHCR.io * New Base Image from HTCondor Dockerhub -* Update security policies for HTCondor Version 9 Series +* Update security policies for HTCondor Version 9 Series 1.0.8.4 ======= From 57bf894dcf1f546b6204e5b1bc079f28752f553a Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 23 Mar 2022 15:44:33 -0500 Subject: [PATCH 02/18] Update RELEASE_NOTES.txt --- RELEASE_NOTES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index d74dea5..3e5ea29 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -3,7 +3,7 @@ * Add GHA Actions * Move from quay to GHCR.io * New Base Image from HTCondor Dockerhub -* Update security policies for HTCondor Version 9 Series +* Update security policies for HTCondor Version 9 Series 1.0.8.4 ======= From b528df9689bc1a83a020e5f768c8e77b53ad1a9d Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 23 Mar 2022 16:45:44 -0500 Subject: [PATCH 03/18] Update start-condor.sh --- deployment/bin/start-condor.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index 59428d2..d2d7ac2 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -9,7 +9,7 @@ fi if [ "$POOL_PASSWORD" ] ; then /usr/sbin/condor_store_cred -p "$POOL_PASSWORD" -f "$(condor_config_val SEC_PASSWORD_FILE)" - echo "$POOL_PASSWORD" | condor_store_cred -c add + condor_store_cred -p "$POOL_PASSWORD" -c add umask 0077; condor_token_create -identity condor@mypool > /etc/condor/tokens.d/condor@mypool fi From 7e7676c8945545aaaf592bb3450999a3383aba6a Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 25 Mar 2022 10:13:07 -0500 Subject: [PATCH 04/18] Update Dockerfile --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index 03b2de4..7bac474 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,6 +37,9 @@ RUN rm -rf /var/cache/yum COPY --chown=kbase deployment/ /kb/deployment/ +# Install dependencies for JobRunner +ENV PATH /miniconda/bin:$PATH +RUN wget https://raw.githubusercontent.com/kbase/JobRunner/master/requirements.txt && pip install -r requirements.txt && rm requirements.txt RUN /kb/deployment/bin/install_python_dependencies.sh # The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to From 97c95704d213230a93bcf4167809310de03ce1b8 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 25 Mar 2022 10:43:59 -0500 Subject: [PATCH 05/18] Update build_prodrc_pr.yaml --- .github/workflows/build_prodrc_pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_prodrc_pr.yaml b/.github/workflows/build_prodrc_pr.yaml index 2e5034e..a548960 100644 --- a/.github/workflows/build_prodrc_pr.yaml +++ b/.github/workflows/build_prodrc_pr.yaml @@ -5,6 +5,7 @@ name: Build Prod RC Image branches: - master - main + - workflow_dispatch types: - opened - synchronize From eafc23d63d1339c463cdc8a9032e31662c2dd6cb Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 25 Mar 2022 10:44:55 -0500 Subject: [PATCH 06/18] Update build_test_pr.yaml --- 
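Note on the workflow_dispatch trigger introduced across PATCH 05-09: GitHub Actions expects
`workflow_dispatch:` as a top-level key under `on:`, not as an entry in the `branches:` list,
which is where PATCH 05 first places it. A minimal sketch of the trigger layout these patches
converge on (branch names taken from build_prodrc_pr.yaml; shown for illustration only):

    on:
      pull_request:
        branches:
          - master
          - main
        types:
          - opened
          - synchronize
          - ready_for_review
      workflow_dispatch:

With this layout the workflow still builds on pull requests against master/main and can also be
started manually from the Actions tab.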
.github/workflows/build_test_pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_test_pr.yaml b/.github/workflows/build_test_pr.yaml index b6b5328..80673b0 100644 --- a/.github/workflows/build_test_pr.yaml +++ b/.github/workflows/build_test_pr.yaml @@ -8,6 +8,7 @@ name: Build Test Image - opened - synchronize - ready_for_review + workflow_dispatch: jobs: docker_build: runs-on: ubuntu-latest From 648b13336c7fb0d4f959c8eb0081b98df02432f1 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 25 Mar 2022 10:45:33 -0500 Subject: [PATCH 07/18] Update tag_test_latest.yaml --- .github/workflows/tag_test_latest.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tag_test_latest.yaml b/.github/workflows/tag_test_latest.yaml index d8cac46..5231241 100644 --- a/.github/workflows/tag_test_latest.yaml +++ b/.github/workflows/tag_test_latest.yaml @@ -6,6 +6,7 @@ name: Tag Latest Test Image - develop types: - closed + workflow_dispatch: jobs: docker_tag: runs-on: ubuntu-latest From f7103e88c2b1ba35998233afab09aabc2ecc85a0 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 25 Mar 2022 11:04:39 -0500 Subject: [PATCH 08/18] Update build_prodrc_pr.yaml --- .github/workflows/build_prodrc_pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_prodrc_pr.yaml b/.github/workflows/build_prodrc_pr.yaml index a548960..84f570b 100644 --- a/.github/workflows/build_prodrc_pr.yaml +++ b/.github/workflows/build_prodrc_pr.yaml @@ -5,11 +5,11 @@ name: Build Prod RC Image branches: - master - main - - workflow_dispatch types: - opened - synchronize - ready_for_review + workflow_dispatch: jobs: docker_build: runs-on: ubuntu-latest From 97c5dccc6e28265c6ffd35d7d22f09c9b3b0f984 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 25 Mar 2022 13:06:00 -0500 Subject: [PATCH 09/18] Update build_test_pr.yaml --- .github/workflows/build_test_pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_test_pr.yaml b/.github/workflows/build_test_pr.yaml index 80673b0..ee0caf1 100644 --- a/.github/workflows/build_test_pr.yaml +++ b/.github/workflows/build_test_pr.yaml @@ -9,6 +9,7 @@ name: Build Test Image - synchronize - ready_for_review workflow_dispatch: + jobs: docker_build: runs-on: ubuntu-latest From 89a1927d541e508e5c4333c1a9ed777d745f3782 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 16 Nov 2022 13:51:15 -0600 Subject: [PATCH 10/18] Create manual-build.yml --- .github/workflows/manual-build.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/manual-build.yml diff --git a/.github/workflows/manual-build.yml b/.github/workflows/manual-build.yml new file mode 100644 index 0000000..944f903 --- /dev/null +++ b/.github/workflows/manual-build.yml @@ -0,0 +1,11 @@ +--- +name: Manual Build & Push +on: + workflow_dispatch: +jobs: + build-push: + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}-develop' + tags: br-${{ github.ref_name }} + secrets: inherit From eee4bee40ba331c30e370619c782ebf4a7e36655 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 4 Aug 2023 15:26:16 -0500 Subject: [PATCH 11/18] DEVOPS-1465 Update health check (#57) * Create manual-build.yml * Update health_check.py * Update Dockerfile * Update install_python_dependencies.sh * Update RELEASE_NOTES.txt --------- Co-authored-by: Boris --- Dockerfile | 6 +++--- RELEASE_NOTES.txt | 8 ++++++++ deployment/bin/cron/health_check.py | 13 +++++++++---- 
deployment/bin/install_python_dependencies.sh | 13 ++++++++++--- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7bac474..b364dee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,10 @@ -FROM htcondor/execute:9.7-el7 +FROM htcondor/execute:lts-el8 ENV container docker # Ge$t commonly used utilities -RUN yum install -y deltarpm RUN yum -y update && yum upgrade -y -RUN yum -y install -y epel-release wget which git deltarpm gcc libcgroup libcgroup-tools stress-ng tmpwatch +RUN yum install -y drpm +RUN yum -y install -y epel-release wget which git gcc libcgroup libcgroup-tools stress-ng tmpwatch # Install docker binaries RUN yum install -y yum-utils device-mapper-persistent-data lvm2 && yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo && yum install -y docker-ce diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 3e5ea29..4b70aa6 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -1,3 +1,11 @@ +1.0.10 +======== +* Fix health check +* Fix dependencies for JobRunner +* Pin versions due for Python +* Update base image to `htcondor/execute:lts-el8` + + 1.0.9 ======= * Add GHA Actions diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index bc5c6db..80cebc4 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -203,8 +203,7 @@ def checkEndpoints(): Check auth/njs/catalog/ws """ - services = { - f"{endpoint}/auth": {}, + post_services = { f"{endpoint}/catalog": { "method": "Catalog.status", "version": "1.1", @@ -218,10 +217,14 @@ def checkEndpoints(): "params": [], }, } + get_services = {f"{endpoint}/auth": {}} - for service in services: + for service in {**post_services, **get_services}: try: - response = requests.post(url=service, json=services[service], timeout=30) + if service in post_services: + response = requests.post(url=service, json=post_services[service], timeout=30) + else: + response = requests.get(url=service, timeout=30) if response.status_code != 200: message = f"{service} is not available." exit_unsuccessfully(message) @@ -229,6 +232,8 @@ def checkEndpoints(): message = f"Couldn't reach {service}. 
{e}" exit_unsuccessfully(message) + + def main(): try: diff --git a/deployment/bin/install_python_dependencies.sh b/deployment/bin/install_python_dependencies.sh index 401325c..191437c 100755 --- a/deployment/bin/install_python_dependencies.sh +++ b/deployment/bin/install_python_dependencies.sh @@ -1,6 +1,13 @@ #!/usr/bin/env bash -#Install Python3 Libraries -#TODO Requirements.txt +#Install Python3 Libraries for cronjobs and for job runner + source /miniconda/bin/activate -pip install requests docker slackclient htcondor psutil lockfile +pip install requests==2.29.0 +pip install docker==6.1.3 +pip install slackclient==2.9.4 +pip install htcondor==10.7.0 +pip install psutil==5.9.5 +pip install lockfile==0.12.2 +pip install sanic==21.9.3 +pip install websockets==10.4 From b9d074e582c82e0c3fe12c4916f69ec3631a4e9a Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 24 Jan 2024 22:12:30 -0600 Subject: [PATCH 12/18] DEVOPS-1593 Update worker cronjob (#58) * Modify cronjobs --------- Co-authored-by: Boris --- .github/workflows/build_prodrc_pr.yaml | 32 -- .github/workflows/build_test_pr.yaml | 29 -- .github/workflows/pr_build.yml | 43 ++ .github/workflows/prod_release.yaml | 38 -- .github/workflows/release-main.yml | 25 ++ .github/workflows/scripts/build_prodrc_pr.sh | 17 - .github/workflows/scripts/build_test_pr.sh | 17 - .github/workflows/scripts/deploy_tag.sh | 34 -- .github/workflows/scripts/prod_release.sh | 24 - .github/workflows/scripts/tag_environments.sh | 22 - .github/workflows/scripts/tag_prod_latest.sh | 12 - .github/workflows/scripts/tag_test_latest.sh | 12 - .github/workflows/tag_environments.yaml | 19 - .github/workflows/tag_prod_latest.yaml | 27 -- .github/workflows/tag_test_latest.yaml | 27 -- Dockerfile | 10 +- README.md | 44 +- RELEASE_NOTES.txt | 9 + deployment/README.md | 3 - .../cron/clients/NarrativeJobServiceClient.py | 416 ------------------ deployment/bin/cron/clients/baseclient.py | 311 ------------- deployment/bin/cron/container_reaper.py | 229 +++++----- deployment/bin/cron/container_reaper_ee2.py | 306 +++++++------ .../bin/cron/delete_exited_containers.py | 21 +- deployment/bin/cron/health_check.py | 77 ++-- .../bin/cruft/check_abandoned_containers.py | 90 ---- .../bin/cruft/check_abandoned_containers.sh | 33 -- .../bin/cruft/delete_exited_containers.sh | 24 - .../conf/.templates/cronjobs.config.templ | 36 +- 29 files changed, 432 insertions(+), 1555 deletions(-) delete mode 100644 .github/workflows/build_prodrc_pr.yaml delete mode 100644 .github/workflows/build_test_pr.yaml create mode 100644 .github/workflows/pr_build.yml delete mode 100644 .github/workflows/prod_release.yaml create mode 100644 .github/workflows/release-main.yml delete mode 100755 .github/workflows/scripts/build_prodrc_pr.sh delete mode 100755 .github/workflows/scripts/build_test_pr.sh delete mode 100755 .github/workflows/scripts/deploy_tag.sh delete mode 100755 .github/workflows/scripts/prod_release.sh delete mode 100755 .github/workflows/scripts/tag_environments.sh delete mode 100755 .github/workflows/scripts/tag_prod_latest.sh delete mode 100755 .github/workflows/scripts/tag_test_latest.sh delete mode 100644 .github/workflows/tag_environments.yaml delete mode 100644 .github/workflows/tag_prod_latest.yaml delete mode 100644 .github/workflows/tag_test_latest.yaml delete mode 100644 deployment/README.md delete mode 100644 deployment/bin/cron/clients/NarrativeJobServiceClient.py delete mode 100644 deployment/bin/cron/clients/baseclient.py delete mode 100644 
deployment/bin/cruft/check_abandoned_containers.py delete mode 100644 deployment/bin/cruft/check_abandoned_containers.sh delete mode 100755 deployment/bin/cruft/delete_exited_containers.sh diff --git a/.github/workflows/build_prodrc_pr.yaml b/.github/workflows/build_prodrc_pr.yaml deleted file mode 100644 index 84f570b..0000000 --- a/.github/workflows/build_prodrc_pr.yaml +++ /dev/null @@ -1,32 +0,0 @@ ---- -name: Build Prod RC Image -'on': - pull_request: - branches: - - master - - main - types: - - opened - - synchronize - - ready_for_review - workflow_dispatch: -jobs: - docker_build: - runs-on: ubuntu-latest - steps: - - name: Verify merge is develop -> main - if: github.head_ref != 'develop' - run: echo "Must merge from develop -> main/master"; exit 1 - - name: Check out GitHub Repo - if: github.event.pull_request.draft == false && github.head_ref == 'develop' - with: - ref: "${{ github.event.pull_request.head.sha }}" - uses: actions/checkout@v2 - - name: Build and Push to Packages - if: github.event.pull_request.draft == false && github.head_ref == 'develop' - env: - PR: "${{ github.event.pull_request.number }}" - SHA: "${{ github.event.pull_request.head.sha }}" - DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" - DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" - run: "./.github/workflows/scripts/build_prodrc_pr.sh\n" diff --git a/.github/workflows/build_test_pr.yaml b/.github/workflows/build_test_pr.yaml deleted file mode 100644 index ee0caf1..0000000 --- a/.github/workflows/build_test_pr.yaml +++ /dev/null @@ -1,29 +0,0 @@ ---- -name: Build Test Image -'on': - pull_request: - branches: - - develop - types: - - opened - - synchronize - - ready_for_review - workflow_dispatch: - -jobs: - docker_build: - runs-on: ubuntu-latest - steps: - - name: Check out GitHub Repo - if: github.event.pull_request.draft == false - with: - ref: "${{ github.event.pull_request.head.sha }}" - uses: actions/checkout@v2 - - name: Build and Push to Packages - if: github.event.pull_request.draft == false - env: - PR: "${{ github.event.pull_request.number }}" - SHA: "${{ github.event.pull_request.head.sha }}" - DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" - DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" - run: "./.github/workflows/scripts/build_test_pr.sh\n" diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml new file mode 100644 index 0000000..0fa1c46 --- /dev/null +++ b/.github/workflows/pr_build.yml @@ -0,0 +1,43 @@ +--- +name: Pull Request Build, Tag, & Push +on: + pull_request: + branches: + - develop + - main + - master + types: + - opened + - reopened + - synchronize + - closed +jobs: + build-develop-open: + if: github.base_ref == 'develop' && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_build.yml@main + secrets: inherit + build-develop-merge: + if: github.base_ref == 'develop' && github.event.pull_request.merged == true + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}-develop' + tags: pr-${{ github.event.number }},latest + secrets: inherit + build-main-open: + if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: pr-${{ github.event.number }} + secrets: inherit + build-main-merge: + if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == true + uses: 
kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: pr-${{ github.event.number }},latest-rc + secrets: inherit + trivy-scans: + if: (github.base_ref == 'develop' || github.base_ref == 'main' || github.base_ref == 'master' ) && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_trivy-scans.yml@main + secrets: inherit diff --git a/.github/workflows/prod_release.yaml b/.github/workflows/prod_release.yaml deleted file mode 100644 index ffa1453..0000000 --- a/.github/workflows/prod_release.yaml +++ /dev/null @@ -1,38 +0,0 @@ ---- -name: Publish Release Image -'on': - release: - branches: - - main - - master - types: - - published -jobs: - docker_build: - runs-on: ubuntu-latest - steps: - - name: Check Tag - id: check-tag - run: |- - if [[ ${{ github.ref_name }} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then - echo ::set-output name=match::true - fi - - name: Report SemVer Check - if: steps.check-tag.outputs.match != 'true' - run: echo "Release version must follow semantic naming (e.g. 1.0.2)"; exit 1 - - name: Check Source Branch - if: github.event.release.target_commitish != 'master' && github.event.release.target_commitish != 'main' - run: echo "Releases must be built from master/main branch"; exit 1 - - name: Check out GitHub Repo - with: - ref: "${{ github.event.pull_request.head.sha }}" - uses: actions/checkout@v2 - - name: Build and Push to Packages - env: - ISH: "${{ github.event.release.target_commitish }}" - PR: "${{ github.event.pull_request.number }}" - SHA: "${{ github.event.pull_request.head.sha }}" - VER: "${{ github.event.release.tag_name }}" - DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" - DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" - run: "./.github/workflows/scripts/prod_release.sh\n" diff --git a/.github/workflows/release-main.yml b/.github/workflows/release-main.yml new file mode 100644 index 0000000..a254678 --- /dev/null +++ b/.github/workflows/release-main.yml @@ -0,0 +1,25 @@ +--- +name: Release - Build & Push Image +on: + release: + branches: + - main + - master + types: [ published ] +jobs: + check-source-branch: + uses: kbase/.github/.github/workflows/reusable_validate-branch.yml@main + with: + build_branch: '${{ github.event.release.target_commitish }}' + validate-release-tag: + needs: check-source-branch + uses: kbase/.github/.github/workflows/reusable_validate-release-tag.yml@main + with: + release_tag: '${{ github.event.release.tag_name }}' + build-push: + needs: validate-release-tag + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: '${{ github.event.release.tag_name }},latest' + secrets: inherit diff --git a/.github/workflows/scripts/build_prodrc_pr.sh b/.github/workflows/scripts/build_prodrc_pr.sh deleted file mode 100755 index 4c7bdf2..0000000 --- a/.github/workflows/scripts/build_prodrc_pr.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! 
/usr/bin/env bash - -export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') -export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') -export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export COMMIT=$(echo "$SHA" | cut -c -7) - -echo "Branch is:" ${GITHUB_HEAD_REF} -docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io -docker build --build-arg BUILD_DATE="$DATE" \ - --build-arg COMMIT="$COMMIT" \ - --build-arg BRANCH="$GITHUB_HEAD_REF" \ - --build-arg PULL_REQUEST="$PR" \ - --label us.kbase.vcs-pull-req="$PR" \ - -t ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" . -docker push ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" diff --git a/.github/workflows/scripts/build_test_pr.sh b/.github/workflows/scripts/build_test_pr.sh deleted file mode 100755 index 546b1b4..0000000 --- a/.github/workflows/scripts/build_test_pr.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /usr/bin/env bash - -export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') -export MY_APP=$(echo $(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}')"-develop") -export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export COMMIT=$(echo "$SHA" | cut -c -7) - -echo $DOCKER_TOKEN | docker login ghcr.io -u $DOCKER_ACTOR --password-stdin -docker build --build-arg BUILD_DATE="$DATE" \ - --build-arg COMMIT="$COMMIT" \ - --build-arg BRANCH="$GITHUB_HEAD_REF" \ - --build-arg PULL_REQUEST="$PR" \ - --label us.kbase.vcs-pull-req="$PR" \ - -t ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" . -docker push ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" - \ No newline at end of file diff --git a/.github/workflows/scripts/deploy_tag.sh b/.github/workflows/scripts/deploy_tag.sh deleted file mode 100755 index 5fb928a..0000000 --- a/.github/workflows/scripts/deploy_tag.sh +++ /dev/null @@ -1,34 +0,0 @@ -#! /usr/bin/env bash - -# Usage: ./deploy_tag.sh -e TARGET -o ORG -r REPO -s DEV_PROD -t IMAGE_TAG -# -# Example 1: ./deploy_tag.sh -o "kbase" -r "narrative-traefiker" -s "dev" -t "pr-9001" -e "ci" -# Example 2: ./deploy_tag.sh -o "kbase" -r "narrative" -s "prod" -t "latest" -e "next" -# -# Where: -# -o ORG is the organization (`kbase`, `kbaseapps`, etc.) -# -r REPO is the repository (e.g. `narrative`) -# -s DEV_PROD determines whether to pull the development {APPNAME}-develop or production {APPNAME} image. -# -t IMAGE_TAG is the *current* Docker image tag, typically `pr-#` or `latest` -# -e TARGET is one of: `appdsshev`, `ci`, or `next` -# -# Be sure to set $TOKEN first! -# See: https://docs.github.com/en/packages/getting-started-with-github-container-registry/migrating-to-github-container-registry-for-docker-images#authenticating-with-the-container-registry - - -while getopts e:o:r:s:t: option - do - case "${option}" - in - e) TARGET=${OPTARG};; - o) ORG=${OPTARG};; - r) REPO=${OPTARG};; - s) DEV_PROD=${OPTARG};; - t) IMAGE_TAG=${OPTARG};; - esac -done - -curl -H "Authorization: token $TOKEN" \ - -H 'Accept: application/vnd.github.everest-preview+json' \ - "https://api.github.com/repos/$ORG/$REPO/dispatches" \ - -d '{"event_type":"Tag '"$DEV_PROD"' '"$IMAGE_TAG"' for '"$TARGET"'", "client_payload": {"image_tag": "'"$IMAGE_TAG"'","target": "'"$TARGET"'","dev_prod": "'"$DEV_PROD"'"}}' diff --git a/.github/workflows/scripts/prod_release.sh b/.github/workflows/scripts/prod_release.sh deleted file mode 100755 index 46d008c..0000000 --- a/.github/workflows/scripts/prod_release.sh +++ /dev/null @@ -1,24 +0,0 @@ -#! 
/usr/bin/env bash - -export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') -export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') -export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export COMMIT=$(echo "$SHA" | cut -c -7) - -echo "ISH is:" $ISH -echo "GITHUB_REF is:" $GITHUB_REF -echo "HEAD_REF is:" $GITHUB_HEAD_REF -echo "BASE_REF is:" $GITHUB_BASE_REF -echo "Release is:" $GITHUB_REF_NAME -echo $DOCKER_TOKEN | docker login ghcr.io -u $DOCKER_ACTOR --password-stdin -docker build --build-arg BUILD_DATE="$DATE" \ - --build-arg COMMIT="$COMMIT" \ - --build-arg BRANCH="$GITHUB_HEAD_REF" \ - --build-arg PULL_REQUEST="$PR" \ - --build-arg VERSION="$VER" \ - --label us.kbase.vcs-pull-req="$PR" \ - -t ghcr.io/"$MY_ORG"/"$MY_APP":"$VER" \ - -t ghcr.io/"$MY_ORG"/"$MY_APP":"latest" . -docker push ghcr.io/"$MY_ORG"/"$MY_APP":"$VER" -docker push ghcr.io/"$MY_ORG"/"$MY_APP":"latest" diff --git a/.github/workflows/scripts/tag_environments.sh b/.github/workflows/scripts/tag_environments.sh deleted file mode 100755 index b39732a..0000000 --- a/.github/workflows/scripts/tag_environments.sh +++ /dev/null @@ -1,22 +0,0 @@ - -#! /usr/bin/env bash -# Add vars for PR & environments to yaml, as called from external script - -export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') -export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') -export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export COMMIT=$(echo "$SHA" | cut -c -7) - -if [ $DEV_PROD = "dev" ] || [ $DEV_PROD = "develop" ] -then - IMAGE=$MY_APP"-develop" -else - IMAGE=$MY_APP -fi - -echo "Dev or Prod:" $DEV_PROD -docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io -docker pull ghcr.io/"$MY_ORG"/"$IMAGE":"$IMAGE_TAG" -docker tag ghcr.io/"$MY_ORG"/"$IMAGE":"$IMAGE_TAG" ghcr.io/"$MY_ORG"/"$IMAGE":"$TARGET" -docker push ghcr.io/"$MY_ORG"/"$IMAGE":"$TARGET" diff --git a/.github/workflows/scripts/tag_prod_latest.sh b/.github/workflows/scripts/tag_prod_latest.sh deleted file mode 100755 index c3c4225..0000000 --- a/.github/workflows/scripts/tag_prod_latest.sh +++ /dev/null @@ -1,12 +0,0 @@ -#! /usr/bin/env bash - -export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') -export MY_APP=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}') -export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export COMMIT=$(echo "$SHA" | cut -c -7) - -docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io -docker pull ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" -docker tag ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" ghcr.io/"$MY_ORG"/"$MY_APP":"latest-rc" -docker push ghcr.io/"$MY_ORG"/"$MY_APP":"latest-rc" diff --git a/.github/workflows/scripts/tag_test_latest.sh b/.github/workflows/scripts/tag_test_latest.sh deleted file mode 100755 index c0dc504..0000000 --- a/.github/workflows/scripts/tag_test_latest.sh +++ /dev/null @@ -1,12 +0,0 @@ -#! 
/usr/bin/env bash - -export MY_ORG=$(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $1}') -export MY_APP=$(echo $(echo "${GITHUB_REPOSITORY}" | awk -F / '{print $2}')"-develop") -export DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export BUILD_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export COMMIT=$(echo "$SHA" | cut -c -7) - -docker login -u "$DOCKER_ACTOR" -p "$DOCKER_TOKEN" ghcr.io -docker pull ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" -docker tag ghcr.io/"$MY_ORG"/"$MY_APP":"pr-""$PR" ghcr.io/"$MY_ORG"/"$MY_APP":"latest" -docker push ghcr.io/"$MY_ORG"/"$MY_APP":"latest" diff --git a/.github/workflows/tag_environments.yaml b/.github/workflows/tag_environments.yaml deleted file mode 100644 index 6dba743..0000000 --- a/.github/workflows/tag_environments.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -name: Tag Image For Deploy -'on': - repository_dispatch -jobs: - tag_environments: - runs-on: ubuntu-latest - steps: - - name: Check out GitHub Repo - uses: actions/checkout@v2 - - name: Tag Deploy Environments - env: - DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" - DOCKER_TOKEN: ${{ secrets.GHCR_TOKEN }} - IMAGE_TAG: ${{ github.event.client_payload.image_tag }} - SHA: ${{ github.event.pull_request.head.sha }} - TARGET: ${{ github.event.client_payload.target }} - DEV_PROD: ${{ github.event.client_payload.dev_prod }} - run: './.github/workflows/scripts/tag_environments.sh' diff --git a/.github/workflows/tag_prod_latest.yaml b/.github/workflows/tag_prod_latest.yaml deleted file mode 100644 index 12b23df..0000000 --- a/.github/workflows/tag_prod_latest.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -name: Tag Prod Latest -'on': - pull_request: - branches: - - master - - main - types: - - closed -jobs: - docker_tag: - runs-on: ubuntu-latest - steps: - - name: Check out GitHub Repo - if: github.event_name == 'pull_request' && github.event.action == 'closed' && - github.event.pull_request.merged == true - with: - ref: "${{ github.event.pull_request.head.sha }}" - uses: actions/checkout@v2 - - name: Build and Push to Packages - if: github.event.pull_request.draft == false - env: - PR: "${{ github.event.pull_request.number }}" - SHA: "${{ github.event.pull_request.head.sha }}" - DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" - DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" - run: "./.github/workflows/scripts/tag_prod_latest.sh\n" diff --git a/.github/workflows/tag_test_latest.yaml b/.github/workflows/tag_test_latest.yaml deleted file mode 100644 index 5231241..0000000 --- a/.github/workflows/tag_test_latest.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -name: Tag Latest Test Image -'on': - pull_request: - branches: - - develop - types: - - closed - workflow_dispatch: -jobs: - docker_tag: - runs-on: ubuntu-latest - steps: - - name: Check out GitHub Repo - if: github.event_name == 'pull_request' && github.event.action == 'closed' && - github.event.pull_request.merged == true - with: - ref: "${{ github.event.pull_request.head.sha }}" - uses: actions/checkout@v2 - - name: Build and Push to Packages - if: github.event.pull_request.draft == false - env: - PR: "${{ github.event.pull_request.number }}" - SHA: "${{ github.event.pull_request.head.sha }}" - DOCKER_ACTOR: "${{ secrets.GHCR_USERNAME }}" - DOCKER_TOKEN: "${{ secrets.GHCR_TOKEN }}" - run: "./.github/workflows/scripts/tag_test_latest.sh\n" diff --git a/Dockerfile b/Dockerfile index b364dee..d134337 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,8 @@ ENV container docker # Ge$t commonly used utilities RUN yum -y update && yum upgrade -y RUN yum install -y drpm -RUN yum -y install 
-y epel-release wget which git gcc libcgroup libcgroup-tools stress-ng tmpwatch +RUN yum -y install -y epel-release wget which git gcc libcgroup libcgroup-tools stress-ng tmpwatch procps + # Install docker binaries RUN yum install -y yum-utils device-mapper-persistent-data lvm2 && yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo && yum install -y docker-ce @@ -14,7 +15,9 @@ RUN yum install -y yum-utils device-mapper-persistent-data lvm2 && yum-config-ma RUN yum install -y bzip2 \ && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ && bash ~/miniconda.sh -b -p /miniconda \ -&& export PATH="/miniconda/bin:$PATH" + + +ENV PATH="/miniconda/bin:${PATH}" # Add kbase user and set up directories RUN useradd -c "KBase user" -rd /kb/deployment/ -u 998 -s /bin/bash kbase && \ @@ -37,9 +40,6 @@ RUN rm -rf /var/cache/yum COPY --chown=kbase deployment/ /kb/deployment/ -# Install dependencies for JobRunner -ENV PATH /miniconda/bin:$PATH -RUN wget https://raw.githubusercontent.com/kbase/JobRunner/master/requirements.txt && pip install -r requirements.txt && rm requirements.txt RUN /kb/deployment/bin/install_python_dependencies.sh # The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to diff --git a/README.md b/README.md index 0f5a173..a855d50 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# condor-worker requirements +# Condor-worker requirements The condor workers require @@ -10,22 +10,54 @@ The condor workers require * Docker needs privileges to set cgroups/namespaces * CLIENTGROUPS set with extra apostrophes -Environmental variables to be set in rancher +# Required Environmental Variables for the worker * COLLECTOR_HOST * CONDOR_HOST * POOL_PASSWORD * SCHEDD_HOST * UID_DOMAIN * USE_TCP -* SET_NOBODY_USER_GUID +* SET_NOBODY_USER_GUID * SET_NOBODY_USER_UID * CONDOR_SUBMIT_WORKDIR * EXECUTE_SUFFIX -* SLACK_WEBHOOK_URL -* DELETE_ABANDONED_CONTAINERS -* NJS_ENDPOINT * EE2_ENDPOINT * SERVICE_ENDPOINT * DOCKER_CACHE * CGROUP_MEMORY_LIMIT_POLICY * USE_POOL_PASSWORD=yes + +## HTCondor STARTD_CRON Environment Variables + +* The cronjobs pass their environmental variables to the scripts they run. +* You can check the condor start log for their status and output when something goes wrong. +* You won't know if the cronjob is running unless you check the condor start log for a missing env var or possibly a job is stuck in a NODE_IS_HEALTHY=false state +* If an env var is present in the cronjobs.config, it is required, otherwise the template engine won't render it +* Q: Why are they in both the cronjobs.config and ALSO in environmental vars section? A: I'm not sure. Need to look at that why https://github.com/kbase/condor-worker/issues/59 + + +### NodeHealth Health Check + +#### Required Environmental Variables +* SLACK_WEBHOOK_URL (dev or prod channels) +* SERVICE_ENDPOINT, e.g. https://kbase.us/services/ee2 + +#### Optional Environmental Variables +* DOCKER_CACHE (default: /var/lib/docker/) +* CONDOR_SUBMIT_WORKDIR (default: /cdr) +* EXECUTE_SUFFIX (default: "") +* CHECK_CONDOR_STARTER_HEALTH (default: true) +* DEBUG (default: false) +* CHECK_CONDOR_STARTER_HEALTH (default: true) + +### DeleteExitedContainers +#### Required Environmental Variables +* SLACK_WEBHOOK_URL (dev or prod channels) + + +### EE2ContainerREAPER +#### Required Environmental Variables +* SLACK_WEBHOOK_URL (dev or prod channels) +* CONTAINER_REAPER_ENDPOINTS, e.g. 
https://kbase.us/services/ee2,https://services.kbase.us/services/ee2, +* DELETE_ABANDONED_CONTAINERS required to be set to true in order to run both checks + diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 4b70aa6..68f6411 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -1,3 +1,12 @@ +1.0.10 +======== +* Modify Cronjobs that look for runaway containers +* Update Documentation +* Deprecate container_reaper_ee2.py in favor of container_reaper.py + + + + 1.0.10 ======== * Fix health check diff --git a/deployment/README.md b/deployment/README.md deleted file mode 100644 index 139083d..0000000 --- a/deployment/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# condor -KBase specific Condor scheduler image. Based on the andypohl/htcondor image. - diff --git a/deployment/bin/cron/clients/NarrativeJobServiceClient.py b/deployment/bin/cron/clients/NarrativeJobServiceClient.py deleted file mode 100644 index ad12f30..0000000 --- a/deployment/bin/cron/clients/NarrativeJobServiceClient.py +++ /dev/null @@ -1,416 +0,0 @@ -# -*- coding: utf-8 -*- -############################################################ -# -# Autogenerated by the KBase type compiler - -# any changes made here will be overwritten -# -############################################################ - -from __future__ import print_function - -# the following is a hack to get the baseclient to import whether we're in a -# package or not. This makes pep8 unhappy hence the annotations. -try: - # baseclient and this client are in a package - from .baseclient import BaseClient as _BaseClient # @UnusedImport -except Exception: - # no they aren't - from baseclient import BaseClient as _BaseClient # @Reimport - - -class NarrativeJobService(object): - def __init__( - self, - url=None, - timeout=30 * 60, - user_id=None, - password=None, - token=None, - ignore_authrc=False, - trust_all_ssl_certificates=False, - auth_svc="https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login", - ): - if url is None: - raise ValueError("A url is required") - self._service_ver = None - self._client = _BaseClient( - url, - timeout=timeout, - user_id=user_id, - password=password, - token=token, - ignore_authrc=ignore_authrc, - trust_all_ssl_certificates=trust_all_ssl_certificates, - auth_svc=auth_svc, - ) - - def list_config(self, context=None): - """ - :returns: instance of mapping from String to String - """ - return self._client.call_method( - "NarrativeJobService.list_config", [], self._service_ver, context - ) - - def ver(self, context=None): - """ - Returns the current running version of the NarrativeJobService. - :returns: instance of String - """ - return self._client.call_method( - "NarrativeJobService.ver", [], self._service_ver, context - ) - - def status(self, context=None): - """ - Simply check the status of this service to see queue details - :returns: instance of type "Status" -> structure: parameter - "reboot_mode" of type "boolean" (@range [0,1]), parameter - "stopping_mode" of type "boolean" (@range [0,1]), parameter - "running_tasks_total" of Long, parameter "running_tasks_per_user" - of mapping from String to Long, parameter "tasks_in_queue" of - Long, parameter "config" of mapping from String to String, - parameter "git_commit" of String - """ - return self._client.call_method( - "NarrativeJobService.status", [], self._service_ver, context - ) - - def run_job(self, params, context=None): - """ - Start a new job (long running method of service registered in ServiceRegistery). 
- Such job runs Docker image for this service in script mode. - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. This data is passed to the User and Job - State (UJS) service. wsid - a workspace id to associate with the - job. This is passed to the UJS service, which will share the job - based on the permissions of the workspace rather than UJS ACLs. - parent_job_id - UJS id of the parent of a batch job. Sub jobs will - add this id to the NJS database under the field "parent_job_id") - -> structure: parameter "method" of String, parameter "params" of - list of unspecified object, parameter "service_ver" of String, - parameter "rpc_context" of type "RpcContext" (call_stack - - upstream calls details including nested service calls and parent - jobs where calls are listed in order from outer to inner.) -> - structure: parameter "call_stack" of list of type "MethodCall" - (time - the time the call was started; method - service defined in - standard JSON RPC way, typically it's module name from spec-file - followed by '.' and name of funcdef from spec-file corresponding - to running method (e.g. 'KBaseTrees.construct_species_tree' from - trees service); job_id - job id if method is asynchronous - (optional field).) -> structure: parameter "time" of type - "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is - either the character Z (representing the UTC timezone) or the - difference in time to UTC in the format +/-HHMM, eg: - 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC - time) 2013-04-03T08:56:32Z (UTC time)), parameter "method" of - String, parameter "job_id" of type "job_id" (A job id.), parameter - "run_id" of String, parameter "remote_url" of String, parameter - "source_ws_objects" of list of type "wsref" (A workspace object - reference of the form X/Y/Z, where X is the workspace name or id, - Y is the object name or id, Z is the version, which is optional.), - parameter "app_id" of String, parameter "meta" of mapping from - String to String, parameter "wsid" of Long, parameter - "parent_job_id" of String - :returns: instance of type "job_id" (A job id.) - """ - return self._client.call_method( - "NarrativeJobService.run_job", [params], self._service_ver, context - ) - - def get_job_params(self, job_id, context=None): - """ - Get job params necessary for job execution - :param job_id: instance of type "job_id" (A job id.) - :returns: multiple set - (1) parameter "params" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' 
and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. This data is passed to the User and Job - State (UJS) service. wsid - a workspace id to associate with the - job. This is passed to the UJS service, which will share the job - based on the permissions of the workspace rather than UJS ACLs. - parent_job_id - UJS id of the parent of a batch job. Sub jobs will - add this id to the NJS database under the field "parent_job_id") - -> structure: parameter "method" of String, parameter "params" of - list of unspecified object, parameter "service_ver" of String, - parameter "rpc_context" of type "RpcContext" (call_stack - - upstream calls details including nested service calls and parent - jobs where calls are listed in order from outer to inner.) -> - structure: parameter "call_stack" of list of type "MethodCall" - (time - the time the call was started; method - service defined in - standard JSON RPC way, typically it's module name from spec-file - followed by '.' and name of funcdef from spec-file corresponding - to running method (e.g. 'KBaseTrees.construct_species_tree' from - trees service); job_id - job id if method is asynchronous - (optional field).) -> structure: parameter "time" of type - "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is - either the character Z (representing the UTC timezone) or the - difference in time to UTC in the format +/-HHMM, eg: - 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC - time) 2013-04-03T08:56:32Z (UTC time)), parameter "method" of - String, parameter "job_id" of type "job_id" (A job id.), parameter - "run_id" of String, parameter "remote_url" of String, parameter - "source_ws_objects" of list of type "wsref" (A workspace object - reference of the form X/Y/Z, where X is the workspace name or id, - Y is the object name or id, Z is the version, which is optional.), - parameter "app_id" of String, parameter "meta" of mapping from - String to String, parameter "wsid" of Long, parameter - "parent_job_id" of String, (2) parameter "config" of mapping from - String to String - """ - return self._client.call_method( - "NarrativeJobService.get_job_params", [job_id], self._service_ver, context - ) - - def update_job(self, params, context=None): - """ - :param params: instance of type "UpdateJobParams" (is_started - - optional flag marking job as started (and triggering - exec_start_time statistics to be stored).) 
-> structure: parameter - "job_id" of type "job_id" (A job id.), parameter "is_started" of - type "boolean" (@range [0,1]) - :returns: instance of type "UpdateJobResults" -> structure: parameter - "messages" of list of String - """ - return self._client.call_method( - "NarrativeJobService.update_job", [params], self._service_ver, context - ) - - def add_job_logs(self, job_id, lines, context=None): - """ - :param job_id: instance of type "job_id" (A job id.) - :param lines: instance of list of type "LogLine" -> structure: - parameter "line" of String, parameter "is_error" of type "boolean" - (@range [0,1]) - :returns: instance of Long - """ - return self._client.call_method( - "NarrativeJobService.add_job_logs", - [job_id, lines], - self._service_ver, - context, - ) - - def get_job_logs(self, params, context=None): - """ - :param params: instance of type "GetJobLogsParams" (skip_lines - - optional parameter, number of lines to skip (in case they were - already loaded before).) -> structure: parameter "job_id" of type - "job_id" (A job id.), parameter "skip_lines" of Long - :returns: instance of type "GetJobLogsResults" (last_line_number - - common number of lines (including those in skip_lines parameter), - this number can be used as next skip_lines value to skip already - loaded lines next time.) -> structure: parameter "lines" of list - of type "LogLine" -> structure: parameter "line" of String, - parameter "is_error" of type "boolean" (@range [0,1]), parameter - "last_line_number" of Long - """ - return self._client.call_method( - "NarrativeJobService.get_job_logs", [params], self._service_ver, context - ) - - def finish_job(self, job_id, params, context=None): - """ - Register results of already started job - :param job_id: instance of type "job_id" (A job id.) - :param params: instance of type "FinishJobParams" (Either 'result', - 'error' or 'is_canceled' field should be defined; result - keeps - exact copy of what original server method puts in result block of - JSON RPC response; error - keeps exact copy of what original - server method puts in error block of JSON RPC response; - is_cancelled - Deprecated (field is kept for backward - compatibility), please use 'is_canceled' instead.) -> structure: - parameter "result" of unspecified object, parameter "error" of - type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "is_cancelled" of type "boolean" (@range [0,1]), - parameter "is_canceled" of type "boolean" (@range [0,1]) - """ - return self._client.call_method( - "NarrativeJobService.finish_job", - [job_id, params], - self._service_ver, - context, - ) - - def check_job(self, job_id, context=None): - """ - Check if a job is finished and get results/error - :param job_id: instance of type "job_id" (A job id.) 
- :returns: instance of type "JobState" (job_id - id of job running - method finished - indicates whether job is done (including - error/cancel cases) or not, if the value is true then either of - 'returned_data' or 'detailed_error' should be defined; ujs_url - - url of UserAndJobState service used by job service status - tuple - returned by UserAndJobState.get_job_status method result - keeps - exact copy of what original server method puts in result block of - JSON RPC response; error - keeps exact copy of what original - server method puts in error block of JSON RPC response; job_state - - 'queued', 'in-progress', 'completed', or 'suspend'; position - - position of the job in execution waiting queue; creation_time, - exec_start_time and finish_time - time moments of submission, - execution start and finish events in milliseconds since Unix - Epoch, canceled - whether the job is canceled or not. cancelled - - Deprecated field, please use 'canceled' field instead.) -> - structure: parameter "job_id" of String, parameter "finished" of - type "boolean" (@range [0,1]), parameter "ujs_url" of String, - parameter "status" of unspecified object, parameter "result" of - unspecified object, parameter "error" of type "JsonRpcError" - (Error block of JSON RPC response) -> structure: parameter "name" - of String, parameter "code" of Long, parameter "message" of - String, parameter "error" of String, parameter "job_state" of - String, parameter "position" of Long, parameter "creation_time" of - Long, parameter "exec_start_time" of Long, parameter "finish_time" - of Long, parameter "cancelled" of type "boolean" (@range [0,1]), - parameter "canceled" of type "boolean" (@range [0,1]) - """ - return self._client.call_method( - "NarrativeJobService.check_job", [job_id], self._service_ver, context - ) - - def check_jobs(self, params, context=None): - """ - :param params: instance of type "CheckJobsParams" -> structure: - parameter "job_ids" of list of type "job_id" (A job id.), - parameter "with_job_params" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs, job_params - parameters of jobs, check_error - this map - includes info about errors happening during job checking.) -> - structure: parameter "job_states" of mapping from type "job_id" (A - job id.) to type "JobState" (job_id - id of job running method - finished - indicates whether job is done (including error/cancel - cases) or not, if the value is true then either of 'returned_data' - or 'detailed_error' should be defined; ujs_url - url of - UserAndJobState service used by job service status - tuple - returned by UserAndJobState.get_job_status method result - keeps - exact copy of what original server method puts in result block of - JSON RPC response; error - keeps exact copy of what original - server method puts in error block of JSON RPC response; job_state - - 'queued', 'in-progress', 'completed', or 'suspend'; position - - position of the job in execution waiting queue; creation_time, - exec_start_time and finish_time - time moments of submission, - execution start and finish events in milliseconds since Unix - Epoch, canceled - whether the job is canceled or not. cancelled - - Deprecated field, please use 'canceled' field instead.) 
-> - structure: parameter "job_id" of String, parameter "finished" of - type "boolean" (@range [0,1]), parameter "ujs_url" of String, - parameter "status" of unspecified object, parameter "result" of - unspecified object, parameter "error" of type "JsonRpcError" - (Error block of JSON RPC response) -> structure: parameter "name" - of String, parameter "code" of Long, parameter "message" of - String, parameter "error" of String, parameter "job_state" of - String, parameter "position" of Long, parameter "creation_time" of - Long, parameter "exec_start_time" of Long, parameter "finish_time" - of Long, parameter "cancelled" of type "boolean" (@range [0,1]), - parameter "canceled" of type "boolean" (@range [0,1]), parameter - "job_params" of mapping from type "job_id" (A job id.) to type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. This data is passed to the User and Job - State (UJS) service. wsid - a workspace id to associate with the - job. This is passed to the UJS service, which will share the job - based on the permissions of the workspace rather than UJS ACLs. - parent_job_id - UJS id of the parent of a batch job. Sub jobs will - add this id to the NJS database under the field "parent_job_id") - -> structure: parameter "method" of String, parameter "params" of - list of unspecified object, parameter "service_ver" of String, - parameter "rpc_context" of type "RpcContext" (call_stack - - upstream calls details including nested service calls and parent - jobs where calls are listed in order from outer to inner.) -> - structure: parameter "call_stack" of list of type "MethodCall" - (time - the time the call was started; method - service defined in - standard JSON RPC way, typically it's module name from spec-file - followed by '.' and name of funcdef from spec-file corresponding - to running method (e.g. 'KBaseTrees.construct_species_tree' from - trees service); job_id - job id if method is asynchronous - (optional field).) 
-> structure: parameter "time" of type - "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is - either the character Z (representing the UTC timezone) or the - difference in time to UTC in the format +/-HHMM, eg: - 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC - time) 2013-04-03T08:56:32Z (UTC time)), parameter "method" of - String, parameter "job_id" of type "job_id" (A job id.), parameter - "run_id" of String, parameter "remote_url" of String, parameter - "source_ws_objects" of list of type "wsref" (A workspace object - reference of the form X/Y/Z, where X is the workspace name or id, - Y is the object name or id, Z is the version, which is optional.), - parameter "app_id" of String, parameter "meta" of mapping from - String to String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "check_error" of mapping from - type "job_id" (A job id.) to type "JsonRpcError" (Error block of - JSON RPC response) -> structure: parameter "name" of String, - parameter "code" of Long, parameter "message" of String, parameter - "error" of String - """ - return self._client.call_method( - "NarrativeJobService.check_jobs", [params], self._service_ver, context - ) - - def cancel_job(self, params, context=None): - """ - :param params: instance of type "CancelJobParams" -> structure: - parameter "job_id" of type "job_id" (A job id.) - """ - return self._client.call_method( - "NarrativeJobService.cancel_job", [params], self._service_ver, context - ) - - def check_job_canceled(self, params, context=None): - """ - Check whether a job has been canceled. This method is lightweight compared to check_job. - :param params: instance of type "CancelJobParams" -> structure: - parameter "job_id" of type "job_id" (A job id.) - :returns: instance of type "CheckJobCanceledResult" (job_id - id of - job running method finished - indicates whether job is done - (including error/cancel cases) or not canceled - whether the job - is canceled or not. 
ujs_url - url of UserAndJobState service used - by job service) -> structure: parameter "job_id" of type "job_id" - (A job id.), parameter "finished" of type "boolean" (@range - [0,1]), parameter "canceled" of type "boolean" (@range [0,1]), - parameter "ujs_url" of String - """ - return self._client.call_method( - "NarrativeJobService.check_job_canceled", - [params], - self._service_ver, - context, - ) diff --git a/deployment/bin/cron/clients/baseclient.py b/deployment/bin/cron/clients/baseclient.py deleted file mode 100644 index 1f78b54..0000000 --- a/deployment/bin/cron/clients/baseclient.py +++ /dev/null @@ -1,311 +0,0 @@ -############################################################ -# -# Autogenerated by the KBase type compiler - -# any changes made here will be overwritten -# -############################################################ - -from __future__ import print_function - -import json as _json -import requests as _requests -import random as _random -import os as _os -import traceback as _traceback -from requests.exceptions import ConnectionError -from urllib3.exceptions import ProtocolError - -try: - from configparser import ConfigParser as _ConfigParser # py 3 -except ImportError: - from ConfigParser import ConfigParser as _ConfigParser # py 2 - -try: - from urllib.parse import urlparse as _urlparse # py3 -except ImportError: - from urlparse import urlparse as _urlparse # py2 -import time - -_CT = "content-type" -_AJ = "application/json" -_URL_SCHEME = frozenset(["http", "https"]) -_CHECK_JOB_RETRYS = 3 - - -def _get_token(user_id, password, auth_svc): - # This is bandaid helper function until we get a full - # KBase python auth client released - # note that currently globus usernames, and therefore kbase usernames, - # cannot contain non-ascii characters. In python 2, quote doesn't handle - # unicode, so if this changes this client will need to change. - body = ( - "user_id=" - + _requests.utils.quote(user_id) - + "&password=" - + _requests.utils.quote(password) - + "&fields=token" - ) - ret = _requests.post(auth_svc, data=body, allow_redirects=True) - status = ret.status_code - if status >= 200 and status <= 299: - tok = _json.loads(ret.text) - elif status == 403: - raise Exception( - "Authentication failed: Bad user_id/password " - + "combination for user %s" % (user_id) - ) - else: - raise Exception(ret.text) - return tok["token"] - - -def _read_inifile( - file=_os.environ.get( # @ReservedAssignment - "KB_DEPLOYMENT_CONFIG", _os.environ["HOME"] + "/.kbase_config" - ) -): - # Another bandaid to read in the ~/.kbase_config file if one is present - authdata = None - if _os.path.exists(file): - try: - config = _ConfigParser() - config.read(file) - # strip down whatever we read to only what is legit - authdata = { - x: config.get("authentication", x) - if config.has_option("authentication", x) - else None - for x in ( - "user_id", - "token", - "client_secret", - "keyfile", - "keyfile_passphrase", - "password", - ) - } - except Exception as e: - print("Error while reading INI file {}: {}".format(file, e)) - return authdata - - -class ServerError(Exception): - def __init__(self, name, code, message, data=None, error=None): - super(Exception, self).__init__(message) - self.name = name - self.code = code - self.message = "" if message is None else message - self.data = data or error or "" - # data = JSON RPC 2.0, error = 1.1 - - def __str__(self): - return ( - self.name + ": " + str(self.code) + ". 
" + self.message + "\n" + self.data - ) - - -class _JSONObjectEncoder(_json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - if isinstance(obj, frozenset): - return list(obj) - return _json.JSONEncoder.default(self, obj) - - -class BaseClient(object): - """ - The KBase base client. - Required initialization arguments (positional): - url - the url of the the service to contact: - For SDK methods: either the url of the callback service or the - Narrative Job Service Wrapper. - For SDK dynamic services: the url of the Service Wizard. - For other services: the url of the service. - Optional arguments (keywords in positional order): - timeout - methods will fail if they take longer than this value in seconds. - Default 1800. - user_id - a KBase user name. - password - the password corresponding to the user name. - token - a KBase authentication token. - ignore_authrc - if True, don't read auth configuration from - ~/.kbase_config. - trust_all_ssl_certificates - set to True to trust self-signed certificates. - If you don't understand the implications, leave as the default, False. - auth_svc - the url of the KBase authorization service. - lookup_url - set to true when contacting KBase dynamic services. - async_job_check_time_ms - the wait time between checking job state for - asynchronous jobs run with the run_job method. - """ - - def __init__( - self, - url=None, - timeout=30 * 60, - user_id=None, - password=None, - token=None, - ignore_authrc=False, - trust_all_ssl_certificates=False, - auth_svc="https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login", - lookup_url=False, - async_job_check_time_ms=100, - async_job_check_time_scale_percent=150, - async_job_check_max_time_ms=300000, - ): - if url is None: - raise ValueError("A url is required") - scheme, _, _, _, _, _ = _urlparse(url) - if scheme not in _URL_SCHEME: - raise ValueError(url + " isn't a valid http url") - self.url = url - self.timeout = int(timeout) - self._headers = dict() - self.trust_all_ssl_certificates = trust_all_ssl_certificates - self.lookup_url = lookup_url - self.async_job_check_time = async_job_check_time_ms / 1000.0 - self.async_job_check_time_scale_percent = async_job_check_time_scale_percent - self.async_job_check_max_time = async_job_check_max_time_ms / 1000.0 - # token overrides user_id and password - if token is not None: - self._headers["AUTHORIZATION"] = token - elif user_id is not None and password is not None: - self._headers["AUTHORIZATION"] = _get_token(user_id, password, auth_svc) - elif "KB_AUTH_TOKEN" in _os.environ: - self._headers["AUTHORIZATION"] = _os.environ.get("KB_AUTH_TOKEN") - elif not ignore_authrc: - authdata = _read_inifile() - if authdata is not None: - if authdata.get("token") is not None: - self._headers["AUTHORIZATION"] = authdata["token"] - elif ( - authdata.get("user_id") is not None - and authdata.get("password") is not None - ): - self._headers["AUTHORIZATION"] = _get_token( - authdata["user_id"], authdata["password"], auth_svc - ) - if self.timeout < 1: - raise ValueError("Timeout value must be at least 1 second") - - def _call(self, url, method, params, context=None): - arg_hash = { - "method": method, - "params": params, - "version": "1.1", - "id": str(_random.random())[2:], - } - if context: - if type(context) is not dict: - raise ValueError("context is not type dict as required.") - arg_hash["context"] = context - - body = _json.dumps(arg_hash, cls=_JSONObjectEncoder) - ret = _requests.post( - url, - data=body, - headers=self._headers, 
- timeout=self.timeout, - verify=not self.trust_all_ssl_certificates, - ) - ret.encoding = "utf-8" - if ret.status_code == 500: - if ret.headers.get(_CT) == _AJ: - err = ret.json() - if "error" in err: - raise ServerError(**err["error"]) - else: - raise ServerError("Unknown", 0, ret.text) - else: - raise ServerError("Unknown", 0, ret.text) - if not ret.ok: - ret.raise_for_status() - resp = ret.json() - if "result" not in resp: - raise ServerError("Unknown", 0, "An unknown server error occurred") - if not resp["result"]: - return - if len(resp["result"]) == 1: - return resp["result"][0] - return resp["result"] - - def _get_service_url(self, service_method, service_version): - if not self.lookup_url: - return self.url - service, _ = service_method.split(".") - service_status_ret = self._call( - self.url, - "ServiceWizard.get_service_status", - [{"module_name": service, "version": service_version}], - ) - return service_status_ret["url"] - - def _set_up_context(self, service_ver=None, context=None): - if service_ver: - if not context: - context = {} - context["service_ver"] = service_ver - return context - - def _check_job(self, service, job_id): - return self._call(self.url, service + "._check_job", [job_id]) - - def _submit_job(self, service_method, args, service_ver=None, context=None): - context = self._set_up_context(service_ver, context) - mod, meth = service_method.split(".") - return self._call(self.url, mod + "._" + meth + "_submit", args, context) - - def run_job(self, service_method, args, service_ver=None, context=None): - """ - Run a SDK method asynchronously. - Required arguments: - service_method - the service and method to run, e.g. myserv.mymeth. - args - a list of arguments to the method. - Optional arguments: - service_ver - the version of the service to run, e.g. a git hash - or dev/beta/release. - context - the rpc context dict. - """ - mod, _ = service_method.split(".") - job_id = self._submit_job(service_method, args, service_ver, context) - async_job_check_time = self.async_job_check_time - check_job_failures = 0 - while check_job_failures < _CHECK_JOB_RETRYS: - time.sleep(async_job_check_time) - async_job_check_time = ( - async_job_check_time * self.async_job_check_time_scale_percent / 100.0 - ) - if async_job_check_time > self.async_job_check_max_time: - async_job_check_time = self.async_job_check_max_time - - try: - job_state = self._check_job(mod, job_id) - except (ConnectionError, ProtocolError): - _traceback.print_exc() - check_job_failures += 1 - continue - - if job_state["finished"]: - if not job_state["result"]: - return - if len(job_state["result"]) == 1: - return job_state["result"][0] - return job_state["result"] - raise RuntimeError( - "_check_job failed {} times and exceeded limit".format(check_job_failures) - ) - - def call_method(self, service_method, args, service_ver=None, context=None): - """ - Call a standard or dynamic service synchronously. - Required arguments: - service_method - the service and method to run, e.g. myserv.mymeth. - args - a list of arguments to the method. - Optional arguments: - service_ver - the version of the service to run, e.g. a git hash - or dev/beta/release. - context - the rpc context dict. 
- """ - url = self._get_service_url(service_method, service_ver) - context = self._set_up_context(service_ver, context) - return self._call(url, service_method, args, context) diff --git a/deployment/bin/cron/container_reaper.py b/deployment/bin/cron/container_reaper.py index 8a6f948..522fadd 100755 --- a/deployment/bin/cron/container_reaper.py +++ b/deployment/bin/cron/container_reaper.py @@ -1,163 +1,140 @@ #!/miniconda/bin/python -import datetime -import fnmatch +""" +This script is automatically run by the condor cronjob periodically +in order to clean up containers > 7 days or running without a starter +Required env vars are +# CONTAINER_REAPER_ENDPOINTS - A comma separated list of EE2 endpoints to manage containers for +# DELETE_ABANDONED_CONTAINERS - Set to true to enable the container reaper +# SLACK_WEBHOOK_URL - The slack webhook url to send messages to +""" + import json -import logging import os import socket +import subprocess +import time +from datetime import datetime, timedelta +from typing import Set import docker -import psutil import requests -from clients.NarrativeJobServiceClient import NarrativeJobService - -from typing import List, Dict - -slack_key = os.environ.get("SLACK_WEBHOOK_KEY", None) -# ee_notifications_channel -webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) - -kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false") -if kill.lower() == "true": - kill = True -else: - kill = False - -njs_endpoint_url = os.environ.get("NJS_ENDPOINT", None) - -if njs_endpoint_url is None: - raise Exception("NJS Endpoint not set") - -hostname = socket.gethostname() -dc = docker.from_env() - - -def find_dockerhub_jobs() -> Dict: - # send_slack_message(f"Job CONTAINER_REAPER is FINDING DOCKERHUB JOBS at {datetime.datetime.now()}") - - try: - all_containers = dc.containers - list = all_containers.list() - except Exception as e: - send_slack_message(str(e) + hostname) - - job_containers = {} - - for container in list: - cnt_id = container.id - try: - cnt = all_containers.get(cnt_id) - labels = cnt.labels - if "condor_id" in labels.keys() and "njs_endpoint" in labels.keys(): - labels["image"] = cnt.image - job_containers[cnt_id] = labels - except Exception as e: - logging.error(f"Container {cnt_id} doesn't exist anymore") - logging.error(e) - - return job_containers - - -def find_running_jobs(ps_name: str): - # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") - - "Return a list of processes matching 'name'." 
- ls = [] - for p in psutil.process_iter(attrs=["name", "cmdline"]): - if ps_name in p.info["cmdline"]: - ls.append(p.info["cmdline"][-2]) - return ls +from docker.models.containers import Container def send_slack_message(message: str): """ - :param message: Escaped Message to send to slack - :return: """ - + webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) slack_data = {"text": message} - response = requests.post( + requests.post( webhook_url, data=json.dumps(slack_data), headers={"Content-Type": "application/json"}, ) -def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): - now = datetime.datetime.now() +def filter_containers_by_time(potential_containers, days=0, minutes=0): + filtered_containers = [] + seven_days_ago = datetime.now() - timedelta(days=days, minutes=minutes) - job_id = labels.get("job_id", None) - # app_id = labels['app_id'] - app_name = labels.get("app_name", None) - method_name = labels.get("method_name", None) - condor_id = labels.get("condor_id", None) - username = labels.get("user_name", None) + for old_container in potential_containers: + # Do we need to catch the chance that there is no created attribute? + created_time_str = old_container.attrs['Created'][:26] + created_time = datetime.fromisoformat(created_time_str) + if created_time <= seven_days_ago: + filtered_containers.append(old_container) + return filtered_containers - msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) " - send_slack_message(msg) +def get_running_time_message(container, title=""): + image_name = container.attrs['Config']['Image'] + if "kbase" in image_name: + image_name = image_name.split(":")[1] + user_name = container.attrs['Config']['Labels'].get('user_name') -# @deprecated for EVENTLOG -def notify_user(cnt_id: str, labels: Dict): - username = labels.get("user_name", None) - job_id = labels.get("job_id", None) - # TODO add this to a configuration somewhere or ENV variable - job_directory = f"/mnt/awe/condor/{username}/{job_id}" + total_running_time = datetime.now() - datetime.fromisoformat(container.attrs['Created'][:26]) + days = total_running_time.days + hours = total_running_time.seconds // 3600 - print("About to notify") - print(labels) + formatted_running_time = f"{days}D:{hours}H" + return f"{title}:{hostname} {image_name}:{user_name}:{formatted_running_time}" - env_files = [] - for file in os.listdir(job_directory): - if fnmatch.fnmatch(file, "env_*"): - env_files.append(file) +def remove_with_backoff(container,message,backoff=30): + try: + container.stop() + time.sleep(backoff) # Wait for backoff period before attempting to remove + container.remove() + except Exception as e: + # Not much we can do here, just hope that the next pass will remove it + pass +def reap_containers_running_more_than_7_days(potential_containers: Set[Container]): + old_containers = filter_containers_by_time(potential_containers, days=7) - print(env_files) - env_filepath = env_files[0] - if os.path.isfile(env_filepath): - with open(env_filepath, "r") as content_file: - content = content_file.readlines() + if old_containers: + for old_container in old_containers: + message = get_running_time_message(old_container, title="reaper7daylimit") + send_slack_message(message) + remove_with_backoff(old_container, message) - token = None - for line in content: - if "KB_AUTH_TOKEN" in line: - token = line.split("=")[1] - if token: - njs = 
NarrativeJobService(token=token, url=njs_endpoint_url) - status = njs.check_job(job_id) - print(status) +def reap_containers_when_there_is_no_starter(potential_containers: Set[Container]): + """ + This function will reap containers that are running but have no starter, and have been running for 30 mins + """ + condor_starter = check_for_condor_starter() + if condor_starter: + return -def kill_docker_container(cnt_id: str): - if kill is True: - cnt = dc.containers.get(cnt_id) - cnt.kill() - else: - pass + runaway_containers = filter_containers_by_time(potential_containers, minutes=30) + if runaway_containers: + for runaway_container in runaway_containers: + message = get_running_time_message(runaway_container, title="reaper_no_starter") + send_slack_message(message) + remove_with_backoff(container,message) -def kill_dead_jobs(running_jobs: List, docker_processes: Dict): - # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}") - for cnt_id in docker_processes: - labels = docker_processes[cnt_id] - job_id = labels.get("job_id", None) - if job_id not in running_jobs: - if kill is True: - kill_docker_container(cnt_id) - notify_slack(cnt_id, labels, running_jobs) +def check_for_condor_starter(): + result = subprocess.run("ps -ef | grep '[c]ondor_starter'", shell=True, stdout=subprocess.PIPE, text=True) + count = len(result.stdout.strip().split('\n')) if result.stdout.strip() else 0 + return count > 0 if __name__ == "__main__": - try: - # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") - name = "us.kbase.narrativejobservice.sdkjobs.SDKLocalMethodRunner" + """ + PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no -q" pdsh -w rancher@km[2-28]-p "docker ps | grep kbase| grep days" | sort -V | grep -v worker + """ - running_java_jobs = find_running_jobs(name) - docker_jobs = find_dockerhub_jobs() - kill_dead_jobs(running_java_jobs, docker_jobs) - # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") - except Exception as e: - send_slack_message(f"FAILURE on {hostname}" + str(e.with_traceback())) - logging.error(e.with_traceback()) + CONTAINER_REAPER_ENDPOINTS = os.environ.get("CONTAINER_REAPER_ENDPOINTS", "").split(",") + DELETE_ABANDONED_CONTAINERS = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false").lower() == "true" + + if not DELETE_ABANDONED_CONTAINERS: + exit("DELETE_ABANDONED_CONTAINERS is not set to true") + if not CONTAINER_REAPER_ENDPOINTS or CONTAINER_REAPER_ENDPOINTS == [""]: + exit("No CONTAINER_REAPER_ENDPOINTS set, unsure where to manage containers") + + hostname = socket.gethostname() + dc = docker.from_env() + + # Define the filters to specify that you are searching for only your specific containers in a multi worker environment + # Also add user_name as a filter to make sure you aren't killing containers that happen to have EE2_ENDPOINT set, + # The chances of EE2_endpoint and user_name as labels on a container should be very small. 
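+    # As a rough illustration, the label query assembled below boils down to a
+    # docker-py call of this shape (the endpoint value here is hypothetical):
+    #
+    #   filters = {
+    #       "status": "running",
+    #       "label": ["ee2_endpoint=https://kbase.example.org/services/ee2", "user_name"],
+    #   }
+    #   matching = docker.from_env().containers.list(filters=filters)
+    #
+    # A "key=value" label entry must match exactly, while a bare "user_name"
+    # entry only requires that the label exists on the container.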
+ # CONTAINER_REAPER_ENDPOINTS = ["https://kbase.us/services/ee2", "https://appdev.kbase.us/services/ee2", "https://services.kbase.us/services/ee2/"] + unique_containers = set() + filters = {} + for endpoint in CONTAINER_REAPER_ENDPOINTS: + + filters.update({ + "status": "running", + "label": [ + f"ee2_endpoint={endpoint.strip()}", + "user_name" + ] + }) + containers = dc.containers.list(filters=filters) + for container in containers: + unique_containers.add(container) + + reap_containers_running_more_than_7_days(potential_containers=unique_containers) + reap_containers_when_there_is_no_starter(potential_containers=unique_containers) diff --git a/deployment/bin/cron/container_reaper_ee2.py b/deployment/bin/cron/container_reaper_ee2.py index d93ca77..56d936d 100755 --- a/deployment/bin/cron/container_reaper_ee2.py +++ b/deployment/bin/cron/container_reaper_ee2.py @@ -1,154 +1,152 @@ -#!/miniconda/bin/python -import datetime -import json -import logging -import os -import socket -from typing import List, Dict - -import docker -from docker.models.containers import Container -import psutil -import requests - -logging.basicConfig(level=logging.INFO) - -slack_key = os.environ.get("SLACK_WEBHOOK_KEY", None) -# ee_notifications_channel -webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) - -kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false") -if kill.lower() == "true": - kill = True -else: - kill = False - -ee2_endpoint_url = os.environ.get("EE2_ENDPOINT", None) - -if ee2_endpoint_url is None: - raise Exception("EE2 Endpoint not set") - -hostname = socket.gethostname() -dc = docker.from_env() - - -def find_dockerhub_jobs() -> Dict: - # send_slack_message(f"Job CONTAINER_REAPER is FINDING DOCKERHUB JOBS at {datetime.datetime.now()}") - - try: - all_containers = dc.containers - container_list = all_containers.list() - except Exception as e: - send_slack_message(str(e) + hostname) - raise e - - job_containers = {} - - for container in container_list: - cnt_id = container.id - try: - cnt = all_containers.get(cnt_id) - labels = cnt.labels - label_keys = labels.keys() - if ( - "condor_id" in label_keys - and "ee2_endpoint" in label_keys - and "worker_hostname" in label_keys - ): - if ( - labels.get("worker_hostname") == hostname - and labels.get("ee2_endpoint") == ee2_endpoint_url - ): - labels["image"] = cnt.image - job_containers[cnt_id] = labels - except Exception as e: - logging.error(f"Container {cnt_id} doesn't exist anymore") - logging.error(e) - - return job_containers - - -def find_running_jobs(): - "Return a list of job ids from running job processes. 
Since python procs have multiple entries, keep only 1 version" - # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") - ls = [] - for p in psutil.process_iter(attrs=["name", "cmdline"]): - if ( - "/miniconda/bin/python" in p.info["cmdline"] - and "./jobrunner.py" in p.info["cmdline"] - ): - ls.append(p.info["cmdline"][-2]) - return list(set(ls)) - - -def send_slack_message(message: str): - """ - - :param message: Escaped Message to send to slack - :return: - """ - - slack_data = {"text": message} - response = requests.post( - webhook_url, - data=json.dumps(slack_data), - headers={"Content-Type": "application/json"}, - ) - - -def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): - now = datetime.datetime.now() - - job_id = labels.get("job_id", None) - # app_id = labels['app_id'] - app_name = labels.get("app_name", None) - method_name = labels.get("method_name", None) - condor_id = labels.get("condor_id", None) - username = labels.get("user_name", None) - - msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) " - send_slack_message(msg) - - -def kill_docker_container(cnt_id: str): - """ - Kill a docker container. The job finish script should clean up after itself. - :param cnt_id: The container to kill/remove - """ - if kill is True: - cnt = dc.containers.get(cnt_id) # type: Container - try: - cnt.kill() - except Exception: - try: - cnt.remove(force=True) - except Exception: - send_slack_message(f"Couldn't delete {cnt_id} on {hostname}") - - -def kill_dead_jobs(running_jobs: List, docker_processes: Dict): - """ - Check whether there are runaway docker containers - :param running_jobs: A list of condor jobs gathered from the starter scripts - :param docker_processes: A list of docker containers - """ - # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}") - for cnt_id in docker_processes: - labels = docker_processes[cnt_id] - job_id = labels.get("job_id", None) - if job_id not in running_jobs: - notify_slack(cnt_id, labels, running_jobs) - if kill is True: - kill_docker_container(cnt_id) - - -if __name__ == "__main__": - try: - # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") - locally_running_jobrunners = find_running_jobs() - docker_jobs = find_dockerhub_jobs() - kill_dead_jobs(locally_running_jobrunners, docker_jobs) - # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") - except Exception as e: - send_slack_message(f"FAILURE on {hostname}" + str(e)) - logging.error(str(e)) +# #!/miniconda/bin/python +# import datetime +# import json +# import logging +# import os +# import socket +# from typing import List, Dict +# +# import docker +# import psutil +# import requests +# from docker.models.containers import Container +# +# # REQUIRED ENVIRONMENT VARIABLES +# ee2_endpoint_url = os.environ.get("EE2_ENDPOINT") +# if not ee2_endpoint_url: +# raise Exception("EE2 Endpoint not set") +# +# webhook_url = os.environ.get("SLACK_WEBHOOK_URL") +# if not webhook_url: +# raise Exception("SLACK_WEBHOOK_URL is not defined") +# +# # OPTIONAL ENVIRONMENT VARIABLES +# kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false").lower() == "true" +# +# logging.basicConfig(level=logging.INFO) +# hostname = socket.gethostname() +# dc = docker.from_env() +# +# +# def find_dockerhub_jobs() -> 
Dict: +# try: +# all_containers = dc.containers +# container_list = all_containers.list() +# except Exception as e: +# send_slack_message(str(e) + hostname) +# raise e +# +# job_containers = {} +# +# for container in container_list: +# cnt_id = container.id +# try: +# cnt = all_containers.get(cnt_id) +# labels = cnt.labels +# label_keys = labels.keys() +# if ( +# "condor_id" in label_keys +# and "ee2_endpoint" in label_keys +# and "worker_hostname" in label_keys +# ): +# if ( +# labels.get("worker_hostname") == hostname +# and labels.get("ee2_endpoint") == ee2_endpoint_url +# ): +# labels["image"] = cnt.image +# job_containers[cnt_id] = labels +# except Exception as e: +# logging.error(f"Container {cnt_id} doesn't exist anymore") +# logging.error(e) +# +# return job_containers +# +# +# def find_running_jobs(): +# """ +# Return a list of job ids from running job processes. +# Since python procs have multiple entries, keep only 1 version +# """ +# +# # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") +# ls = [] +# for p in psutil.process_iter(attrs=["name", "cmdline"]): +# if ( +# "/miniconda/bin/python" in p.info["cmdline"] +# and "./jobrunner.py" in p.info["cmdline"] +# ): +# ls.append(p.info["cmdline"][-2]) +# return list(set(ls)) +# +# +# def send_slack_message(message: str): +# """ +# +# :param message: Escaped Message to send to slack +# :return: +# """ +# +# slack_data = {"text": message} +# requests.post( +# webhook_url, +# data=json.dumps(slack_data), +# headers={"Content-Type": "application/json"}, +# ) +# +# +# def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): +# now = datetime.datetime.now() +# +# job_id = labels.get("job_id", None) +# # app_id = labels['app_id'] +# app_name = labels.get("app_name", None) +# method_name = labels.get("method_name", None) +# condor_id = labels.get("condor_id", None) +# username = labels.get("user_name", None) +# +# msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) " +# send_slack_message(msg) +# +# +# def kill_docker_container(cnt_id: str): +# """ +# Kill a docker container. The job finish script should clean up after itself. 
+# :param cnt_id: The container to kill/remove +# """ +# if kill is True: +# cnt = dc.containers.get(cnt_id) # type: Container +# try: +# cnt.kill() +# except Exception: +# try: +# cnt.remove(force=True) +# except Exception: +# send_slack_message(f"Couldn't delete {cnt_id} on {hostname}") +# +# +# def kill_dead_jobs(running_jobs: List, docker_processes: Dict): +# """ +# Check whether there are runaway docker containers +# :param running_jobs: A list of condor jobs gathered from the starter scripts +# :param docker_processes: A list of docker containers +# """ +# # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}") +# for cnt_id in docker_processes: +# labels = docker_processes[cnt_id] +# job_id = labels.get("job_id", None) +# if job_id not in running_jobs: +# notify_slack(cnt_id, labels, running_jobs) +# if kill is True: +# kill_docker_container(cnt_id) +# +# +# if __name__ == "__main__": +# try: +# # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") +# locally_running_jobrunners = find_running_jobs() +# docker_jobs = find_dockerhub_jobs() +# kill_dead_jobs(locally_running_jobrunners, docker_jobs) +# # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") +# except Exception as ev: +# send_slack_message(f"FAILURE on {hostname}" + str(ev)) +# logging.error(str(ev)) diff --git a/deployment/bin/cron/delete_exited_containers.py b/deployment/bin/cron/delete_exited_containers.py index 60c9ca5..fe9640a 100755 --- a/deployment/bin/cron/delete_exited_containers.py +++ b/deployment/bin/cron/delete_exited_containers.py @@ -1,17 +1,18 @@ #!/miniconda/bin/python -import os +# This script is automatically run by the condor cronjob periodically +# in order to clean up exited docker containers. 
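+# As a sketch, the cleanup below reduces to a few docker-py calls (this assumes
+# a local Docker daemon and a SLACK_WEBHOOK_URL in the environment):
+#
+#   dc = docker.from_env()
+#   exited = dc.containers.list(filters={"status": "exited"})
+#   if exited:
+#       dc.containers.prune()  # removes all stopped containers in one pass
+#       send_slack_message(f"Deleted {len(exited)} exited containers")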
import json -import requests -import docker +import os import socket -import datetime + +import docker +import requests def send_slack_message(message: str): """ :param message: Escaped Message to send to slack """ - # ee_notifications_channel webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) slack_data = {"text": message} requests.post( @@ -22,14 +23,10 @@ def send_slack_message(message: str): if __name__ == "__main__": - # send_slack_message(f"Job DELETE_EXITED is beginning at {datetime.datetime.now()}") hostname = socket.gethostname() dc = docker.from_env() ec = dc.containers.list(filters={"status": "exited"}) - count = len(ec) - - if count > 0: + container_image_names = [c.attrs["Config"]["Image"] for c in ec] + if container_image_names: dc.containers.prune() - send_slack_message(f"Deleted {count} stopped containers on {hostname}") - - # send_slack_message(f"Job DELETE_EXITED is ENDING at {datetime.datetime.now()}") + send_slack_message(f"Deleted {len(ec)} `exited` containers on {hostname} {container_image_names}") diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index 80cebc4..69d1c1c 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -17,44 +17,38 @@ import psutil import requests - -def send_slack_message(message: str): - """ - :param message: Escaped Message to send to slack - """ - # ee_notifications_channel - webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) - slack_data = {"text": message} - requests.post( - webhook_url, - data=json.dumps(slack_data), - headers={"Content-Type": "application/json"}, - ) - - -debug = False +# Optional environment variables +var_lib_docker = os.environ.get("DOCKER_CACHE", "/var/lib/docker/") scratch = os.environ.get("CONDOR_SUBMIT_WORKDIR", "/cdr") scratch += os.environ.get("EXECUTE_SUFFIX", "") -check_condor_starter_health = ( - os.environ.get("CHECK_CONDOR_STARTER_HEALTH", "true").lower() == "true" -) - -# Endpoint +check_condor_starter_health = (os.environ.get("CHECK_CONDOR_STARTER_HEALTH", "true").lower() == "true") +debug = (os.environ.get("DEBUG", "false").lower() == "true") +# Required environment variables endpoint = os.environ.get("SERVICE_ENDPOINT", None) - if endpoint is None: exit("SERVICE_ENDPOINT is not defined") -# Docker Cache -var_lib_docker = os.environ.get("DOCKER_CACHE", "/var/lib/docker/") +webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) +if webhook_url is None: + exit("SLACK_WEBHOOK_URL is not defined") + + user = "nobody" pid = pwd.getpwnam(user).pw_uid gid = pwd.getpwnam(user).pw_gid - -# TODO Report to nagios +def send_slack_message(message: str): + """ + :param message: Escaped Message to send to slack + """ + slack_data = {"text": message} + requests.post( + webhook_url, + data=json.dumps(slack_data), + headers={"Content-Type": "application/json"}, + ) def exit_unsuccessfully(message: str, send_to_slack=True): @@ -66,7 +60,7 @@ def exit_unsuccessfully(message: str, send_to_slack=True): print("- update:true") now = datetime.datetime.now() - if send_to_slack is True: + if send_to_slack: send_slack_message( f"POSSIBLE BLACK HOLE: Ran healthcheck at {now} on {socket.gethostname()} with failure: {message}" ) @@ -136,8 +130,8 @@ def test_docker_socket(): """ Check to see if the nobody user has access to the docker socket """ - socket = "/var/run/docker.sock" - socket_gid = os.stat(socket).st_gid + socket_location = "/var/run/docker.sock" + socket_gid = os.stat(socket_location).st_gid # TODO FIX THIS TEST.. 
GROUPS ARE NOT BEING CORRECTLY SET INSIDE THE DOCKER CONTAINER gids = [999, 996, 995, 987] @@ -145,7 +139,7 @@ def test_docker_socket(): return message = ( - f"Cannot access docker socket, check to make sure permissions of user in {gids}" + f"test_docker_socket: Cannot access docker socket, check to make sure permissions of user in {gids}" ) exit_unsuccessfully(message) @@ -156,13 +150,13 @@ def test_docker_socket2(): """ dc = docker.from_env() if len(dc.containers.list()) < 1: - message = f"Cannot access docker socket" + message = f"Nobody User cannot access docker socket" exit_unsuccessfully(message) -def test_world_writeable(): +def test_scratch_world_writeable(): """ - Check to see if /mnt/awe/condor is writeable + Check to see if /cdr/scratch is writeable """ # Strip out octal 0o perms = str(oct(stat.S_IMODE(os.stat(scratch).st_mode))).lstrip("0").lstrip("o") @@ -170,7 +164,7 @@ def test_world_writeable(): if perms == "01777" or perms == "1777" or perms == "0o1777": return else: - message = f"Cannot access {scratch} gid={os.stat(scratch).st_gid} perms={perms}" + message = f"Scratch not world writeable: Cannot access {scratch} gid={os.stat(scratch).st_gid} perms={perms}" exit_unsuccessfully(message) @@ -184,21 +178,19 @@ def test_enough_space(mount_point, nickname, percentage): try: usage = subprocess.check_output(cmd, shell=True).decode().strip() if int(usage) < percentage: - # send_slack_message( - # f"The amount of usage {usage} for {mount_point} ({nickname}) which is less than {percentage}") return else: message = f"Can't access {mount_point} ({nickname}) or not enough space ({usage}% > {percentage}%)" exit_unsuccessfully(message) except Exception as e: message = ( - f"Can't access {mount_point} ({nickname}) or not enough space {usage}" - + str(e) + f"Can't access {mount_point} ({nickname}) or not enough space {usage}" + + str(e) ) exit_unsuccessfully(message) -def checkEndpoints(): +def check_kbase_endpoints(): """ Check auth/njs/catalog/ws """ @@ -232,21 +224,18 @@ def checkEndpoints(): message = f"Couldn't reach {service}. {e}" exit_unsuccessfully(message) - - def main(): try: # send_slack_message(f"Job HEALTH_CHECK is beginning at {datetime.datetime.now()}") test_docker_socket() test_docker_socket2() - test_world_writeable() + test_scratch_world_writeable() test_enough_space(scratch, "scratch", 95) test_enough_space(var_lib_docker, "docker", 95) test_free_memory() test_condor_starter() - checkEndpoints() - # send_slack_message(f"Job HEALTH_CHECK is ENDING at {datetime.datetime.now()}") + check_kbase_endpoints() except Exception as e: exit_unsuccessfully(str(e)) exit_successfully() diff --git a/deployment/bin/cruft/check_abandoned_containers.py b/deployment/bin/cruft/check_abandoned_containers.py deleted file mode 100644 index 156d60a..0000000 --- a/deployment/bin/cruft/check_abandoned_containers.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python -# This script is used to find abandoned containers running on a condor worker. 
-# It requires a webhook URL environmental variable in order to send a notification to a slack channel -import datetime -import json -import logging -import os -import subprocess -import time - -import requests - -logging.basicConfig(level=logging.DEBUG) - -# Improvements: Use a library - -while True: - delete = os.environ.get("DELETE_ABANDONED_CONTAINERS") - webhook_url = os.environ.get("SLACK_WEBHOOK_URL") - - hostname = subprocess.check_output("hostname").strip() - logging.info("About to check for jobs on" + str(hostname)) - - try: - cmd = "docker ps | grep dockerhub | cut -f1 -d' '" - running_containers = subprocess.check_output(cmd, shell=True) - - container_ids = running_containers.split("\n") - container_ids = filter(None, container_ids) - - cmd = 'ps -ax -o command | egrep "java -cp /mnt/awe/condor/.+/NJSWrapper-all.jar us.kbase.narrativejobservice.sdkjobs.SDKLocalMethodRunner" | grep -v grep | cut -f5 -d" "' - java_procs = str(subprocess.check_output(cmd, shell=True)) - running_job_ids = java_procs.split("\n") - - running_job_ids = filter(None, running_job_ids) - - logging.info(running_job_ids) - - now = datetime.datetime.now() - - for container_id in container_ids: - - # Try catch here so the script can keep going - try: - cmd = ( - "docker inspect --format '{{ index .Config.Labels \"job_id\"}}' " - + str(container_id) - ) - ujs_id = str(subprocess.check_output(cmd, shell=True).strip()) - cmd = ( - "docker inspect --format '{{ index .Config.Labels \"condor_id\"}}' " - + str(container_id) - ) - condor_id = str(subprocess.check_output(cmd, shell=True).strip()) - - # Skip containers without a condor or worker id - if len(ujs_id) == 0 and len(condor_id) == 0: - continue - - if ujs_id not in running_job_ids: - message = "container:[{}] job_id:[{}] condor_id:[{}] is dead ({}) {} ".format( - container_id, ujs_id, condor_id, hostname, now - ) - - slack_data = {"text": message} - - response = requests.post( - webhook_url, - data=json.dumps(slack_data), - headers={"Content-Type": "application/json"}, - ) - - if delete == "true": - cmd = "docker stop {} && docker container rm -v {}".format( - container_id, container_id - ) - logging.error(message) - logging.error(cmd) - output = subprocess.check_output(cmd, shell=True) - - elif ujs_id in running_job_ids: - logging.info("Job still running: " + ujs_id) - - except Exception as e: - print(e) - - except Exception as e: - print(e) - - time.sleep(60) diff --git a/deployment/bin/cruft/check_abandoned_containers.sh b/deployment/bin/cruft/check_abandoned_containers.sh deleted file mode 100644 index 579fbe3..0000000 --- a/deployment/bin/cruft/check_abandoned_containers.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -#This script is used to find abandoned containers running on a condor worker. 
-#It requires a webhook URL environmental variable in order to send a notification to a slack channel - -delete=${DELETE_ABANDONDED_CONTAINERS} -webhook_url=${SLACK_WEBHOOK_URL} -hostname=`hostname` -running="2" - -while true -do - running_containers=`docker ps | grep dockerhub | cut -f1 -d' '` - for container_id in ${running_containers} - do - condor_id=`docker inspect ${container_id} | grep condor_id | egrep -o "[0-9]+\.[0-9]"` - last_job_status=`condor_q ${condor_id} -attributes JobStatus -long | egrep -o "[0-9]"` - remote_host=`condor_q ${condor_id} -attributes RemoteHost -long | cut -f2 -d'='` - last_remote_host=`condor_q ${condor_id} -attributes LastRemoteHost -long | cut -f2 -d'='` - - if [[ ${last_job_status} = 2 ]]; - then - message="container_id ${condor_id} ${last_job_status} ${remote_host} ${last_remote_host} running" - else - message="DOCKER_ID:${container_id} CONDOR_ID:${condor_id} STATUS:${last_job_status} HOST:${remote_host} ${last_remote_host} (${hostname}) container is abandoned" - curl -X POST -H 'Content-type: application/json' --data "{'text':'${message}'}" $webhook_url - if [[ ${delete} = true ]]; - then - docker stop ${container_id} && docker container rm -v ${container_id} - fi - fi - done -sleep 60 -done diff --git a/deployment/bin/cruft/delete_exited_containers.sh b/deployment/bin/cruft/delete_exited_containers.sh deleted file mode 100755 index e143119..0000000 --- a/deployment/bin/cruft/delete_exited_containers.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash -# Usage - chmod +x -# ./remove_exited_containers.sh -# Sends a message about the containers you are going to delete on the host you run this on - - -#ee_notifications channel -webhook_url=${SLACK_WEBHOOK_URL} - -hostname=`hostname` - -# This will avoid inadvertently removing any containers which happen to have -# the word Exit in the name or command, and won't stop working if the output format of "docker ps -a" ever changes. - -exited_containers=`docker ps -a --filter status=exited --format {{.ID}}` -n=`docker ps -a --filter status=exited --format {{.ID}} | wc -l` -message="Deleting $n exiting containers from $hostname" - -# This cannot remove running containers -if [[ ${n} > 0 ]]; -then - `echo $exited_containers | xargs docker rm` - curl -X POST -H 'Content-type: application/json' --data "{'text':'${message}'}" $webhook_url -fi diff --git a/deployment/conf/.templates/cronjobs.config.templ b/deployment/conf/.templates/cronjobs.config.templ index b0cbb03..f3c6057 100644 --- a/deployment/conf/.templates/cronjobs.config.templ +++ b/deployment/conf/.templates/cronjobs.config.templ @@ -1,32 +1,26 @@ -# SLACK_WEBHOOK_KEY={{ .Env.SLACK_WEBHOOK_KEY }} - -# startd hook to check if node is healthy +# This checks if the node is healthy and reports to slack if it is not. 
Sets NODE_IS_HEALTHY to True or False STARTD_CRON_NodeHealth_EXECUTABLE = /kb/deployment/bin/cron/health_check.py STARTD_CRON_NodeHealth_PERIOD = 6m STARTD_CRON_NodeHealth_MODE = Periodic STARTD_CRON_NodeHealth_RECONFIG_RERUN = True -STARTD_CRON_NodeHealth_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" +STARTD_CRON_NodeHealth_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} EXECUTE_SUFFIX={{ .Env.EXECUTE_SUFFIX }} CHECK_CONDOR_STARTER_HEALTH={{ .Env.CHECK_CONDOR_STARTER_HEALTH }} " + -# startd hook to delete exited containers +# startd hook to delete exited containers (Might want to leave this longer for debugging) STARTD_CRON_DeleteExitedContainers_EXECUTABLE = /kb/deployment/bin/cron/delete_exited_containers.py -STARTD_CRON_DeleteExitedContainers_PERIOD = 10m +STARTD_CRON_DeleteExitedContainers_PERIOD = 30m STARTD_CRON_DeleteExitedContainers_MODE = Periodic STARTD_CRON_DeleteExitedContainers_RECONFIG_RERUN = True STARTD_CRON_DeleteExitedContainers_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }}" -# startd hook to delete abandoned containers -STARTD_CRON_ReapAbandondedContainers_EXECUTABLE = /kb/deployment/bin/cron/container_reaper.py -STARTD_CRON_ReapAbandondedContainers_PERIOD = 6m -STARTD_CRON_ReapAbandondedContainers_MODE = Periodic -STARTD_CRON_ReapAbandondedContainers_RECONFIG_RERUN = True -STARTD_CRON_ReapAbandondedContainers_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" -# startd hook to delete abandoned containers -STARTD_CRON_ReapAbandondedContainersEE2_EXECUTABLE = /kb/deployment/bin/cron/container_reaper_ee2.py -STARTD_CRON_ReapAbandondedContainersEE2_PERIOD = 6m -STARTD_CRON_ReapAbandondedContainersEE2_MODE = Periodic -STARTD_CRON_ReapAbandondedContainersEE2_RECONFIG_RERUN = True -STARTD_CRON_ReapAbandondedContainersEE2_ENV = "EE2_ENDPOINT={{ .Env.EE2_ENDPOINT }} SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" +# Container Reaper Version 2024 +STARTD_CRON_ContainerReaper_EXECUTABLE = /kb/deployment/bin/cron/container_reaper.py +STARTD_CRON_ContainerReaper_PERIOD = 6m +STARTD_CRON_ContainerReaper_MODE = Periodic +STARTD_CRON_ContainerReaper_RECONFIG_RERUN = True +STARTD_CRON_ContainerReaper_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} CONTAINER_REAPER_ENDPOINTS={{ .Env.CONTAINER_REAPER_ENDPOINTS }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" + # Tmpwatch $CONDOR_SUBMIT_WORKDIR STARTD_CRON_ManageCondorSubmitWorkdir_EXECUTABLE = /usr/sbin/tmpwatch @@ -36,12 +30,12 @@ STARTD_CRON_ManageCondorSubmitWorkdir_MODE = Periodic STARTD_CRON_ManageCondorSubmitWorkdir_RECONFIG_RERUN = True STARTD_CRON_ManageCondorSubmitWorkdir_ENV = "CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} " -# Prune docker every 14 days.. 
This works right now, but need to redirect to a script +# Prune docker every 14 days STARTD_CRON_ManageVarLibDocker_EXECUTABLE = /usr/bin/docker STARTD_CRON_ManageVarLibDocker_ARGS = system prune -a -f STARTD_CRON_ManageVarLibDocker_PERIOD = 336h STARTD_CRON_ManageVarLibDocker_MODE = Periodic -STARTD_CRON_ManageCondorSubmitWorkdir_RECONFIG_RERUN = True +STARTD_CRON_ManageVarLibDocker_RECONFIG_RERUN = True -STARTD_CRON_JOBLIST = NodeHealth ReapAbandondedContainersEE2 ManageVarLibDocker ManageCondorSubmitWorkdir +STARTD_CRON_JOBLIST = NodeHealth ContainerReaper ManageVarLibDocker ManageCondorSubmitWorkdir # STARTD_CRON_AUTOPUBLISH = If_Changed From db01164112bf90108795e2a1604e2f410f368224 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 24 Jan 2024 22:20:35 -0600 Subject: [PATCH 13/18] Update README.md (#62) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a855d50..cbf8ec3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Condor-worker requirements +# Custom DockerFile, Configurations, Helper Scripts, and CronJobs for KBase Condor Worker The condor workers require From f363d533d1b89bee8d4642e27cd2b2b13b70f0a3 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 24 Jan 2024 22:25:03 -0600 Subject: [PATCH 14/18] Update RELEASE_NOTES.txt --- RELEASE_NOTES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 68f6411..f70fbed 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -1,4 +1,4 @@ -1.0.10 +1.0.11 ======== * Modify Cronjobs that look for runaway containers * Update Documentation From 939a5c1e04a8c5e741f34293775e6bd392bb08b2 Mon Sep 17 00:00:00 2001 From: Boris Date: Wed, 24 Jan 2024 23:30:19 -0600 Subject: [PATCH 15/18] Fix conflicts --- RELEASE_NOTES.txt | 9 -- deployment/bin/cron/container_reaper_ee2.py | 152 -------------------- 2 files changed, 161 deletions(-) delete mode 100755 deployment/bin/cron/container_reaper_ee2.py diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index e7ea601..75b7879 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -11,15 +11,6 @@ * Pin versions due for Python * Update base image to `htcondor/execute:lts-el8` - -1.0.10 -======== -* Fix health check -* Fix dependencies for JobRunner -* Pin versions due for Python -* Update base image to `htcondor/execute:lts-el8` - - 1.0.9 ======= * Add GHA Actions diff --git a/deployment/bin/cron/container_reaper_ee2.py b/deployment/bin/cron/container_reaper_ee2.py deleted file mode 100755 index 56d936d..0000000 --- a/deployment/bin/cron/container_reaper_ee2.py +++ /dev/null @@ -1,152 +0,0 @@ -# #!/miniconda/bin/python -# import datetime -# import json -# import logging -# import os -# import socket -# from typing import List, Dict -# -# import docker -# import psutil -# import requests -# from docker.models.containers import Container -# -# # REQUIRED ENVIRONMENT VARIABLES -# ee2_endpoint_url = os.environ.get("EE2_ENDPOINT") -# if not ee2_endpoint_url: -# raise Exception("EE2 Endpoint not set") -# -# webhook_url = os.environ.get("SLACK_WEBHOOK_URL") -# if not webhook_url: -# raise Exception("SLACK_WEBHOOK_URL is not defined") -# -# # OPTIONAL ENVIRONMENT VARIABLES -# kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false").lower() == "true" -# -# logging.basicConfig(level=logging.INFO) -# hostname = socket.gethostname() -# dc = docker.from_env() -# -# -# def find_dockerhub_jobs() -> Dict: -# try: -# all_containers = dc.containers -# container_list = all_containers.list() -# 
except Exception as e: -# send_slack_message(str(e) + hostname) -# raise e -# -# job_containers = {} -# -# for container in container_list: -# cnt_id = container.id -# try: -# cnt = all_containers.get(cnt_id) -# labels = cnt.labels -# label_keys = labels.keys() -# if ( -# "condor_id" in label_keys -# and "ee2_endpoint" in label_keys -# and "worker_hostname" in label_keys -# ): -# if ( -# labels.get("worker_hostname") == hostname -# and labels.get("ee2_endpoint") == ee2_endpoint_url -# ): -# labels["image"] = cnt.image -# job_containers[cnt_id] = labels -# except Exception as e: -# logging.error(f"Container {cnt_id} doesn't exist anymore") -# logging.error(e) -# -# return job_containers -# -# -# def find_running_jobs(): -# """ -# Return a list of job ids from running job processes. -# Since python procs have multiple entries, keep only 1 version -# """ -# -# # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") -# ls = [] -# for p in psutil.process_iter(attrs=["name", "cmdline"]): -# if ( -# "/miniconda/bin/python" in p.info["cmdline"] -# and "./jobrunner.py" in p.info["cmdline"] -# ): -# ls.append(p.info["cmdline"][-2]) -# return list(set(ls)) -# -# -# def send_slack_message(message: str): -# """ -# -# :param message: Escaped Message to send to slack -# :return: -# """ -# -# slack_data = {"text": message} -# requests.post( -# webhook_url, -# data=json.dumps(slack_data), -# headers={"Content-Type": "application/json"}, -# ) -# -# -# def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): -# now = datetime.datetime.now() -# -# job_id = labels.get("job_id", None) -# # app_id = labels['app_id'] -# app_name = labels.get("app_name", None) -# method_name = labels.get("method_name", None) -# condor_id = labels.get("condor_id", None) -# username = labels.get("user_name", None) -# -# msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) " -# send_slack_message(msg) -# -# -# def kill_docker_container(cnt_id: str): -# """ -# Kill a docker container. The job finish script should clean up after itself. 
-# :param cnt_id: The container to kill/remove -# """ -# if kill is True: -# cnt = dc.containers.get(cnt_id) # type: Container -# try: -# cnt.kill() -# except Exception: -# try: -# cnt.remove(force=True) -# except Exception: -# send_slack_message(f"Couldn't delete {cnt_id} on {hostname}") -# -# -# def kill_dead_jobs(running_jobs: List, docker_processes: Dict): -# """ -# Check whether there are runaway docker containers -# :param running_jobs: A list of condor jobs gathered from the starter scripts -# :param docker_processes: A list of docker containers -# """ -# # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}") -# for cnt_id in docker_processes: -# labels = docker_processes[cnt_id] -# job_id = labels.get("job_id", None) -# if job_id not in running_jobs: -# notify_slack(cnt_id, labels, running_jobs) -# if kill is True: -# kill_docker_container(cnt_id) -# -# -# if __name__ == "__main__": -# try: -# # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") -# locally_running_jobrunners = find_running_jobs() -# docker_jobs = find_dockerhub_jobs() -# kill_dead_jobs(locally_running_jobrunners, docker_jobs) -# # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") -# except Exception as ev: -# send_slack_message(f"FAILURE on {hostname}" + str(ev)) -# logging.error(str(ev)) From 2a452c9a2265d63a11ace99fb173acc3ea1c1009 Mon Sep 17 00:00:00 2001 From: Boris Date: Thu, 25 Jan 2024 18:18:38 -0600 Subject: [PATCH 16/18] Add tini --- Dockerfile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Dockerfile b/Dockerfile index 848dcf4..3e2a7be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -61,3 +61,10 @@ CMD [ "-template", "/kb/deployment/conf/.templates/deployment.cfg.templ:/kb/depl "/kb/deployment/bin/start_server.sh" ] WORKDIR /kb/deployment/jettybase + +ENV TINI_VERSION v0.19.0 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini.asc /tini.asc +RUN gpg --batch --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 595E85A6B1B4779EA4DAAEC70B588DFF0527A9B7 \ + && gpg --batch --verify /tini.asc /tini +RUN chmod +x /tini && cp /tini /usr/bin/docker-init From 82bc93f3cd16717984bf2403478977bfd0d219e5 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 25 Jan 2024 18:26:59 -0600 Subject: [PATCH 17/18] Update RELEASE_NOTES.txt --- RELEASE_NOTES.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 75b7879..0f04db3 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -3,6 +3,7 @@ * Modify Cronjobs that look for runaway containers * Update Documentation * Deprecate container_reaper_ee2.py in favor of container_reaper.py +* Update htcondor image to add TINI back in until we use supervisor.d 1.0.10 ======== From 81d7bc2b111adbfbcb2465cef58936cc4cb018ea Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 26 Jan 2024 18:29:19 -0600 Subject: [PATCH 18/18] DEVOPS-1593 Condor Cronjob Cleanup (#63) * Delete un-needed configs * Remove Shared Port Demon * Fix cronjob scripts --------- Co-authored-by: Boris --- Dockerfile | 31 ++++--- RELEASE_NOTES.txt | 1 + deployment/bin/README.md | 3 - .../bin/cron/delete_exited_containers.py | 13 ++- deployment/bin/cron/health_check.py | 13 +-- deployment/bin/docker-init.sh | 11 +++ deployment/bin/misc/java_stats.sh | 10 --- deployment/bin/misc/jshell-wrapper | 7 -- deployment/bin/start-condor.sh | 8 +- 
.../.templates/condor_config_worker.templ | 80 ++++++++----------- .../conf/.templates/cronjobs.config.templ | 2 +- .../conf/.templates/deployment.cfg.templ | 61 -------------- .../conf/.templates/shared_port_config.templ | 4 - .../conf/legacy/condor_config_worker2.templ | 71 ++++++++++++++++ .../limitBigMemSlots.templ | 0 .../start_server.sh.templ | 0 16 files changed, 157 insertions(+), 158 deletions(-) delete mode 100644 deployment/bin/README.md create mode 100755 deployment/bin/docker-init.sh delete mode 100755 deployment/bin/misc/java_stats.sh delete mode 100755 deployment/bin/misc/jshell-wrapper delete mode 100644 deployment/conf/.templates/deployment.cfg.templ delete mode 100644 deployment/conf/.templates/shared_port_config.templ create mode 100644 deployment/conf/legacy/condor_config_worker2.templ rename deployment/conf/{.templates => legacy}/limitBigMemSlots.templ (100%) rename deployment/conf/{.templates => legacy}/start_server.sh.templ (100%) diff --git a/Dockerfile b/Dockerfile index 3e2a7be..3b358a1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM htcondor/execute:lts-el8 ENV container docker -# Ge$t commonly used utilities +# Get commonly used utilities RUN yum -y update && yum upgrade -y RUN yum install -y drpm RUN yum -y install -y epel-release wget which git gcc libcgroup libcgroup-tools stress-ng tmpwatch procps @@ -14,7 +14,7 @@ RUN yum install -y yum-utils device-mapper-persistent-data lvm2 && yum-config-ma #Install Python3 and Libraries (source /root/miniconda/bin/activate) RUN yum install -y bzip2 \ && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ -&& bash ~/miniconda.sh -b -p /miniconda \ +&& bash ~/miniconda.sh -b -p /miniconda ENV PATH="/miniconda/bin:${PATH}" @@ -22,8 +22,6 @@ ENV PATH="/miniconda/bin:${PATH}" # Add kbase user and set up directories RUN useradd -c "KBase user" -rd /kb/deployment/ -u 998 -s /bin/bash kbase && \ mkdir -p /kb/deployment/bin && \ - mkdir -p /kb/deployment/jettybase/logs/ && \ - touch /kb/deployment/jettybase/logs/request.log && \ chown -R kbase /kb/deployment #INSTALL DOCKERIZE @@ -38,9 +36,7 @@ RUN mkdir -p /var/run/condor && mkdir -p /var/log/condor && mkdir -p /var/lock/c # Maybe you want: rm -rf /var/cache/yum, to also free up space taken by orphaned data from disabled or removed repos RUN rm -rf /var/cache/yum -COPY --chown=kbase deployment/ /kb/deployment/ -RUN /kb/deployment/bin/install_python_dependencies.sh # The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to # the end @@ -51,16 +47,6 @@ LABEL org.label-schema.build-date=$BUILD_DATE \ us.kbase.vcs-branch=$BRANCH \ maintainer="Steve Chan sychan@lbl.gov" -ENTRYPOINT [ "/kb/deployment/bin/dockerize" ] -CMD [ "-template", "/kb/deployment/conf/.templates/deployment.cfg.templ:/kb/deployment/conf/deployment.cfg", \ - "-template", "/kb/deployment/conf/.templates/http.ini.templ:/kb/deployment/jettybase/start.d/http.ini", \ - "-template", "/kb/deployment/conf/.templates/server.ini.templ:/kb/deployment/jettybase/start.d/server.ini", \ - "-template", "/kb/deployment/conf/.templates/start_server.sh.templ:/kb/deployment/bin/start_server.sh", \ - "-template", "/kb/deployment/conf/.templates/condor_config.templ:/etc/condor/condor_config.local", \ - "-stdout", "/kb/deployment/jettybase/logs/request.log", \ - "/kb/deployment/bin/start_server.sh" ] - -WORKDIR /kb/deployment/jettybase ENV TINI_VERSION v0.19.0 ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini @@ -68,3 
+54,16 @@ ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini.asc / RUN gpg --batch --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 595E85A6B1B4779EA4DAAEC70B588DFF0527A9B7 \ && gpg --batch --verify /tini.asc /tini RUN chmod +x /tini && cp /tini /usr/bin/docker-init + +# Delete un-needed-configs from htcondor/execute:lts-el8 +# Revisit this when we change dockerize and token auth +RUN rm -f /etc/condor/config.d/00-htcondor-9.0.config +RUN rm -f /etc/condor/config.d/01-* + + +COPY --chown=kbase deployment/ /kb/deployment/ +RUN /kb/deployment/bin/install_python_dependencies.sh + +ENTRYPOINT [ "/usr/bin/docker-init" ] +CMD ["/kb/deployment/bin/docker-init.sh"] +WORKDIR /kb/deployment diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt index 0f04db3..145569d 100644 --- a/RELEASE_NOTES.txt +++ b/RELEASE_NOTES.txt @@ -3,6 +3,7 @@ * Modify Cronjobs that look for runaway containers * Update Documentation * Deprecate container_reaper_ee2.py in favor of container_reaper.py +* Remove un-needed configs * Update htcondor image to add TINI back in until we use supervisor.d 1.0.10 diff --git a/deployment/bin/README.md b/deployment/bin/README.md deleted file mode 100644 index 318b193..0000000 --- a/deployment/bin/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory is intended to be mounted in a running docker image under -/kb/deployment/bin so that it's contents can be used to build and store helper -binaries diff --git a/deployment/bin/cron/delete_exited_containers.py b/deployment/bin/cron/delete_exited_containers.py index fe9640a..7dfb2e3 100755 --- a/deployment/bin/cron/delete_exited_containers.py +++ b/deployment/bin/cron/delete_exited_containers.py @@ -26,7 +26,12 @@ def send_slack_message(message: str): hostname = socket.gethostname() dc = docker.from_env() ec = dc.containers.list(filters={"status": "exited"}) - container_image_names = [c.attrs["Config"]["Image"] for c in ec] - if container_image_names: - dc.containers.prune() - send_slack_message(f"Deleted {len(ec)} `exited` containers on {hostname} {container_image_names}") + kbase_containers = [c for c in ec if "kbase" in c.attrs["Config"]["Image"]] + container_image_names = [c.attrs["Config"]["Image"] for c in kbase_containers] + if kbase_containers: + for container in kbase_containers: + container.remove() + debug_mode = os.environ.get("DEBUG", "false").lower() == "true" + if debug_mode: + send_slack_message( + f"Deleted {len(kbase_containers)} `exited` containers with 'kbase' in image name on {hostname}: {container_image_names}") diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index a5fc8dc..7fcb008 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -4,6 +4,7 @@ """ import datetime +import inspect import json import logging import os @@ -33,12 +34,11 @@ if webhook_url is None: exit("SLACK_WEBHOOK_URL is not defined") - - user = "nobody" pid = pwd.getpwnam(user).pw_uid gid = pwd.getpwnam(user).pw_gid + def send_slack_message(message: str): """ :param message: Escaped Message to send to slack @@ -61,8 +61,13 @@ def exit_unsuccessfully(message: str, send_to_slack=True): now = datetime.datetime.now() if send_to_slack: + try: + function_name = lambda: inspect.stack()[1][3] + except Exception: + function_name = "" + send_slack_message( - f"POSSIBLE BLACK HOLE: Ran healthcheck at {now} on {socket.gethostname()} with failure: {message}" + f"POSSIBLE BLACK HOLE: {function_name} Ran healthcheck at {now} on {socket.gethostname()} with 
failure: {message}" ) sys.exit(1) @@ -224,8 +229,6 @@ def check_kbase_endpoints(): message = f"Couldn't reach {service}. {e}" exit_unsuccessfully(message) - - def main(): try: diff --git a/deployment/bin/docker-init.sh b/deployment/bin/docker-init.sh new file mode 100755 index 0000000..41973c9 --- /dev/null +++ b/deployment/bin/docker-init.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +# Meant to be called by /usr/bin/docker-init -- + +/kb/deployment/bin/dockerize \ +-template /kb/deployment/conf/.templates/condor_config_worker.templ:/etc/condor/condor_config.local \ +-template /kb/deployment/conf/.templates/cronjobs.config.templ:/etc/condor/config.d/cronjobs.config \ +-timeout 120s \ +-stdout /var/log/condor/ProcLog \ +-stdout /var/log/condor/StartLog \ +/kb/deployment/bin/start-condor.sh diff --git a/deployment/bin/misc/java_stats.sh b/deployment/bin/misc/java_stats.sh deleted file mode 100755 index aed096e..0000000 --- a/deployment/bin/misc/java_stats.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env jshell-wrapper - -System.out.println("Available Processors"); -System.out.println(Runtime.getRuntime().availableProcessors()); - -System.out.println("Free Memory"); -System.out.println(Runtime.getRuntime().freeMemory() + " " + Runtime.getRuntime().freeMemory() / 1000000000 + "G"); - -System.out.println("Max Memory"); -System.out.println(Runtime.getRuntime().maxMemory() + " " + Runtime.getRuntime().maxMemory() / 1000000000 + "G"); diff --git a/deployment/bin/misc/jshell-wrapper b/deployment/bin/misc/jshell-wrapper deleted file mode 100755 index ab5ad0f..0000000 --- a/deployment/bin/misc/jshell-wrapper +++ /dev/null @@ -1,7 +0,0 @@ -TMP=`mktemp` -tail -n +2 $@ >> $TMP -echo "/exit" >> $TMP -$JAVA_HOME/bin/jshell -q --execution local $TMP -rm $TMP - -#put this file in /usr/local/bin/ or somewhere in your $PATH diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index d2d7ac2..bd6bbb8 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -4,7 +4,7 @@ # condor pool password if [ "$GROUPMOD_DOCKER" ] ; then - groupmod -g $GROUPMOD_DOCKER docker + groupmod -g "$GROUPMOD_DOCKER" docker fi if [ "$POOL_PASSWORD" ] ; then @@ -42,11 +42,15 @@ fi # Ensure condor user can write to logs, since this is now mounted from host # Ensure condor user can modify the lock files and run files as of 8.9.10 -chown condor $(condor_config_val log) $(condor_config_val lock) $(condor_config_val run) +chown condor "$(condor_config_val log)" "$(condor_config_val lock)" "$(condor_config_val run)" docker system prune -a -f + +# Required for htcondor docker image to pick up changes from configs +/update-config + exec "$(condor_config_val MASTER)" -f -t 2>&1 diff --git a/deployment/conf/.templates/condor_config_worker.templ b/deployment/conf/.templates/condor_config_worker.templ index 10a34a1..c487377 100644 --- a/deployment/conf/.templates/condor_config_worker.templ +++ b/deployment/conf/.templates/condor_config_worker.templ @@ -1,26 +1,26 @@ -## What machine is your central manager? 
+# Central Manager Configuration CONDOR_HOST = {{ default .Env.CONDOR_HOST "condor" }} SCHEDD_HOST = {{ default .Env.SCHEDD_HOST "kbase@condor" }} CCB_ADDRESS = {{ default .Env.CCB_ADDRESS "condor" }} PRIVATE_NETWORK_NAME = {{ default .Env.HOSTNAME "condor" }} -## Allow commands to execute from this machine +# Access Control +# Allow commands to execute from this machine ALLOW_WRITE = $(ALLOW_WRITE) *.$(UID_DOMAIN) $(HOSTNAME) -ALLOW_ADMINISTRATOR = *.$(UID_DOMAIN) $(HOSTNAME) -ALLOW_NEGOTIATOR = *.$(UID_DOMAIN) $(HOSTNAME) +ALLOW_ADMINISTRATOR = *.$(UID_DOMAIN) $(HOSTNAME) +ALLOW_NEGOTIATOR = *.$(UID_DOMAIN) $(HOSTNAME) -# Set COLLECTOR_HOST if collector's internal hostname doesn't match DNS name -# and also set sock=COLLECTOR if using shared port -# example: +# Collector Configuration +# Set COLLECTOR_HOST to match the internal hostname, if different from DNS name. +# Use 'sock=collector' for shared port configuration if using shared port # COLLECTOR_HOST = ci.kbase.us:9618?sock=collector {{ if .Env.COLLECTOR_HOST -}} COLLECTOR_HOST = {{ .Env.COLLECTOR_HOST }} {{- end }} -# If the environment variable USE_TCP is set to true, the template will enable +# TCP Communication (optional) +# If the environment variable USE_TCP is set to True, the template will enable # this group of directives that convert communications to TCP # per https://lists.cs.wisc.edu/archive/htcondor-users/2011-August/msg00085.shtml -# UPDATE_COLLECTOR_WITH_TCP = True -# WANT_UDP_COMMAND_SOCKET = False -# COLLECTOR_MAX_FILE_DESCRIPTORS = 3000 + {{ if .Env.USE_TCP -}} UPDATE_COLLECTOR_WITH_TCP = True UPDATE_VIEW_COLLECTOR_WITH_TCP = True @@ -28,60 +28,50 @@ WANT_UDP_COMMAND_SOCKET = False COLLECTOR_MAX_FILE_DESCRIPTORS = 3000 {{- end }} +# Security Settings SEC_DEFAULT_AUTHENTICATION = {{ default .Env.SEC_DEFAULT_AUTHENTICATION "PASSWORD" }} -SEC_DEFAULT_AUTHENTICATION_METHODS = {{ default .Env.SEC_AUTHENTICATION_METHODS "PASSWORD" }} -SEC_DEFAULT_NEGOTIATION = {{ default .Env.SEC_DEFAULT_NEGOTIATION "REQUIRED" }} +SEC_DEFAULT_AUTHENTICATION_METHODS = {{ default .Env.SEC_AUTHENTICATION_METHODS "PASSWORD" }} +SEC_DEFAULT_NEGOTIATION = {{ default .Env.SEC_DEFAULT_NEGOTIATION "REQUIRED" }} SEC_PASSWORD_FILE = {{ default .Env.SEC_PASSWORD_FILE "/etc/condor/password" }} +# Daemon and Domain Settings UID_DOMAIN = {{ default .Env.UID_DOMAIN "condor" }} DAEMON_LIST = MASTER, STARTD DISCARD_SESSION_KEYRING_ON_STARTUP = False - - - -# When is this node willing to run jobs? 
-#StartJobs = True -#NODE_IS_HEALTHY = False -#START = (NODE_IS_HEALTHY =?= True) && (StartJobs =?= True) - +## Node Health and Job Management NODE_IS_HEALTHY = False -START = (NODE_IS_HEALTHY =?= True) - -# Check this with condor_config_val START -evaluate - - -#START = TRUE +# Expression Conditions to start jobs +START = (NODE_IS_HEALTHY =?= True) SUSPEND = False PREEMPT = False KILL = False - -# Set 12 hours maximum wait time for jobs to finish for using condor_drain +# Max time (in seconds) to retire jobs # Set 12 hours maximum wait time for jobs to finish for using condor_drain MAXJOBRETIREMENTTIME = 43200 +CGROUP_MEMORY_LIMIT_POLICY = {{ default .Env.CGROUP_MEMORY_LIMIT_POLICY "soft" }} -# Machine resource settings -# {{ if .Env.NUM_SLOTS -}} NUM_SLOTS = {{ .Env.NUM_SLOTS }} {{- end }} - -# Dynamic Slots +# Partitionable slot configuration NUM_SLOTS = 1 NUM_SLOTS_TYPE_1 = 1 SLOT_TYPE_1 = 100% -SLOT_TYPE_1_PARTITIONABLE = TRUE -CGROUP_MEMORY_LIMIT_POLICY = {{ default .Env.CGROUP_MEMORY_LIMIT_POLICY "soft" }} - -CLIENTGROUP = {{ default .Env.CLIENTGROUP "\"njs\"" }} - -#LEGACY : /mnt/awe/condor/condor_job_execute/$(HOSTNAME)/ to +SLOT_TYPE_1_PARTITIONABLE = True +# Client Group Setting +CLIENTGROUP = {{ default .Env.CLIENTGROUP "\"njs\"" }} +# Execution and Resource Management EXECUTE = {{ default .Env.condor_submit_workdir "/cdr/" }}{{ .Env.EXECUTE_SUFFIX }} - -STARTD_RECOMPUTE_DISK_FREE = true -#STARTD_ATTRS = StartJobs, CLIENTGROUP, $(STARTD_ATTRS) +STARTD_RECOMPUTE_DISK_FREE = True STARTD_ATTRS = CLIENTGROUP, $(STARTD_ATTRS) -# We don't need core files, but can enable them for persistent crashes -CREATE_CORE_FILES = false +# Miscellaneous Settings +CREATE_CORE_FILES = False +USE_SHARED_PORT = False +AUTO_INCLUDE_SHARED_PORT_IN_DAEMON_LIST=False +# Log File Management +# Configuration for log file rotation and size management. +# Max log size is calculated based on the number of cores and daemons. # $(MAX_DEFAULT_LOG) is set to 10MB, before it is moved to .old. The .old gets over-written after 1 rotation -# That means for 32 cores + 5 daemons, the max size = 37procs * 10mb = 370 * 2 = 740MB +# That means for 32 cores + 5 daemons, the max size = 37procs * 10mb = 370 * 2 = 740MB + diff --git a/deployment/conf/.templates/cronjobs.config.templ b/deployment/conf/.templates/cronjobs.config.templ index f3c6057..f490f93 100644 --- a/deployment/conf/.templates/cronjobs.config.templ +++ b/deployment/conf/.templates/cronjobs.config.templ @@ -37,5 +37,5 @@ STARTD_CRON_ManageVarLibDocker_PERIOD = 336h STARTD_CRON_ManageVarLibDocker_MODE = Periodic STARTD_CRON_ManageVarLibDocker_RECONFIG_RERUN = True -STARTD_CRON_JOBLIST = NodeHealth ContainerReaper ManageVarLibDocker ManageCondorSubmitWorkdir +STARTD_CRON_JOBLIST = NodeHealth ContainerReaper ManageVarLibDocker ManageCondorSubmitWorkdir DeleteExitedContainers # STARTD_CRON_AUTOPUBLISH = If_Changed diff --git a/deployment/conf/.templates/deployment.cfg.templ b/deployment/conf/.templates/deployment.cfg.templ deleted file mode 100644 index e01e8c1..0000000 --- a/deployment/conf/.templates/deployment.cfg.templ +++ /dev/null @@ -1,61 +0,0 @@ -[NarrativeJobService] -port = {{ default .Env.port "8200" }} -# server thread count - this determines the number of requests that can be -# processed simultaneously. -server-threads = {{ default .Env.server_threads "20" }} -# Minimum memory size in MB. -min-memory = {{ default .Env.min_memory "1000" }} -# Maximum memory size in MB. 
-max-memory = {{ default .Env.max_memory "1500" }} - -queue.db.dir={{ default .Env.queue.db.dir "/tmp/njs/queue" }} -basedir={{ default .Env.basedir "njs_wrapper" }} -scratch={{ default .Env.scratch "/tmp" }} -ref.data.base={{ default .Env.ref_data_base "/kb/data" }} - -self.external.url={{ default .Env.self_external_url "https://ci.kbase.us/services/njs_wrapper" }} -kbase.endpoint={{ default .Env.kbase_endpoint "https://ci.kbase.us/services" }} -workspace.srv.url={{ default .Env.workspace_srv_url "https://ci.kbase.us/services/ws" }} -jobstatus.srv.url={{ default .Env.jobstatus_srv_url "https://ci.kbase.us/services/userandjobstate" }} -shock.url={{ default .Env.shock_url "https://ci.kbase.us/services/shock-api" }} -awe.srv.url={{ default .Env.awe_srv_url "http://ci.kbase.us/services/awe-api" }} -docker.registry.url={{ default .Env.docker_registry_url "dockerhub-ci.kbase.us" }} -awe.client.docker.uri={{ default .Env.awe_client_docker_uri "unix:///var/run/docker.sock" }} -catalog.srv.url={{ default .Env.catalog_srv_url "https://ci.kbase.us/services/catalog" }} -handle.url={{ default .Env.handle_url "https://ci.kbase.us/services/handle_service" }} -srv.wiz.url={{ default .Env.srv_wiz_url "https://ci.kbase.us/services/service_wizard" }} -auth-service-url = {{ default .Env.auth_service_url "https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login" }} -auth.service.url.v2 = {{ default .Env.auth_service_url_v2 "https://ci.kbase.us/services/auth/api/V2/token" }} -auth-service-url-allow-insecure={{ default .Env.auth_service_url_allow_insecure "false" }} - - -## This user can run list_running_apps method to get states -## of all running apps (running internally on wrapper side). -admin.user={{default .Env.admin_user "" }} - -# Following parameters define Catalog admin creds for pushing exec-stats: -catalog.admin.token={{ default .Env.catalog_token "" }} - -default.awe.client.groups={{ default .Env.default_awe_client_groups "ci" }} -awe.readonly.admin.token={{ default .Env.awe_token "" }} -awe.client.callback.networks={{ default .Env.awe_client_callback_networks "docker0,eth0" }} -running.tasks.per.user={{ default .Env.running_tasks_per_user "5" }} - -mongodb-host = {{ default .Env.mongodb_host "localhost:27017" }} -mongodb-database = {{ default .Env.mongodb_database "exec_engine" }} -mongodb-user = {{ default .Env.mongodb_user "" }} -mongodb-pwd = {{ default .Env.mongodb_pwd "" }} - -ujs-mongodb-host = {{ default .Env.ujs_mongodb_host "localhost:27017" }} -ujs-mongodb-database = {{ default .Env.ujs_mongodb_database "userjobstate" }} -ujs-mongodb-user = {{ default .Env.ujs_mongodb_user "" }} -ujs-mongodb-pwd = {{ default .Env.ujs_mongodb_pwd "" }} - -narrative.proxy.sharing.user={{ default .Env.narrative_proxy_sharing_user "narrativejoblistener" }} - -condor.mode={{ default .Env.condor_mode "1" }} -condor.submit.desc.file.path={{ default .Env.condor_submit_desc_file_path "/kb/deployment/misc/" }} -condor-submit-workdir={{ default .Env.condor_submit_workdir "/mnt/condor" }} - -## Formula = Token Expiration in ms - (Time_Before_Expiration / 60 ) * 1000 -time.before.expiration = {{ default .Env.time_before_expiration "10" }} \ No newline at end of file diff --git a/deployment/conf/.templates/shared_port_config.templ b/deployment/conf/.templates/shared_port_config.templ deleted file mode 100644 index 1ff2405..0000000 --- a/deployment/conf/.templates/shared_port_config.templ +++ /dev/null @@ -1,4 +0,0 @@ -SHARED_PORT_ARGS = -p {{ default .Env.SHARED_PORT_CONFIG "9618" }} -DAEMON_LIST = 
$(DAEMON_LIST), SHARED_PORT -COLLECTOR_HOST = $(CONDOR_HOST)?sock=collector -USE_SHARED_PORT = TRUE diff --git a/deployment/conf/legacy/condor_config_worker2.templ b/deployment/conf/legacy/condor_config_worker2.templ new file mode 100644 index 0000000..c7182eb --- /dev/null +++ b/deployment/conf/legacy/condor_config_worker2.templ @@ -0,0 +1,71 @@ +## What machine is your central manager? +CONDOR_HOST = {{ default .Env.CONDOR_HOST "condor" }} +SCHEDD_HOST = {{ default .Env.SCHEDD_HOST "kbase@condor" }} +CCB_ADDRESS = {{ default .Env.CCB_ADDRESS "condor" }} +PRIVATE_NETWORK_NAME = {{ default .Env.HOSTNAME "condor" }} + +## Allow commands to execute from this machine +ALLOW_WRITE = $(ALLOW_WRITE) *.$(UID_DOMAIN) $(HOSTNAME) +ALLOW_ADMINISTRATOR = *.$(UID_DOMAIN) $(HOSTNAME) +ALLOW_NEGOTIATOR = *.$(UID_DOMAIN) $(HOSTNAME) + +# Set COLLECTOR_HOST if collector's internal hostname doesn't match DNS name +# and also set sock=COLLECTOR if using shared port +# example: +# COLLECTOR_HOST = ci.kbase.us:9618?sock=collector +{{ if .Env.COLLECTOR_HOST -}} COLLECTOR_HOST = {{ .Env.COLLECTOR_HOST }} {{- end }} + +# If the environment variable USE_TCP is set to true, the template will enable +# this group of directives that convert communications to TCP +# per https://lists.cs.wisc.edu/archive/htcondor-users/2011-August/msg00085.shtml +# UPDATE_COLLECTOR_WITH_TCP = True +# WANT_UDP_COMMAND_SOCKET = False +# COLLECTOR_MAX_FILE_DESCRIPTORS = 3000 +{{ if .Env.USE_TCP -}} +UPDATE_COLLECTOR_WITH_TCP = True +UPDATE_VIEW_COLLECTOR_WITH_TCP = True +WANT_UDP_COMMAND_SOCKET = False +COLLECTOR_MAX_FILE_DESCRIPTORS = 3000 +{{- end }} + +SEC_DEFAULT_AUTHENTICATION = {{ default .Env.SEC_DEFAULT_AUTHENTICATION "PASSWORD" }} +SEC_DEFAULT_AUTHENTICATION_METHODS = {{ default .Env.SEC_AUTHENTICATION_METHODS "PASSWORD" }} +SEC_DEFAULT_NEGOTIATION = {{ default .Env.SEC_DEFAULT_NEGOTIATION "REQUIRED" }} +SEC_PASSWORD_FILE = {{ default .Env.SEC_PASSWORD_FILE "/etc/condor/password" }} + +UID_DOMAIN = {{ default .Env.UID_DOMAIN "condor" }} +DAEMON_LIST = MASTER, STARTD +DISCARD_SESSION_KEYRING_ON_STARTUP = False + +NODE_IS_HEALTHY = False +START = (NODE_IS_HEALTHY =?= True) +SUSPEND = False +PREEMPT = False +KILL = False + +# Set 12 hours maximum wait time for jobs to finish for using condor_drain +MAXJOBRETIREMENTTIME = 43200 + + +# Dynamic Slots +NUM_SLOTS = 1 +NUM_SLOTS_TYPE_1 = 1 +SLOT_TYPE_1 = 100% +SLOT_TYPE_1_PARTITIONABLE = TRUE +CGROUP_MEMORY_LIMIT_POLICY = {{ default .Env.CGROUP_MEMORY_LIMIT_POLICY "soft" }} + +CLIENTGROUP = {{ default .Env.CLIENTGROUP "\"njs\"" }} + + +EXECUTE = {{ default .Env.condor_submit_workdir "/cdr/" }}{{ .Env.EXECUTE_SUFFIX }} + +STARTD_RECOMPUTE_DISK_FREE = true +STARTD_ATTRS = CLIENTGROUP, $(STARTD_ATTRS) + +# We don't need core files, but can enable them for persistent crashes +CREATE_CORE_FILES = false +AUTO_INCLUDE_SHARED_PORT_IN_DAEMON_LIST=False + +# $(MAX_DEFAULT_LOG) is set to 10MB, before it is moved to .old. 
The .old gets over-written after 1 rotation +# That means for 32 cores + 5 daemons, the max size = 37procs * 10mb = 370 * 2 = 740MB + diff --git a/deployment/conf/.templates/limitBigMemSlots.templ b/deployment/conf/legacy/limitBigMemSlots.templ similarity index 100% rename from deployment/conf/.templates/limitBigMemSlots.templ rename to deployment/conf/legacy/limitBigMemSlots.templ diff --git a/deployment/conf/.templates/start_server.sh.templ b/deployment/conf/legacy/start_server.sh.templ similarity index 100% rename from deployment/conf/.templates/start_server.sh.templ rename to deployment/conf/legacy/start_server.sh.templ
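
The log-sizing comment carried into both worker templates above gives a 740MB ceiling; that figure is simply the product of the numbers the comment itself states. A minimal worked sketch of that arithmetic, assuming (as the comment does, not independently verified here) one log file per slot/daemon process, a 10MB $(MAX_DEFAULT_LOG), and a single rotated .old copy per log:

# Upper-bound estimate of HTCondor log usage, using only the values quoted
# in the template comment (32 cores + 5 daemons, 10 MB per log, log + .old).
procs = 32 + 5        # one log file per slot/daemon process (assumed)
max_log_mb = 10       # $(MAX_DEFAULT_LOG)
copies = 2            # active log plus one rotated .old copy
print(procs * max_log_mb * copies, "MB")  # -> 740 MB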