
Merge pull request #88 from refgenie/staging
v0.5.0
stolarczyk authored Jul 6, 2020
2 parents 52ed286 + 7caa79d commit 88809df
Showing 14 changed files with 385 additions and 61 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/deploy_release_software.yml
@@ -0,0 +1,23 @@
on:
release:
types:
- created

name: Deploy to Dockerhub on release

jobs:
deploy:
name: Deploy
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Push to DockerHub
uses: docker/build-push-action@v1
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
repository: databio/refgenieserver
tags: latest
tag_with_ref: true
108 changes: 108 additions & 0 deletions .github/workflows/deploy_staging_software.yml
@@ -0,0 +1,108 @@
# This workflow will build and push a new container image to Amazon ECR,
# and then deploy a new task definition to Amazon ECS, whenever the staging branch is updated
#
# To use this workflow, you will need to complete the following set-up steps:
#
# 1. Create an ECR repository to store your images.
# For example: `aws ecr create-repository --repository-name my-ecr-repo --region us-east-2`.
# Replace the value of `ECR_REPOSITORY` in the workflow below with your repository's name.
# Replace the value of `aws-region` in the workflow below with your repository's region.
#
# 2. Create an ECS task definition, an ECS cluster, and an ECS service.
# For example, follow the Getting Started guide on the ECS console:
# https://us-east-2.console.aws.amazon.com/ecs/home?region=us-east-2#/firstRun
# Replace the values for `service` and `cluster` in the workflow below with your service and cluster names.
#
# 3. Store your ECS task definition as a JSON file in your repository.
# The format should follow the output of `aws ecs register-task-definition --generate-cli-skeleton`.
# Replace the value of `task-definition` in the workflow below with your JSON file's name.
# Replace the value of `container-name` in the workflow below with the name of the container
# in the `containerDefinitions` section of the task definition.
#
# 4. Store an IAM user access key in GitHub Actions secrets named `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
# See the documentation for each action used below for the recommended IAM policies for this IAM user,
# and best practices on handling the access key credentials.



# 1. push the base refgenieserver image to DockerHub
# 2. check out the demo repo, then run the AWS deploy
# 2a. build the AWS package with the demo config file, push it, and deploy to the cluster

on:
push:
branches:
- staging
pull_request:
branches:
- staging

name: Deploy to Amazon ECS - software-staging

jobs:
deploy:
env:
server: software-staging
name: Deploy
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Push to DockerHub
uses: docker/build-push-action@v1
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
repository: databio/refgenieserver
tags: staging


- name: Checkout demo repo
uses: actions/checkout@v2
with:
repository: refgenie/refgenomes.databio.org
ref: refs/heads/master
path: server_data

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1

- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1

- name: Build, tag, and push image to Amazon ECR
id: build-image
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: my-ecr-repo
IMAGE_TAG: ${{ github.sha }}
run: |
# Build a docker container and
# push it to ECR so that it can
# be deployed to ECS.
cd server_data
docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfiles/staging.Dockerfile .
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
- name: Fill in the new image ID in the Amazon ECS task definition
id: task-def
uses: aws-actions/amazon-ecs-render-task-definition@v1
with:
task-definition: task_defs/${{ env.server }}.json
container-name: rgs-container
image: ${{ steps.build-image.outputs.image }}

- name: Deploy Amazon ECS task definition
uses: aws-actions/amazon-ecs-deploy-task-definition@v1
with:
task-definition: ${{ steps.task-def.outputs.task-definition }}
service: rgs-service-${{ env.server }}
cluster: hydra
wait-for-service-stability: true
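
Step 3 of the set-up notes above says the ECS task definition is stored as JSON in the repo; the workflow references it as `task_defs/${{ env.server }}.json` with a container named `rgs-container`. As a hedged illustration only (the real file lives in the refgenie/refgenomes.databio.org repo, and every value except the container name is an assumption), such a file might look like:

```
{
  "family": "rgs-task-software-staging",
  "networkMode": "awsvpc",
  "requiresCompatibilities": ["FARGATE"],
  "cpu": "256",
  "memory": "512",
  "containerDefinitions": [
    {
      "name": "rgs-container",
      "image": "placeholder-overwritten-by-render-step",
      "portMappings": [{"containerPort": 80, "protocol": "tcp"}],
      "essential": true
    }
  ]
}
```

The `image` value is a placeholder on purpose: the "Fill in the new image ID" step above rewrites it with the freshly pushed ECR image before deployment.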
31 changes: 31 additions & 0 deletions .github/workflows/python-publish.yml
@@ -0,0 +1,31 @@
# This workflow will upload a Python package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

on:
release:
types: [created]

jobs:
deploy:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools wheel twine
- name: Build and publish
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
python setup.py sdist bdist_wheel
twine upload dist/*
14 changes: 11 additions & 3 deletions README.md
@@ -16,7 +16,13 @@ docker build -t refgenieserverim .

### Running container for development:

Mount a directory of files to serve at `/genomes`:
You can run it directly after installing with `pip install`, like this:

```
refgenieserver serve -c refgenie.yaml -p 5000
```

Better, though, is to use the container. Mount a directory of files to serve at `/genomes`:

```
docker run --rm -p 80:80 --name refgenieservercon \
@@ -78,7 +84,9 @@ docker exec -it refgenieservercon sh

Refgenieserver can also archive your assets, creating the directory for asset archives needed to `serve`.

First, make sure the config has a `genome_archive` key that points to the directory where you want to store the servable archives (`genome_archive` is __not__ added automatically by [`refgenie init`](http://refgenie.databio.org/en/latest/usage/#refgenie-init-help)). The first time, you will need to add this key manually to tell refgenieserver where to store the archives.
First, make sure the config has a `genome_archive_folder` key that points to the directory where you want to store the servable archives (`genome_archive_folder` is __not__ added automatically by [`refgenie init`](http://refgenie.databio.org/en/latest/usage/#refgenie-init-help)). The first time, you will need to add this key manually to tell refgenieserver where to store the archives.

Second, if you wish to store the refgenieserver configuration file separately from the `genome_archive_folder`, specify a `genome_archive_config` key. The path this key points to is considered relative to the refgenie configuration file, unless it is absolute.
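
For illustration, a hedged sketch of the relevant keys in a refgenie config (the file layout and paths are made up; `genome_folder` is a standard refgenie key shown only for context):

```
genome_folder: /data/genomes            # standard refgenie asset store
genome_archive_folder: /data/archives   # where refgenieserver writes servable archives
genome_archive_config: archive.yaml     # resolved relative to this file unless absolute
```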

Then run:
```
@@ -113,4 +121,4 @@ Use it simply as follows:
/path/to/test_refgenie.sh
```

The script also requires Python's [virtual environment module](https://docs.python.org/3/tutorial/venv.html), [Docker](https://www.docker.com/), and [Bulker](https://bulker.databio.org/en/latest/) to successfully test all components.
File renamed without changes.
11 changes: 11 additions & 0 deletions changelog.md
@@ -2,6 +2,17 @@

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

## [0.5.0] -- 2020-07-06
### Added
- support for external asset sources via `remote_url_base` key in the config
### Changed
- path specified in `genome_archive_config` is considered relative to the refgenie genome config file, unless absolute.
- purging of non-servable assets is now performed prior to serving rather than after each archive job completion
- dropped Python 2 support
### Removed
- support for old `genome_archive` key; use `genome_archive_folder` and `genome_archive_config` from now on.
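
For context, the headline `remote_url_base` feature is a single key in the config consumed by `refgenieserver serve`; a hedged example (the value is illustrative, e.g. an S3 bucket mirroring the archive folder):

```
remote_url_base: https://my-bucket.s3.amazonaws.com/archives
```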


## [0.4.4] -- 2020-03-17
### Changed
- `refgenieserver archive` requires all assets to be complete prior to archiving
2 changes: 1 addition & 1 deletion refgenieserver/_version.py
@@ -1 +1 @@
__version__ = "0.4.4"
__version__ = "0.5.0"
56 changes: 56 additions & 0 deletions refgenieserver/helpers.py
@@ -1,8 +1,14 @@
import logging
from .const import *
from ._version import __version__ as v
from yacman import get_first_env_var
from ubiquerg import VersionInHelpParser

from string import Formatter

global _LOGGER
_LOGGER = logging.getLogger(PKG_NAME)


def build_parser():
"""
@@ -103,3 +109,53 @@ def get_openapi_version(app):
return app.openapi()["openapi"]
except Exception as e:
return "3.0.2"


def get_datapath_for_genome(rgc, fill_dict,
pth_templ="{base}/{genome}/{file_name}"):
"""
Get the path to the data file to serve.
Depending on whether the remote URL base is set, the function returns
either a remote URL to the file or a local file path, along with a flag
indicating the source
:param refgenconf.RefGenConf rgc: configuration object to use
:param dict fill_dict: a dictionary to use to fill in the path template
:param str pth_templ: the path template
:return (str, bool): a pair of the file source and a flag indicating
    whether the source is remote
"""
req_keys = [i[1] for i in Formatter().parse(pth_templ) if i[1] is not None]
assert all([k in req_keys for k in list(fill_dict.keys())]), \
"Only the following keys are allowed in the fill_dict: {}".format(req_keys)
remote = False
fill_dict.update({"base": BASE_DIR})
if CFG_REMOTE_URL_BASE_KEY in rgc \
and rgc[CFG_REMOTE_URL_BASE_KEY] is not None:
fill_dict["base"] = rgc[CFG_REMOTE_URL_BASE_KEY].rstrip("/")
remote = True
return pth_templ.format(**fill_dict), remote


def purge_nonservable(rgc):
"""
Remove entries in RefGenConf object that were not processed by the archiver and should not be served
:param refgenconf.RefGenConf rgc: object to check
:return refgenconf.RefGenConf: object with just the servable entries
"""
def _check_servable(rgc, genome, asset, tag):
tag_data = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag]
return all([r in tag_data for r in [CFG_ARCHIVE_CHECKSUM_KEY, CFG_ARCHIVE_SIZE_KEY]])

for genome_name, genome in rgc[CFG_GENOMES_KEY].items():
for asset_name, asset in genome[CFG_ASSETS_KEY].items():
try:
for tag_name, tag in asset[CFG_ASSET_TAGS_KEY].items():
if not _check_servable(rgc, genome_name, asset_name, tag_name):
_LOGGER.debug("Removing '{}/{}:{}', it's not servable".format(genome_name, asset_name, tag_name))
rgc.cfg_remove_assets(genome_name, asset_name, tag_name)
except KeyError:
rgc.cfg_remove_assets(genome_name, asset_name)
return rgc
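
A hedged usage sketch of the two new helpers (assumes refgenieserver and refgenconf are installed; the config path, genome, and file name are hypothetical):

```
from refgenconf import RefGenConf
from refgenieserver.helpers import get_datapath_for_genome, purge_nonservable

# hypothetical archive config produced by `refgenieserver archive`
rgc = RefGenConf("genome_archive_config.yaml")

# drop genome/asset/tag entries lacking archive checksum/size metadata
rgc = purge_nonservable(rgc)

# resolve the source of an asset archive; remote is True only when the
# config sets remote_url_base, in which case path is a URL
path, remote = get_datapath_for_genome(
    rgc, dict(genome="hg38", file_name="fasta__default.tgz"))
print(path, remote)
```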
4 changes: 3 additions & 1 deletion refgenieserver/main.py
@@ -1,5 +1,5 @@
from .const import *
from .helpers import build_parser
from .helpers import build_parser, purge_nonservable
from .server_builder import archive
from refgenconf import RefGenConf, select_genome_config
from fastapi import FastAPI
@@ -41,6 +41,8 @@ def main():
archive(rgc, arp, args.force, args.remove, selected_cfg, args.genomes_desc)
elif args.command == "serve":
# the router imports need to be after the RefGenConf object is declared
with rgc as r:
purge_nonservable(r)
from .routers import version1, version2
app.include_router(version1.router)
app.include_router(version1.router, prefix="/v1")
39 changes: 25 additions & 14 deletions refgenieserver/routers/version2.py
@@ -1,4 +1,5 @@
from starlette.responses import FileResponse, JSONResponse
from starlette.responses import RedirectResponse
from starlette.requests import Request
from fastapi import HTTPException, APIRouter

@@ -7,7 +8,7 @@

from ..const import *
from ..main import rgc, templates, _LOGGER, app
from ..helpers import get_openapi_version
from ..helpers import get_openapi_version, get_datapath_for_genome

router = APIRouter()

@@ -68,10 +69,14 @@ async def download_asset(genome: str, asset: str, tag: str = None):
"""
tag = tag or rgc.get_default_tag(genome, asset) # returns 'default' for nonexistent genome/asset; no need to catch
file_name = "{}__{}{}".format(asset, tag, ".tgz")
asset_file = "{base}/{genome}/{file_name}".format(base=BASE_DIR, genome=genome, file_name=file_name)
_LOGGER.info("serving asset file: '{}'".format(asset_file))
if os.path.isfile(asset_file):
return FileResponse(asset_file, filename=file_name, media_type="application/octet-stream")
path, remote = get_datapath_for_genome(rgc, dict(genome=genome, file_name=file_name))
_LOGGER.info("file source: {}".format(path))
if remote:
_LOGGER.info("redirecting to URL: '{}'".format(path))
return RedirectResponse(path)
_LOGGER.info("serving asset file: '{}'".format(path))
if os.path.isfile(path):
return FileResponse(path, filename=file_name, media_type="application/octet-stream")
else:
msg = MSG_404.format("asset ({})".format(asset))
_LOGGER.warning(msg)
@@ -121,10 +126,13 @@ async def download_asset_build_log(genome: str, asset: str, tag: str = None):
"""
tag = tag or rgc.get_default_tag(genome, asset) # returns 'default' for nonexistent genome/asset; no need to catch
file_name = TEMPLATE_LOG.format(asset, tag)
log_file = "{base}/{genome}/{file_name}".format(base=BASE_DIR, genome=genome, file_name=file_name)
_LOGGER.info("serving build log file: '{}'".format(log_file))
if os.path.isfile(log_file):
return FileResponse(log_file, filename=file_name, media_type="application/octet-stream")
path, remote = get_datapath_for_genome(rgc, dict(genome=genome, file_name=file_name))
if remote:
_LOGGER.info("redirecting to URL: '{}'".format(path))
return RedirectResponse(path)
_LOGGER.info("serving build log file: '{}'".format(path))
if os.path.isfile(path):
return FileResponse(path, filename=file_name, media_type="application/octet-stream")
else:
msg = MSG_404.format("asset ({})".format(asset))
_LOGGER.warning(msg)
@@ -138,13 +146,16 @@ async def download_asset_build_recipe(genome: str, asset: str, tag: str = None):
Optionally, 'tag' query parameter can be specified to get a tagged asset archive. Default tag is returned otherwise.
"""
import json
tag = tag or rgc.get_default_tag(genome, asset) # returns 'default' for nonexistent genome/asset; no need to catch
file_name = TEMPLATE_RECIPE_JSON.format(asset, tag)
recipe_file = "{base}/{genome}/{file_name}".format(base=BASE_DIR, genome=genome, file_name=file_name)
_LOGGER.info("serving build recipe file: '{}'".format(recipe_file))
if os.path.isfile(recipe_file):
with open(recipe_file, 'r') as f:
path, remote = get_datapath_for_genome(rgc, dict(genome=genome, file_name=file_name))
if remote:
_LOGGER.info("redirecting to URL: '{}'".format(path))
return RedirectResponse(path)
_LOGGER.info("serving build log file: '{}'".format(path))
if os.path.isfile(path):
with open(path, 'r') as f:
recipe = json.load(f)
return JSONResponse(recipe)
else:
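
Taken together, these router changes mean asset downloads become HTTP redirects whenever `remote_url_base` is configured, rather than local `FileResponse` streams. A hedged client-side sketch (the server URL and endpoint path are assumptions for illustration, not confirmed by this diff):

```
import requests

# hypothetical refgenieserver instance and v2 asset endpoint; requests
# follows the redirect to the remote URL base (e.g. S3) automatically
url = "http://refgenomes.databio.org/v2/asset/hg38/fasta/archive"
resp = requests.get(url, params={"tag": "default"}, allow_redirects=True)
resp.raise_for_status()
with open("fasta__default.tgz", "wb") as f:
    f.write(resp.content)
```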
