-
Notifications
You must be signed in to change notification settings - Fork 119
/
Dockerfile
165 lines (138 loc) · 7.66 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
FROM mambaorg/micromamba:1.5.1 as app
# build and run as root users since micromamba image has 'mambauser' set as the $USER
USER root
# set workdir to default for building; set to /data at the end
WORKDIR /
# ARG variables only persist during build time
# had to include the v for some of these due to GitHub tags.
# using pangolin-data github tag, NOT what is in the GH release title "v1.2.133"
ARG PANGOLIN_VER="v4.3.1"
ARG PANGOLIN_DATA_VER="v1.23.1"
ARG SCORPIO_VER="v0.3.19"
ARG CONSTELLATIONS_VER="v0.1.12"
ARG USHER_VER="0.6.2"
# metadata labels
LABEL base.image="mambaorg/micromamba:1.5.1"
LABEL dockerfile.version="1"
LABEL software="pangolin"
LABEL software.version=${PANGOLIN_VER}
LABEL description="Conda environment for Pangolin. Pangolin: Software package for assigning SARS-CoV-2 genome sequences to global lineages."
LABEL website="https://github.com/cov-lineages/pangolin"
LABEL license="GNU General Public License v3.0"
LABEL license.url="https://github.com/cov-lineages/pangolin/blob/master/LICENSE.txt"
LABEL maintainer="Curtis Kapsak"
LABEL maintainer.email="[email protected]"
# install dependencies; cleanup apt garbage
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
ca-certificates \
git \
procps \
bsdmainutils && \
apt-get autoclean && rm -rf /var/lib/apt/lists/*
# get the pangolin repo
RUN wget "https://github.com/cov-lineages/pangolin/archive/${PANGOLIN_VER}.tar.gz" && \
tar -xf ${PANGOLIN_VER}.tar.gz && \
rm -v ${PANGOLIN_VER}.tar.gz && \
mv -v pangolin-* pangolin
# set the environment; PATH is unnecessary here, but leaving anyways. It's reset later in dockerfile
ENV PATH="$PATH" \
LC_ALL=C.UTF-8
# modify environment.yml to pin specific versions during install
# create the conda environment using modified environment.yml
RUN sed -i "s|usher.*|usher=${USHER_VER}|" /pangolin/environment.yml && \
sed -i "s|scorpio.git|scorpio.git@${SCORPIO_VER}|" /pangolin/environment.yml && \
sed -i "s|pangolin-data.git|pangolin-data.git@${PANGOLIN_DATA_VER}|" /pangolin/environment.yml && \
sed -i "s|constellations.git|constellations.git@${CONSTELLATIONS_VER}|" /pangolin/environment.yml && \
micromamba create -n pangolin -y -f /pangolin/environment.yml
# so that mamba/conda env is active when running below commands
ENV ENV_NAME="pangolin"
ARG MAMBA_DOCKERFILE_ACTIVATE=1
WORKDIR /pangolin
# run pip install step; download optional pre-computed assignment hashes for UShER (useful for running on large batches of samples)
# best to skip using the assigment-cache if running on one sample for speed
# print versions
RUN pip install . && \
pangolin --add-assignment-cache && \
micromamba clean -a -y && \
mkdir /data && \
pangolin --all-versions && \
usher --version
WORKDIR /data
# hardcode pangolin executable into the PATH variable
ENV PATH="${PATH}:/opt/conda/envs/pangolin/bin/"
# default command is to pull up help options for virulencefinder; can be overridden of course
CMD ["pangolin", "-h"]
# new base for testing
FROM app as test
# so that mamba/conda env is active when running below commands
ENV ENV_NAME="pangolin"
ARG MAMBA_DOCKERFILE_ACTIVATE=1
# test on test sequences supplied with Pangolin code
RUN pangolin /pangolin/pangolin/test/test_seqs.fasta --analysis-mode usher -o /data/test_seqs-output-pusher && \
column -t -s, /data/test_seqs-output-pusher/lineage_report.csv
# test functionality of assignment-cache option
RUN pangolin --use-assignment-cache /pangolin/pangolin/test/test_seqs.fasta
# download B.1.1.7 genome from Utah
ADD https://raw.githubusercontent.com/StaPH-B/docker-builds/master/tests/SARS-CoV-2/SRR13957123.consensus.fa /test-data/SRR13957123.consensus.fa
# test on a B.1.1.7 genome
RUN pangolin /test-data/SRR13957123.consensus.fa --analysis-mode usher -o /test-data/SRR13957123-pusher && \
column -t -s, /test-data/SRR13957123-pusher/lineage_report.csv
# install unzip for unzipping zip archive from NCBI
RUN apt-get update && apt-get install -y --no-install-recommends unzip
# install ncbi datasets tool (pre-compiled binary); place in $PATH
RUN wget https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets && \
chmod +x datasets && \
mv -v datasets /usr/local/bin
# download assembly for a BA.1 from Florida (https://www.ncbi.nlm.nih.gov/biosample?term=SAMN29506515 and https://www.ncbi.nlm.nih.gov/nuccore/ON924087)
# run pangolin in usher analysis mode
RUN datasets download virus genome accession ON924087.1 --filename ON924087.1.zip && \
unzip ON924087.1.zip && rm ON924087.1.zip && \
mv -v ncbi_dataset/data/genomic.fna ON924087.1.genomic.fna && \
rm -vr ncbi_dataset/ README.md && \
pangolin ON924087.1.genomic.fna --analysis-mode usher -o ON924087.1-usher && \
column -t -s, ON924087.1-usher/lineage_report.csv
# test specific for new lineage, XBB.1.16, introduced in pangolin-data v1.19
# using this assembly: https://www.ncbi.nlm.nih.gov/nuccore/2440446687
# biosample here: https://www.ncbi.nlm.nih.gov/biosample?term=SAMN33060589
# one of the sample included in initial pango-designation here: https://github.com/cov-lineages/pango-designation/issues/1723
RUN datasets download virus genome accession OQ381818.1 --filename OQ381818.1.zip && \
unzip OQ381818.1.zip && rm OQ381818.1.zip && \
mv -v ncbi_dataset/data/genomic.fna OQ381818.1.genomic.fna && \
rm -vr ncbi_dataset/ README.md && \
pangolin OQ381818.1.genomic.fna --analysis-mode usher -o OQ381818.1-usher && \
column -t -s, OQ381818.1-usher/lineage_report.csv
# testing another XBB.1.16, trying to test scorpio functionality. Want pangolin to NOT assign lineage based on pango hash match.
# this test runs as expected, uses scorpio to check for constellation of mutations, then assign using PUSHER placement
RUN datasets download virus genome accession OR177999.1 --filename OR177999.1.zip && \
unzip OR177999.1.zip && rm OR177999.1.zip && \
mv -v ncbi_dataset/data/genomic.fna OR177999.1.genomic.fna && \
rm -vr ncbi_dataset/ README.md && \
pangolin OR177999.1.genomic.fna --analysis-mode usher -o OR177999.1-usher && \
column -t -s, OR177999.1-usher/lineage_report.csv
## test for BA.2.86
# virus identified in MI: https://www.ncbi.nlm.nih.gov/nuccore/OR461132.1
RUN datasets download virus genome accession OR461132.1 --filename OR461132.1.zip && \
unzip OR461132.1.zip && rm OR461132.1.zip && \
mv -v ncbi_dataset/data/genomic.fna OR461132.1.genomic.fna && \
rm -vr ncbi_dataset/ README.md && \
pangolin OR461132.1.genomic.fna --analysis-mode usher -o OR461132.1-usher && \
column -t -s, OR461132.1-usher/lineage_report.csv
## test for JN.2 (BA.2.86 sublineage) JN.2 is an alias of B.1.1.529.2.86.1.2
# NY CDC Quest sample: https://www.ncbi.nlm.nih.gov/nuccore/OR598183
RUN datasets download virus genome accession OR598183.1 --filename OR598183.1.zip && \
unzip OR598183.1.zip && rm OR598183.1.zip && \
mv -v ncbi_dataset/data/genomic.fna OR598183.1.genomic.fna && \
rm -vr ncbi_dataset/ README.md && \
pangolin OR598183.1.genomic.fna --analysis-mode usher -o OR598183.1-usher && \
column -t -s, OR598183.1-usher/lineage_report.csv
## test for JQ.1 (BA.2.86.3 sublineage); JQ.1 is an alias of B.1.1.529.2.86.3.1
# THANK YOU ERIN AND UPHL!! https://www.ncbi.nlm.nih.gov/nuccore/OR716684
# this test is important due to the fact that this lineage was included in the UShER tree, despite being designated after the pangolin-designation 1.23 release
# it previously caused and error/bug in pangolin, but now is fixed
RUN datasets download virus genome accession OR716684.1 --filename OR716684.1.zip && \
unzip OR716684.1.zip && rm OR716684.1.zip && \
mv -v ncbi_dataset/data/genomic.fna OR716684.1.genomic.fna && \
rm -vr ncbi_dataset/ README.md && \
pangolin OR716684.1.genomic.fna --analysis-mode usher -o OR716684.1-usher && \
column -t -s, OR716684.1-usher/lineage_report.csv