-
Notifications
You must be signed in to change notification settings - Fork 119
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding pango_aliasor version 0.3.0 (#1011)
* adding pango_aliasor version 0.3.0 * added credits * adding pango aliasor --------- Co-authored-by: Kutluhan Incekara <[email protected]>
- Loading branch information
Showing
5 changed files
with
186 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# Runtime stage: Ubuntu image with the pango_aliasor library and the
# aliasor.py convenience script installed.
FROM ubuntu:jammy AS app

# pango_aliasor release to install; used to build the tag name (v<version>)
# of the tarball fetched below and recorded in the software.version label.
ARG PANGO_ALIASOR_VER="0.3.0"

LABEL base.image="ubuntu:jammy"
LABEL dockerfile.version="1"
LABEL software="Pango Aliasor"
LABEL software.version="${PANGO_ALIASOR_VER}"
LABEL description="Links sublineages to parent pangolin lineages"
LABEL website="https://github.com/corneliusroemer/pango_aliasor"
LABEL license="https://github.com/corneliusroemer/pango_aliasor/blob/main/LICENSE"
LABEL maintainer="Erin Young"
LABEL maintainer.email="[email protected]"

# OS-level dependencies. The apt package lists are removed in the same layer
# that created them so they do not add to the image size.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    python-is-python3 \
    wget \
    procps && \
    apt-get autoclean && rm -rf /var/lib/apt/lists/*

# Install pango_aliasor from the tagged release tarball, then pandas.
# --no-cache-dir on both installs keeps pip's cache out of this layer.
RUN wget -q https://github.com/corneliusroemer/pango_aliasor/archive/refs/tags/v${PANGO_ALIASOR_VER}.tar.gz && \
    pip install --no-cache-dir v${PANGO_ALIASOR_VER}.tar.gz && \
    rm v${PANGO_ALIASOR_VER}.tar.gz && \
    pip install --no-cache-dir pandas && \
    mkdir /data

# PATH is re-exported unchanged; LC_ALL=C sets the locale for the container.
ENV PATH="$PATH" LC_ALL=C

# Convenience script, placed on PATH so it can be invoked as `aliasor.py`.
COPY aliasor.py /usr/bin/.

WORKDIR /key

# Bundle a copy of the alias key at /key/alias_key.json so it can be passed
# via --alias-key. Note: this is fetched from the master branch at build time,
# so its content depends on when the image is built.
RUN wget -q https://raw.githubusercontent.com/cov-lineages/pango-designation/master/pango_designation/alias_key.json

WORKDIR /data

CMD [ "aliasor.py", "--help" ]
|
||
# Helper stage: runs pangolin on the pango consensus sequences to produce a
# lineage report that the test stage copies out of this stage.
FROM staphb/pangolin:4.3.1-pdata-1.28 AS pangolin

# zstd is needed to decompress the downloaded .zst archive.
RUN apt-get update && apt-get install -y --no-install-recommends zstd && \
    rm -rf /var/lib/apt/lists/*

RUN wget -q https://github.com/corneliusroemer/pango-sequences/raw/main/data/pango-consensus-sequences_genome-nuc.fasta.zst && \
    zstd -d pango-consensus-sequences_genome-nuc.fasta.zst && \
    pangolin pango-consensus-sequences_genome-nuc.fasta
|
||
# Test stage: exercises aliasor.py from the app stage against a real pangolin
# lineage report.
FROM app AS test

WORKDIR /test

RUN aliasor.py --help

# Lineage report generated in the pangolin stage.
COPY --from=pangolin /data/lineage_report.csv .

# Run once without --alias-key (per the script's help text the key is then
# downloaded from GitHub) and once with the key bundled at /key, then print
# the line counts and the first lines of both outputs.
RUN aliasor.py --input lineage_report.csv --output aliased_lineage_report_github.tsv && \
    aliasor.py --input lineage_report.csv --output aliased_lineage_report.tsv --alias-key /key/alias_key.json && \
    wc -l aliased_lineage_report_github.tsv aliased_lineage_report.tsv && \
    head aliased_lineage_report_github.tsv aliased_lineage_report.tsv
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
|
||
# pango_aliasor container | ||
|
||
Main tool: [pango_aliasor](https://github.com/corneliusroemer/pango_aliasor) | ||
|
||
Code repository: https://github.com/corneliusroemer/pango_aliasor | ||
|
||
Basic information on how to use this tool: | ||
- executable: NA | ||
- help: NA | ||
- version: NA | ||
- description: pango_aliasor is a python library for determining parent pangolin lineages | ||
|
||
Additional information: | ||
- Although not an official use by any means, `aliasor.py` is included in this image. This python script was written by [@erinyoung](https://github.com/erinyoung) for some quick use cases of finding parent lineages from pangolin results. Usage is below. | ||
- An alias key is included at `/key/alias_key.json` in containers spun from this image. When it is supplied, pango_aliasor does not download the latest key from GitHub, which is useful for some cloud infrastructures. | ||
|
||
Full documentation: [https://github.com/corneliusroemer/pango_aliasor](https://github.com/corneliusroemer/pango_aliasor) | ||
|
||
## Example Usage | ||
|
||
```python | ||
import pandas as pd | ||
from pango_aliasor.aliasor import Aliasor | ||
import argparse | ||
|
||
|
||
def add_unaliased_column(tsv_file_path, pango_column='pango_lineage', unaliased_column='pango_lineage_unaliased'):
    """Read a tab-separated file and add a column of unaliased Pango lineages.

    Args:
        tsv_file_path: Path to the tab-separated input file.
        pango_column: Name of the column holding the Pango lineage.
        unaliased_column: Name of the column that receives the result of
            ``Aliasor.uncompress`` for each lineage.

    Returns:
        The table as a pandas DataFrame with ``unaliased_column`` added.
        Empty or missing lineages are reported as ``"?"``.
    """
    aliasor = Aliasor()
    def uncompress_lineage(lineage):
        # Empty strings and NaN (missing cells) have no lineage to uncompress.
        if not lineage or pd.isna(lineage):
            return "?"
        return aliasor.uncompress(lineage)

    df = pd.read_csv(tsv_file_path, sep='\t')
    df[unaliased_column] = df[pango_column].apply(uncompress_lineage)
    return df
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the options, add the unaliased column,
    # and print the resulting table to stdout as tab-separated text.
    parser = argparse.ArgumentParser(description='Add unaliased Pango lineage column to a TSV file.')
    parser.add_argument('--input-tsv', required=True, help='Path to the input TSV file.')
    parser.add_argument('--pango-column', default='pango_lineage', help='Name of the Pango lineage column in the input file.')
    parser.add_argument('--unaliased-column', default='pango_lineage_unaliased', help='Name of the column to use for the unaliased Pango lineage column in output.')
    args = parser.parse_args()
    df = add_unaliased_column(args.input_tsv, args.pango_column, args.unaliased_column)
    print(df.to_csv(sep='\t', index=False))
``` | ||
|
||
## Example Usage of aliasor.py | ||
|
||
The help message | ||
```bash | ||
usage: aliasor.py [-h] --input INPUT [--output OUTPUT] [--pango-column PANGO_COLUMN] [--unaliased-column UNALIASED_COLUMN] [--alias-key ALIAS_KEY] | ||
|
||
Add unaliased Pango lineage column to a TSV file. | ||
|
||
options: | ||
-h, --help show this help message and exit | ||
--input INPUT Path to the input file (should end in tsv or csv for best results). | ||
--output OUTPUT Name of tab-delimited output file | ||
--pango-column PANGO_COLUMN | ||
Name of the Pango lineage column in the input file. | ||
--unaliased-column UNALIASED_COLUMN | ||
Name of the column to use for the unaliased Pango lineage column in output. | ||
--alias-key ALIAS_KEY | ||
Alias Key as json file. If none provided, will download the latest version from github. | ||
``` | ||
Examples of using aliasor.py with the `lineage_report.csv` file generated by pangolin: | ||
```bash | ||
# downloading the latest alias key from github | ||
aliasor.py --input lineage_report.csv --output unaliased_lineage_report.tsv | ||
|
||
# using included alias key | ||
aliasor.py --input lineage_report.csv --output unaliased_lineage_report.tsv --alias-key /key/alias_key.json | ||
``` | ||
The unaliased column will be the last column in the output file. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env python3 | ||
|
||
##### | ||
# Mostly stolen from https://github.com/corneliusroemer/pango_aliasor?tab=readme-ov-file#convenience-script | ||
# and https://github.com/UPHL-BioNGS/Wastewater-genomic-analysis/blob/pooja-dev/utils/freyja_custom_lin_processing.py | ||
##### | ||
|
||
import pandas as pd | ||
from pango_aliasor.aliasor import Aliasor | ||
import argparse | ||
|
||
def add_unaliased_column(tsv_file_path, pango_column='lineage', unaliased_column='unaliased_lineage', alias_key = ''):
    """Read a delimited lineage report and append a column of unaliased lineages.

    Args:
        tsv_file_path: Path to the input table. A name ending in ``.csv``
            (any letter case) is parsed as comma-separated; every other name
            is parsed as tab-separated.
        pango_column: Name of the column holding the Pango lineage.
        unaliased_column: Name of the column that receives the unaliased
            lineage.
        alias_key: Optional path to an alias key JSON file, passed to
            ``Aliasor``. When empty, ``Aliasor()`` is created without
            arguments.

    Returns:
        pandas.DataFrame: the input table with ``unaliased_column`` added.
        Empty or missing lineages are reported as ``"?"``.

    Raises:
        KeyError: if ``pango_column`` is not a column of the input table.
    """
    # Pick the delimiter from the file extension; tab is the fallback.
    separator = ',' if tsv_file_path.lower().endswith('.csv') else '\t'
    df = pd.read_csv(tsv_file_path, sep=separator)

    # Fail with an actionable message instead of a bare KeyError from pandas.
    if pango_column not in df.columns:
        available = ', '.join(str(name) for name in df.columns)
        raise KeyError(
            f"Column '{pango_column}' not found in {tsv_file_path}; "
            f"available columns: {available}"
        )

    # Build the Aliasor only after the input has been validated, so an invalid
    # input never triggers the alias-key lookup.
    aliasor = Aliasor(alias_key) if alias_key else Aliasor()

    def uncompress_lineage(lineage):
        # Empty strings and NaN (missing cells) have no lineage to uncompress.
        if not lineage or pd.isna(lineage):
            return "?"
        return aliasor.uncompress(lineage)

    df[unaliased_column] = df[pango_column].apply(uncompress_lineage)
    return df
|
||
if __name__ == "__main__":
    # Command-line entry point: read the options, add the unaliased lineage
    # column to the input table, and write the result as a tab-delimited file.
    cli = argparse.ArgumentParser(description='Add unaliased Pango lineage column to a TSV file.')

    # (flag, keyword arguments) pairs, registered in the order they appear in --help.
    option_specs = (
        ('--input', dict(required=True, help='Path to the input file (should end in tsv or csv for best results).')),
        ('--output', dict(default='unaliased_lineage_report.tsv', help='Name of tab-delimited output file')),
        ('--pango-column', dict(default='lineage', help='Name of the Pango lineage column in the input file.')),
        ('--unaliased-column', dict(default='unaliased_lineage', help='Name of the column to use for the unaliased Pango lineage column in output.')),
        ('--alias-key', dict(default='', help="Alias Key as json file. If none provided, will download the latest version from github.")),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    report = add_unaliased_column(
        opts.input,
        opts.pango_column,
        opts.unaliased_column,
        opts.alias_key,
    )
    report.to_csv(opts.output, sep='\t', index=False)