forked from wdingx/pan-genome-visualization
-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
chore: add scripts for data compression and upload
- Loading branch information
1 parent
fabc721
commit e12bbcb
Showing
4 changed files
with
188 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/usr/bin/env bash
# Recompress a dataset directory in place: first decompress any existing
# .gz files, then gzip everything again (keeping the originals), so that
# both plain and gzipped variants exist for upload.
#
# Usage:    ./this_script.sh <input_dir>
# Requires: pigz, GNU parallel
# Layout:   <input_dir>/index.json and <input_dir>/dataset/ must exist.
set -euo pipefail
trap "exit" INT

export INPUT_DIR="${1:? Pass input directory as first parameter}"

# Validate the expected directory layout before touching anything.
# (Without the explicit 'exit 1' the script would continue on bad input.)
if [ ! -d "${INPUT_DIR}/dataset/" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain subdirectory 'dataset/'" >&2
  exit 1
fi

if [ ! -f "${INPUT_DIR}/index.json" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain 'index.json'" >&2
  exit 1
fi

echo "Ungzipping"
# Decompress recursively (-r) inside each second-level directory; '|| true'
# because pigz returns non-zero when a directory has no .gz files.
find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel --lb pigz -dfrq || true

echo "Gzipping"
# -k keeps the uncompressed originals alongside the .gz copies.
pigz -fkq "${INPUT_DIR}/index.json"
find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel --lb pigz -kfrq || true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/usr/bin/env bash
# Upload a prepared dataset tree to S3. For each second-level directory,
# .gz files are synced with 'Content-Encoding: gzip' and the remaining
# files are synced as-is; up to 4 directories are uploaded concurrently.
#
# Usage:    ./this_script.sh <input_dir> <s3_bucket>
# Requires: aws CLI, GNU parallel, realpath (GNU coreutils)
# Layout:   <input_dir>/index.json(.gz) and <input_dir>/dataset/ must exist.
set -euo pipefail
trap "exit" INT

export INPUT_DIR="${1:? Pass input directory as first parameter}"
export S3_BUCKET="${2:? Pass S3 bucket name as second parameter}"

# Validate the expected directory layout before uploading anything.
# (Without the explicit 'exit 1' the script would continue on bad input.)
if [ ! -d "${INPUT_DIR}/dataset/" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain subdirectory 'dataset/'" >&2
  exit 1
fi

if [ ! -f "${INPUT_DIR}/index.json" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain 'index.json'" >&2
  exit 1
fi

echo "Uploading"

# Path of "$1" relative to INPUT_DIR — used as the S3 key prefix.
function bucket_path {
  realpath --relative-to="${INPUT_DIR}" "${1}"
}
export -f bucket_path

# Sync only .gz files, marking them so S3/CloudFront serves them
# with 'Content-Encoding: gzip'.
function upload_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" --content-encoding=gzip --exclude "*" --include "*.gz" "${INPUT_DIR}/${1}" "s3://${S3_BUCKET}/${1}"
}
export -f upload_gzip

# Sync everything except .gz files, with no content-encoding override.
function upload_non_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" --exclude "*.gz" "${INPUT_DIR}/${1}" "s3://${S3_BUCKET}/${1}"
}
export -f upload_non_gzip

# Upload one directory: run the gzip and non-gzip syncs in parallel.
function upload_one_directory() {
  echo "Uploading '${1}'"
  local rel
  rel="$(bucket_path "${1}")"   # quoted: directory names may contain spaces
  parallel ::: "upload_gzip ${rel}" "upload_non_gzip ${rel}"
}
export -f upload_one_directory

# index.json is uploaded separately in both plain and gzipped form.
aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" "${INPUT_DIR}/index.json" "s3://${S3_BUCKET}/index.json"
aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" --content-encoding=gzip "${INPUT_DIR}/index.json.gz" "s3://${S3_BUCKET}/index.json.gz"

find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel -j 4 upload_one_directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/usr/bin/env bash
# Upload an entire dataset tree to S3 with two parallel 'aws s3 sync'
# passes: one for .gz files (served with 'Content-Encoding: gzip') and
# one for everything else.
#
# Usage:    ./this_script.sh <input_dir> <s3_bucket>
# Requires: aws CLI, GNU parallel
# Layout:   <input_dir>/index.json and <input_dir>/dataset/ must exist.
set -euo pipefail
trap "exit" INT

export INPUT_DIR="${1:? Pass input directory as first parameter}"
export S3_BUCKET="${2:? Pass S3 bucket name as second parameter}"

# Validate the expected directory layout before uploading anything.
# (Without the explicit 'exit 1' the script would continue on bad input.)
if [ ! -d "${INPUT_DIR}/dataset/" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain subdirectory 'dataset/'" >&2
  exit 1
fi

if [ ! -f "${INPUT_DIR}/index.json" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain 'index.json'" >&2
  exit 1
fi

cd "${INPUT_DIR}"

# If parallel version does not work for you, here is a serial version
#
# echo "Upload gz"
# aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" \
#   --content-encoding=gzip --exclude "*" --include "*.gz" . "s3://${S3_BUCKET}"
#
# echo "Upload non-gz"
# aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" \
#   --exclude "*.gz" . "s3://${S3_BUCKET}"

# Sync only .gz files, flagged as gzip-encoded.
function upload_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" \
    --content-encoding=gzip --exclude "*" --include "*.gz" \
    . "s3://${S3_BUCKET}"
}
export -f upload_gzip

# Sync everything except .gz files.
function upload_non_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" \
    --exclude "*.gz" \
    . "s3://${S3_BUCKET}"
}
export -f upload_non_gzip

# Run both sync passes concurrently.
parallel ::: upload_gzip upload_non_gzip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#!/usr/bin/env bash
# Upload a dataset tree to S3, streaming each file through 'aws s3 cp -'
# with a Content-Type derived from its extension and, where applicable,
# a Content-Encoding header. For every source file, plain, .gz and .br
# variants are uploaded (archives like .tar.gz and .zip are passed/skipped
# as-is).
#
# Usage:    ./this_script.sh <input_dir> <s3_bucket>
# Requires: aws CLI, GNU parallel, pigz, brotli

set -euxo pipefail

export INPUT_DIR="${1:? Pass input directory as first parameter}"
export S3_BUCKET="${2:? Pass S3 bucket name as second parameter}"

# Print the --content-encoding flag matching the file extension, or
# nothing for uncompressed files. Callers expand the result UNQUOTED on
# purpose so that an empty result contributes no argument at all.
content_encoding() {
  case "$1" in
    *.gz) echo --content-encoding=gzip;;
    *.br) echo --content-encoding=br;;
    *) echo '';;
  esac
}
export -f content_encoding

# Print the --content-type flag for a file, looking through one layer of
# compression/archive suffix (e.g. data.json.gz -> application/json).
content_type() {
  f="${1%.gz}"
  f="${f%.br}"
  f="${f%.zip}"
  f="${f%.xz}"
  f="${f%.zst}"
  f="${f%.zstd}"
  f="${f%.tar}"
  case "${f}" in
    *.apng) echo --content-type=image/apng;;
    *.avif) echo --content-type=image/avif;;
    *.bz2) echo --content-type=application/x-bzip2;;
    *.css) echo --content-type=text/css;;
    *.csv) echo --content-type=text/csv;;
    *.gif) echo --content-type=image/gif;;
    *.gz) echo --content-type=application/gzip;;
    *.htm | *.html) echo --content-type=text/html;;
    *.ico) echo --content-type=image/x-icon;;
    *.jpg | *.jpeg) echo --content-type=image/jpeg;;
    *.js | *.mjs) echo --content-type=text/javascript;;
    *.json) echo --content-type=application/json;;
    *.ndjson) echo --content-type=application/x-ndjson;;
    *.png) echo --content-type=image/png;;
    *.svg) echo --content-type=image/svg+xml;;
    *.tar) echo --content-type=application/x-tar;;
    *.tsv) echo --content-type=text/tab-separated-values;;
    *.wasm) echo --content-type=application/wasm;;
    *.webp) echo --content-type=image/webp;;
    *.xz) echo --content-type=application/x-xz;;
    *.zip) echo --content-type=application/zip;;
    *.zst) echo --content-type=application/zstd;;
    *) echo --content-type=text/plain;;
  esac
}
export -f content_type

# Upload stdin to s3://<bucket>/<key ($1)> with the appropriate headers.
# The $(...) expansions are deliberately unquoted: each yields either one
# flag token or nothing.
upload() {
  # shellcheck disable=SC2046,SC2086
  aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" $(content_encoding "${1}") $(content_type "${1}") "-" "s3://${S3_BUCKET}/${1}"
}
export -f upload

# Upload one file in all applicable variants (plain, .gz, .br).
upload_one_file() {
  if [[ "${1}" == *.gz ]] && [[ "${1}" != *.tar.gz ]]; then
    # Source is already gzipped (but not a tarball): upload the
    # decompressed form, the .gz as-is, and a recompressed .br.
    pigz -cdk "${1}" | upload "${1%.gz}"
    upload "${1}" < "${1}"
    pigz -cdk "${1}" | brotli -cf - | upload "${1%.gz}.br"
  elif [[ "${1}" != *.zip ]]; then
    # Plain file: upload as-is, plus gzipped and brotli'd variants.
    # BUG FIX: pigz/brotli need -c here — without it they write to a
    # sibling file and pipe NOTHING to 'upload', producing empty objects.
    upload "${1}" < "${1}"
    pigz -ckf "${1}" | upload "${1}.gz"
    brotli -cf "${1}" | upload "${1}.br"
  fi
}
export -f upload_one_file

# Upload every regular file directly inside directory $1, 20 at a time.
upload_one_directory() {
  echo "Uploading '${1}'"
  find "${1}/" -mindepth 1 -maxdepth 1 -type f | parallel -j 20 --lb upload_one_file
}
export -f upload_one_directory

find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel -j 8 upload_one_directory