chore: add scripts for data compression and upload
ivan-aksamentov committed Aug 24, 2022
1 parent fabc721 commit e12bbcb
Showing 4 changed files with 188 additions and 0 deletions.
20 changes: 20 additions & 0 deletions scripts/s3_compress.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
set -euo pipefail
trap "exit" INT

export INPUT_DIR="${1:? Pass input directory as first parameter}"

if [ ! -d "${INPUT_DIR}/dataset/" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain subdirectory 'dataset/'" >&2
  exit 1
fi

if [ ! -f "${INPUT_DIR}/index.json" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain 'index.json'" >&2
  exit 1
fi

echo "Ungzipping"
find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel --lb pigz -dfrq || true

echo "Gzipping"
pigz -fkq "${INPUT_DIR}/index.json"
find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel --lb pigz -kfrq || true
49 changes: 49 additions & 0 deletions scripts/s3_upload_fast.sh
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
set -euo pipefail
trap "exit" INT

export INPUT_DIR="${1:? Pass input directory as first parameter}"
export S3_BUCKET="${2:? Pass S3 bucket name as second parameter}"

if [ ! -d "${INPUT_DIR}/dataset/" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain subdirectory 'dataset/'" >&2
  exit 1
fi

if [ ! -f "${INPUT_DIR}/index.json" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain 'index.json'" >&2
  exit 1
fi

#echo "Ungzipping"
#find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel --lb pigz -dfrq || true

#echo "Gzipping"
#pigz -fkq "${INPUT_DIR}/index.json"
#find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel --lb pigz -kfrq || true

echo "Uploading"

function bucket_path {
  realpath --relative-to="${INPUT_DIR}" "${1}"
}
export -f bucket_path

function upload_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" \
    --content-encoding=gzip --exclude "*" --include "*.gz" \
    "${INPUT_DIR}/${1}" "s3://${S3_BUCKET}/${1}"
}
export -f upload_gzip

function upload_non_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" \
    --exclude "*.gz" \
    "${INPUT_DIR}/${1}" "s3://${S3_BUCKET}/${1}"
}
export -f upload_non_gzip

function upload_one_directory() {
  echo "Uploading '${1}'"
  parallel ::: "upload_gzip $(bucket_path "${1}")" "upload_non_gzip $(bucket_path "${1}")"
}
export -f upload_one_directory

aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" "${INPUT_DIR}/index.json" "s3://${S3_BUCKET}/index.json"
aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" --content-encoding=gzip "${INPUT_DIR}/index.json.gz" "s3://${S3_BUCKET}/index.json.gz"

find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel -j 4 upload_one_directory
42 changes: 42 additions & 0 deletions scripts/s3_upload_slow.sh
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
set -euo pipefail
trap "exit" INT

export INPUT_DIR="${1:? Pass input directory as first parameter}"
export S3_BUCKET="${2:? Pass S3 bucket name as second parameter}"

if [ ! -d "${INPUT_DIR}/dataset/" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain subdirectory 'dataset/'" >&2
  exit 1
fi

if [ ! -f "${INPUT_DIR}/index.json" ]; then
  echo "Invalid input directory: '${INPUT_DIR}'. Input directory should contain 'index.json'" >&2
  exit 1
fi

cd "${INPUT_DIR}"

# If the parallel version does not work for you, here is a serial version
# (note that `aws s3 cp` needs --recursive to copy a directory):
#
# echo "Upload gz"
# aws s3 cp --recursive --only-show-errors --cache-control "max-age=2592000, public" \
#   --content-encoding=gzip --exclude "*" --include "*.gz" . "s3://${S3_BUCKET}"
#
# echo "Upload non-gz"
# aws s3 cp --recursive --only-show-errors --cache-control "max-age=2592000, public" \
#   --exclude "*.gz" . "s3://${S3_BUCKET}"

function upload_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" \
    --content-encoding=gzip --exclude "*" --include "*.gz" \
    . "s3://${S3_BUCKET}"
}
export -f upload_gzip

function upload_non_gzip() {
  aws s3 sync --only-show-errors --cache-control "max-age=2592000, public" \
    --exclude "*.gz" \
    . "s3://${S3_BUCKET}"
}
export -f upload_non_gzip

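# Run both syncs concurrently; `sync` skips objects that are already up to date.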
parallel ::: upload_gzip upload_non_gzip
77 changes: 77 additions & 0 deletions scripts/s3_upload_smart.sh
@@ -0,0 +1,77 @@
#!/usr/bin/env bash
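# "Smart" variant: streams every file to S3 in plain, gzip and brotli forms,
# setting Content-Type and Content-Encoding per file extension.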

set -euxo pipefail

export INPUT_DIR="${1:? Pass input directory as first parameter}"
export S3_BUCKET="${2:? Pass S3 bucket name as second parameter}"

# Map a compression extension to the Content-Encoding header S3 should store.
content_encoding() {
  case "$1" in
    *.gz) echo --content-encoding=gzip;;
    *.br) echo --content-encoding=br;;
    *) echo '';;
  esac
}
export -f content_encoding

# Guess the Content-Type from the file extension, looking through a trailing
# compression extension (e.g. data.csv.gz is still text/csv). The .tar suffix
# is deliberately not stripped, so tarballs map to application/x-tar below.
content_type() {
  f="${1%.gz}"
  f="${f%.br}"
  f="${f%.zip}"
  f="${f%.xz}"
  f="${f%.zst}"
  f="${f%.zstd}"
  case "${f}" in
    *.apng) echo --content-type=image/apng;;
    *.avif) echo --content-type=image/avif;;
    *.bz2) echo --content-type=application/x-bzip2;;
    *.css) echo --content-type=text/css;;
    *.csv) echo --content-type=text/csv;;
    *.gif) echo --content-type=image/gif;;
    *.gz) echo --content-type=application/gzip;;
    *.htm | *.html) echo --content-type=text/html;;
    *.ico) echo --content-type=image/x-icon;;
    *.jpg | *.jpeg) echo --content-type=image/jpeg;;
    *.js | *.mjs) echo --content-type=text/javascript;;
    *.json) echo --content-type=application/json;;
    *.ndjson) echo --content-type=application/x-ndjson;;
    *.png) echo --content-type=image/png;;
    *.svg) echo --content-type=image/svg+xml;;
    *.tar) echo --content-type=application/x-tar;;
    *.tsv) echo --content-type=text/tab-separated-values;;
    *.wasm) echo --content-type=application/wasm;;
    *.webp) echo --content-type=image/webp;;
    *.xz) echo --content-type=application/x-xz;;
    *.zip) echo --content-type=application/zip;;
    *.zst) echo --content-type=application/zstd;;
    *) echo --content-type=text/plain;;
  esac
}
export -f content_type

# Stream stdin to S3 under the given key. The $(content_encoding ...) and
# $(content_type ...) expansions are deliberately left unquoted: when empty,
# they must disappear rather than become an empty argument.
upload() {
  aws s3 cp --only-show-errors --cache-control "max-age=2592000, public" \
    $(content_encoding "${1}") $(content_type "${1}") \
    "-" "s3://${S3_BUCKET}/${1}"
}
export -f upload

upload_one_file() {
  if [[ "${1}" == *.gz ]] && [[ "${1}" != *.tar.gz ]]; then
    # Input is already gzipped: upload the decompressed form, the .gz as-is,
    # and a brotli recompression.
    pigz -cdk "${1}" | upload "${1%.gz}"
    cat "${1}" | upload "${1}"
    pigz -cdk "${1}" | brotli -cf - | upload "${1%.gz}.br"
  elif [[ "${1}" != *.zip ]]; then
    # Plain input (zip archives are skipped): upload as-is, plus gzip and
    # brotli variants. Note -c so the compressors write to stdout.
    cat "${1}" | upload "${1}"
    pigz -cf "${1}" | upload "${1}.gz"
    brotli -cf "${1}" | upload "${1}.br"
  fi
}
export -f upload_one_file

upload_one_directory() {
  echo "Uploading '${1}'"
  find "${1}/" -mindepth 1 -maxdepth 1 -type f | parallel -j 20 --lb upload_one_file
}
export -f upload_one_directory

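# Eight directories at a time, each uploading up to 20 files concurrently.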
find "${INPUT_DIR}" -mindepth 2 -maxdepth 2 -type d | parallel -j 8 upload_one_directory
