Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve fileset distribution during the memo file regeneration #150

Merged
42 changes: 24 additions & 18 deletions src/dist/memo_regenerator.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
-- Writes a CSV listing to STDOUT containing, for every fileset, the row(s) of
-- the image with the lowest image.id in that fileset.
--
-- The inner query joins fileset -> image -> pixels -> pixelstype and
-- image -> format (all inner joins, so a fileset without a matching image,
-- pixels, pixels type or format produces no row) and numbers the rows of each
-- fileset with rank() ordered by image.id. The outer query keeps rank = 1.
--
-- Output columns, in order: imageId, pixelsId, series, pixelstype, sizeX,
-- sizeY, sizeZ, sizeC, sizeT, format value, rank. Because the outer query is
-- SELECT *, the rank column (always 1) is part of the CSV output.
-- NOTE(review): rank() gives ties the same rank, so an image joined to more
-- than one pixels row would yield several rows - confirm this cannot happen.
COPY (SELECT * FROM (
SELECT image.id AS imageId,
pixels.id AS pixelsId,
image.series,
pixelstype.value AS pixelstype,
pixels.sizeX,
pixels.sizeY,
pixels.sizeZ,
pixels.sizeC,
pixels.sizeT,
format.value,
rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
FROM fileset
JOIN image ON fileset.id = image.fileset
JOIN pixels ON image.id = pixels.image
JOIN pixelstype ON pixels.pixelstype = pixelstype.id
JOIN format ON image.format = format.id
) AS rank WHERE rank.rank = 1) TO STDOUT CSV;
-- Writes a CSV listing to STDOUT containing, for every fileset, the row(s) of
-- the image with the lowest image.id in that fileset, sorted by setId in
-- descending order.
--
-- setId is the difference e2.time - e1.time, where
--   e2 is the event referenced by image.creation_id, and
--   e1 is the event referenced by job.update_id of a job that is linked to
--      the fileset through filesetjoblink and has a row in uploadjob.
-- NOTE(review): presumably this approximates how long the fileset took to
-- import, so that the most expensive filesets come first in the list -
-- confirm against the consumer of this CSV.
--
-- All joins are inner joins: a fileset lacking any of the joined rows
-- (image, pixels, pixels type, format, events, upload job) produces no row.
--
-- Output columns, in order: imageId, pixelsId, series, pixelstype, sizeX,
-- sizeY, sizeZ, sizeC, sizeT, format value, setId, rank. Because the outer
-- query is SELECT *, both setId and rank (always 1) are part of the CSV.
COPY (SELECT * FROM (
SELECT image.id AS imageId,
pixels.id AS pixelsId,
image.series,
pixelstype.value AS pixelstype,
pixels.sizeX,
pixels.sizeY,
pixels.sizeZ,
pixels.sizeC,
pixels.sizeT,
format.value,
e2.time - e1.time AS setId,
rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
FROM fileset
JOIN image ON fileset.id = image.fileset
JOIN pixels ON image.id = pixels.image
JOIN pixelstype ON pixels.pixelstype = pixelstype.id
JOIN format ON image.format = format.id
JOIN event e2 on image.creation_id=e2.id
JOIN filesetjoblink on filesetjoblink.parent=fileset.id
JOIN job on filesetjoblink.child=job.id
JOIN uploadjob on job.id=uploadjob.job_id
JOIN event e1 on job.update_id=e1.id
) AS query WHERE query.rank = 1 ORDER BY query.setId desc) TO STDOUT CSV;
41 changes: 23 additions & 18 deletions src/dist/regen-memo-files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,37 +22,42 @@
#######################################
# Print the command-line help for this script and terminate the shell.
# Arguments:
#   $1 - exit status to terminate with (defaults to 0 when omitted)
# Outputs:
#   Help text on stdout.
# Returns:
#   Never returns; always calls exit.
#######################################
usage() {
  echo "Usage:"
  echo "$0 [OPTIONS]"
  echo "Regenerates Bio-Formats memo files in parallel"
  echo
  echo "This utility queries the OMERO database for a list of filesets, splits the output"
  echo "into several input files and runs the memoregenerator utility using GNU parallel."
  echo
  # Every option is listed exactly once, in alphabetical order, so the help
  # never shows two conflicting descriptions for the same flag.
  echo " OPTIONS:"
  echo " --batch-size Maximum number of entries in each input file sent to parallel (default: 500)"
  echo " --cache-options Memofile cache options [/path/to/dir | inplace] (required)"
  echo " --csv Bypass sql query and use this csv for image list"
  echo " --db Database connection string"
  echo " --force-image-regen Force regeneration of image list even if it exists already"
  echo " --help Display usage and exit"
  echo " --jobs Maximum number of jobs to parallelize (default: number of processing units available)"
  echo " --memoizer-home Location of image-region micro-service (default: current directory)"
  echo " --no-ask Do not ask for confirmation"
  echo " --no-wait Do not wait to start generating -- DO IT NOW"
  echo
  echo "Examples:"
  echo " Regenerate memo files using the current cache directory and all available CPUs"
  echo " $0 --cache-options inplace"
  echo " Regenerate memo files offline using a secondary cache directory and 4 CPUs"
  echo " $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
  echo " Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string"
  echo " $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
  # Quoted so the status is never word-split; falls back to 0 when the
  # caller passes no argument.
  exit "${1:-0}"
}

run_split_parallel_os_dep() {
set -x
export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
CENTOS_VERSION=$(cat /etc/centos-release |cut -f 3 -d' '|cut -d. -f 1)
cd rslt.${DATESTR}
split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
PARALLEL_OPTS="error"
if [ "${CENTOS_VERSION}" = "6" ]; then
PARALLEL_OPTS="--halt 2 --gnu --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --result . ${DRYRUN}"
else
PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
fi
# Split the CSV file into N * JOBS files of at most BATCH_SIZE entries using round-robin distribution
N=$(wc -l ${FULL_CSV} | awk '{print $1}')
NFILES=$(( (($N - 1) / ($BATCH_SIZE * $JOBS) + 1 ) * $JOBS ))
split -a 3 -n r/$NFILES ${FULL_CSV} -d input.
PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
set -x
/usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \
${MEMOIZER_HOME}/bin/memoregenerator \
Expand Down
Loading