From 9b36191569024fabfa551fe0f83e428db439d862 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Mon, 9 Sep 2024 14:00:28 +0100
Subject: [PATCH 1/7] memo_regenerator.sql: use LF for line endings

---
 src/dist/memo_regenerator.sql | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/dist/memo_regenerator.sql b/src/dist/memo_regenerator.sql
index ccd9f5a8..0abbba65 100644
--- a/src/dist/memo_regenerator.sql
+++ b/src/dist/memo_regenerator.sql
@@ -1,18 +1,18 @@
-COPY (SELECT * FROM (
-    SELECT image.id AS imageId,
-        pixels.id AS pixelsId,
-        image.series,
-        pixelstype.value AS pixelstype,
-        pixels.sizeX,
-        pixels.sizeY,
-        pixels.sizeZ,
-        pixels.sizeC,
-        pixels.sizeT,
-        format.value,
-        rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
-    FROM fileset
-    JOIN image ON fileset.id = image.fileset
-    JOIN pixels ON image.id = pixels.image
-    JOIN pixelstype ON pixels.pixelstype = pixelstype.id
-    JOIN format ON image.format = format.id
-) AS rank WHERE rank.rank = 1) TO STDOUT CSV;
+COPY (SELECT * FROM (
+    SELECT image.id AS imageId,
+        pixels.id AS pixelsId,
+        image.series,
+        pixelstype.value AS pixelstype,
+        pixels.sizeX,
+        pixels.sizeY,
+        pixels.sizeZ,
+        pixels.sizeC,
+        pixels.sizeT,
+        format.value,
+        rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
+    FROM fileset
+    JOIN image ON fileset.id = image.fileset
+    JOIN pixels ON image.id = pixels.image
+    JOIN pixelstype ON pixels.pixelstype = pixelstype.id
+    JOIN format ON image.format = format.id
+) AS rank WHERE rank.rank = 1) TO STDOUT CSV;

From 04424538fcc65215fabe00fbaae2cfb80823f357 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Mon, 9 Sep 2024 14:00:34 +0100
Subject: [PATCH 2/7] Remove CentOS 7 parallel options

---
 src/dist/regen-memo-files.sh | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index 5918b9d4..a4daa448 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -44,15 +44,9 @@ usage() {
 run_split_parallel_os_dep() {
     set -x
     export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
-    CENTOS_VERSION=$(cat /etc/centos-release |cut -f 3 -d' '|cut -d. -f 1)
     cd rslt.${DATESTR}
     split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
-    PARALLEL_OPTS="error"
-    if [ "${CENTOS_VERSION}" = "6" ]; then
-        PARALLEL_OPTS="--halt 2 --gnu --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --result . ${DRYRUN}"
-    else
-        PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
-    fi
+    PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
     set -x
     /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \
         ${MEMOIZER_HOME}/bin/memoregenerator \

From 8b3c9d752429d1884337d1e0d47a4d5840cf71dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Tue, 10 Sep 2024 09:32:04 +0100
Subject: [PATCH 3/7] Order the list of filesets to regenerate by the initial setId time

Similarly to what omero fs importtime does, this uses the difference
between the image creation timestamp and the end of the upload job
associated with the fileset to estimate the server-side time spent in
initializing the reader.
The SQL results are sorted in decreasing order of this estimated
initialization time.
---
 src/dist/memo_regenerator.sql | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/dist/memo_regenerator.sql b/src/dist/memo_regenerator.sql
index 0abbba65..4dc777a3 100644
--- a/src/dist/memo_regenerator.sql
+++ b/src/dist/memo_regenerator.sql
@@ -9,10 +9,16 @@ COPY (SELECT * FROM (
         pixels.sizeC,
         pixels.sizeT,
         format.value,
+        e2.time - e1.time AS setId,
         rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
     FROM fileset
     JOIN image ON fileset.id = image.fileset
     JOIN pixels ON image.id = pixels.image
     JOIN pixelstype ON pixels.pixelstype = pixelstype.id
     JOIN format ON image.format = format.id
-) AS rank WHERE rank.rank = 1) TO STDOUT CSV;
+    JOIN event e2 on image.creation_id=e2.id
+    JOIN filesetjoblink on filesetjoblink.parent=fileset.id
+    JOIN job on filesetjoblink.child=job.id
+    JOIN uploadjob on job.id=uploadjob.job_id
+    JOIN event e1 on job.update_id=e1.id
+) AS query WHERE query.rank = 1 ORDER BY query.setId desc) TO STDOUT CSV;

From 442341d10b30860a11dea23b8d585b2e434c6c07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Tue, 10 Sep 2024 09:34:41 +0100
Subject: [PATCH 4/7] Split SQL into $JOBS input files using round robin distribution

Using the chunks option with round-robin distribution should create as
many input files as there will be jobs and ensure that the projected
regeneration times are distributed as evenly as possible.
---
 src/dist/regen-memo-files.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index a4daa448..ba278bf7 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -45,7 +45,7 @@ run_split_parallel_os_dep() {
     set -x
     export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
     cd rslt.${DATESTR}
-    split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
+    split -a 3 -n r/$JOBS ${FULL_CSV} -d input.
     PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
     set -x
     /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \

From 7e22384683e9b0600d26fb4eb2e8a1514ff7063b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Thu, 19 Sep 2024 08:45:14 +0100
Subject: [PATCH 5/7] Compute the number of files to split into to never exceed BATCH_SIZE

---
 src/dist/regen-memo-files.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index ba278bf7..eda762ff 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -45,7 +45,10 @@ run_split_parallel_os_dep() {
     set -x
     export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
     cd rslt.${DATESTR}
-    split -a 3 -n r/$JOBS ${FULL_CSV} -d input.
+    # Split the CSV file into a multiple of JOBS files of at most BATCH_SIZE entries using round-robin distribution
+    N=$(wc -l ${FULL_CSV} | awk '{print $1}')
+    NFILES=$(( (($N - 1) / ($BATCH_SIZE * $JOBS) + 1 ) * $JOBS ))
+    split -a 3 -n r/$NFILES ${FULL_CSV} -d input.
     PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
     set -x
     /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \

From 08b1ad274a06dd5acc878e91bb397be600fa96eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Thu, 19 Sep 2024 08:45:54 +0100
Subject: [PATCH 6/7] Review utility usage

---
 src/dist/regen-memo-files.sh | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index eda762ff..b7029a1f 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -22,22 +22,28 @@
 usage() {
     echo "Usage:"
     echo "$0 [OPTIONS]"
-    echo "Regenerates bioformats memofiles"
+    echo "Regenerates Bio-Formats memo files in parallel"
+    echo
+    echo "This utility queries the OMERO database for a list of filesets, splits the output"
+    echo "into several input files and runs the memoregenerator utility using GNU parallel."
     echo
     echo " OPTIONS:"
-    echo "  --help display usage and exit"
-    echo "  --db database connection string"
-    echo "  --jobs max number of jobs to parallelize"
-    echo "  --memoizer-home Location of image-region-ms"
+    echo "  --batch-size Maximum number of entries in each input file sent to parallel (default: 500)"
+    echo "  --cache-options Memofile cache options [/path/to/dir | inplace] (required)"
+    echo "  --csv Bypass sql query and use this csv for image list"
+    echo "  --db Database connection string"
     echo "  --force-image-regen Force regeneration of image list even if it exists already"
+    echo "  --help Display usage and exit"
+    echo "  --jobs Maximum number of jobs to parallelize (default: number of processing units available)"
+    echo "  --memoizer-home Location of image-region micro-service (default: current directory)"
     echo "  --no-ask Do not ask for confirmation"
     echo "  --no-wait Do not wait to start generating -- DO IT NOW"
-    echo "  --cache-options Memofile cache options [/path/to/dir | inplace]"
-    echo "  --batch-size # of image files to split list into"
-    echo "  --csv Bypass sql query and use this csv for image list"
     echo
-    echo "Example:"
-    echo "  $0 --db postgresql://user:pass@host:port/db --jobs [12|max] --memoizer-home /opt/omero/OMERO.ms-image-region.current --cache-options /path/to/dir"
+    echo "Examples:"
+    echo "  Regenerate memo files using the current cache directory and all available CPUs"
+    echo "  $0 --cache-options inplace"
+    echo "  Regenerate memo files offline using a secondary cache directory and 4 CPUs"
+    echo "  $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
     exit $1
 }

From 68cfcfc228ffd2b3a10c949184e8a33f63681c73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Mon, 13 Jan 2025 12:06:07 +0000
Subject: [PATCH 7/7] Reinclude --db option to the documentation string

---
 src/dist/regen-memo-files.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index 5d561628..6d28af03 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -44,6 +44,8 @@ usage() {
     echo "  $0 --cache-options inplace"
     echo "  Regenerate memo files offline using a secondary cache directory and 4 CPUs"
     echo "  $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
+    echo "  Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string"
+    echo "  $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
     exit $1
 }