From 9b36191569024fabfa551fe0f83e428db439d862 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Mon, 9 Sep 2024 14:00:28 +0100
Subject: [PATCH 1/7] memo_regenerator.sql: use LF for line endings

---
 src/dist/memo_regenerator.sql | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/dist/memo_regenerator.sql b/src/dist/memo_regenerator.sql
index ccd9f5a8..0abbba65 100644
--- a/src/dist/memo_regenerator.sql
+++ b/src/dist/memo_regenerator.sql
@@ -1,18 +1,18 @@
-COPY (SELECT * FROM (
-    SELECT image.id AS imageId,
-        pixels.id AS pixelsId,
-        image.series,
-        pixelstype.value AS pixelstype,
-        pixels.sizeX,
-        pixels.sizeY,
-        pixels.sizeZ,
-        pixels.sizeC,
-        pixels.sizeT,
-        format.value,
-        rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
-    FROM fileset
-    JOIN image ON fileset.id = image.fileset
-    JOIN pixels ON image.id = pixels.image
-    JOIN pixelstype ON pixels.pixelstype = pixelstype.id
-    JOIN format ON image.format = format.id
-) AS rank WHERE rank.rank = 1) TO STDOUT CSV;
+COPY (SELECT * FROM (
+    SELECT image.id AS imageId,
+        pixels.id AS pixelsId,
+        image.series,
+        pixelstype.value AS pixelstype,
+        pixels.sizeX,
+        pixels.sizeY,
+        pixels.sizeZ,
+        pixels.sizeC,
+        pixels.sizeT,
+        format.value,
+        rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
+    FROM fileset
+    JOIN image ON fileset.id = image.fileset
+    JOIN pixels ON image.id = pixels.image
+    JOIN pixelstype ON pixels.pixelstype = pixelstype.id
+    JOIN format ON image.format = format.id
+) AS rank WHERE rank.rank = 1) TO STDOUT CSV;

From 04424538fcc65215fabe00fbaae2cfb80823f357 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Mon, 9 Sep 2024 14:00:34 +0100
Subject: [PATCH 2/7] Remove CentOS 7 parallel options

---
 src/dist/regen-memo-files.sh | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index 5918b9d4..a4daa448 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -44,15 +44,9 @@ usage() {
 run_split_parallel_os_dep() {
     set -x
     export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
-    CENTOS_VERSION=$(cat /etc/centos-release |cut -f 3 -d' '|cut -d. -f 1)
     cd rslt.${DATESTR}
     split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
-    PARALLEL_OPTS="error"
-    if [ "${CENTOS_VERSION}" = "6" ]; then
-        PARALLEL_OPTS="--halt 2 --gnu --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --result . ${DRYRUN}"
-    else
-        PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
-    fi
+    PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
     set -x
     /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \
         ${MEMOIZER_HOME}/bin/memoregenerator \

From 8b3c9d752429d1884337d1e0d47a4d5840cf71dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Tue, 10 Sep 2024 09:32:04 +0100
Subject: [PATCH 3/7] Order the list of filesets to regenerate by the initial setId time

Similarly to what omero fs importtime does, this uses the difference
between the image creation timestamp and the end of the upload job
associated with the fileset to estimate the server-side time spent in
initializing the reader.
The SQL results are sorted in decreasing order of this estimated
initialization time.
---
 src/dist/memo_regenerator.sql | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/dist/memo_regenerator.sql b/src/dist/memo_regenerator.sql
index 0abbba65..4dc777a3 100644
--- a/src/dist/memo_regenerator.sql
+++ b/src/dist/memo_regenerator.sql
@@ -9,10 +9,16 @@ COPY (SELECT * FROM (
         pixels.sizeC,
         pixels.sizeT,
         format.value,
+        e2.time - e1.time AS setId,
         rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
     FROM fileset
     JOIN image ON fileset.id = image.fileset
     JOIN pixels ON image.id = pixels.image
     JOIN pixelstype ON pixels.pixelstype = pixelstype.id
     JOIN format ON image.format = format.id
-) AS rank WHERE rank.rank = 1) TO STDOUT CSV;
+    JOIN event e2 on image.creation_id=e2.id
+    JOIN filesetjoblink on filesetjoblink.parent=fileset.id
+    JOIN job on filesetjoblink.child=job.id
+    JOIN uploadjob on job.id=uploadjob.job_id
+    JOIN event e1 on job.update_id=e1.id
+) AS query WHERE query.rank = 1 ORDER BY query.setId desc) TO STDOUT CSV;

From 442341d10b30860a11dea23b8d585b2e434c6c07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Tue, 10 Sep 2024 09:34:41 +0100
Subject: [PATCH 4/7] Split SQL into $JOBS input files using round robin distribution

Using the chunks option with round-robin distribution should create as
many input files as there will be jobs and ensure that the projected
regeneration times are distributed as evenly as possible.
---
 src/dist/regen-memo-files.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index a4daa448..ba278bf7 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -45,7 +45,7 @@ run_split_parallel_os_dep() {
     set -x
     export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
     cd rslt.${DATESTR}
-    split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
+    split -a 3 -n r/$JOBS ${FULL_CSV} -d input.
     PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
     set -x
     /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \

From 7e22384683e9b0600d26fb4eb2e8a1514ff7063b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Thu, 19 Sep 2024 08:45:14 +0100
Subject: [PATCH 5/7] Compute the number of files to split into to never exceed BATCH_SIZE

---
 src/dist/regen-memo-files.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index ba278bf7..eda762ff 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -45,7 +45,10 @@ run_split_parallel_os_dep() {
     set -x
     export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
     cd rslt.${DATESTR}
-    split -a 3 -n r/$JOBS ${FULL_CSV} -d input.
+    # Split the CSV file into a multiple of JOBS files of at most BATCH_SIZE entries using round-robin distribution
+    N=$(wc -l ${FULL_CSV} | awk '{print $1}')
+    NFILES=$(( (($N - 1) / ($BATCH_SIZE * $JOBS) + 1 ) * $JOBS ))
+    split -a 3 -n r/$NFILES ${FULL_CSV} -d input.
     PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
     set -x
     /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \

From 08b1ad274a06dd5acc878e91bb397be600fa96eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Thu, 19 Sep 2024 08:45:54 +0100
Subject: [PATCH 6/7] Review utility usage

---
 src/dist/regen-memo-files.sh | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index eda762ff..b7029a1f 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -22,22 +22,28 @@
 usage() {
     echo "Usage:"
     echo "$0 [OPTIONS]"
-    echo "Regenerates bioformats memofiles"
+    echo "Regenerates Bio-Formats memo files in parallel"
+    echo
+    echo "This utility queries the OMERO database for a list of filesets, splits the output"
+    echo "into several input files and runs the memoregenerator utility using GNU parallel."
     echo
     echo " OPTIONS:"
-    echo "  --help display usage and exit"
-    echo "  --db database connection string"
-    echo "  --jobs max number of jobs to parallelize"
-    echo "  --memoizer-home Location of image-region-ms"
+    echo "  --batch-size Maximum number of entries in each input file sent to parallel (default: 500)"
+    echo "  --cache-options Memofile cache options [/path/to/dir | inplace] (required)"
+    echo "  --csv Bypass sql query and use this csv for image list"
+    echo "  --db Database connection string"
     echo "  --force-image-regen Force regeneration of image list even if it exists already"
+    echo "  --help Display usage and exit"
+    echo "  --jobs Maximum number of jobs to parallelize (default: number of processing units available)"
+    echo "  --memoizer-home Location of image-region micro-service (default: current directory)"
     echo "  --no-ask Do not ask for confirmation"
     echo "  --no-wait Do not wait to start generating -- DO IT NOW"
-    echo "  --cache-options Memofile cache options [/path/to/dir | inplace]"
-    echo "  --batch-size # of image files to split list into"
-    echo "  --csv Bypass sql query and use this csv for image list"
     echo
-    echo "Example:"
-    echo "  $0 --db postgresql://user:pass@host:port/db --jobs [12|max] --memoizer-home /opt/omero/OMERO.ms-image-region.current --cache-options /path/to/dir"
+    echo "Examples:"
+    echo "  Regenerate memo files using the current cache directory and all available CPUs"
+    echo "  $0 --cache-options inplace"
+    echo "  Regenerate memo files offline using a secondary cache directory and 4 CPUs"
+    echo "  $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
     exit $1
 }

From 68cfcfc228ffd2b3a10c949184e8a33f63681c73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Besson?=
Date: Mon, 13 Jan 2025 12:06:07 +0000
Subject: [PATCH 7/7] Reinclude --db option to the documentation string

---
 src/dist/regen-memo-files.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
index 5d561628..6d28af03 100755
--- a/src/dist/regen-memo-files.sh
+++ b/src/dist/regen-memo-files.sh
@@ -44,6 +44,8 @@ usage() {
     echo "  $0 --cache-options inplace"
     echo "  Regenerate memo files offline using a secondary cache directory and 4 CPUs"
     echo "  $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
+    echo "  Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string"
+    echo "  $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
     exit $1
 }