jobscript

#!/bin/bash
#Set job requirements
#We are going to use only one node (-N 1), and use the express queue (-p short). And we set the time to 4 minutes (-t 4:00)
#SBATCH -p short
#SBATCH -N 1
#SBATCH -t 4:00

#Make sure you have logged in to your iRODS zone prior to job submission. iRODS creates a irodsA file which is subsequently used by the worker nodes.

#move to your home directory and current git repository which is also mounted on your scratch space and might hold the processing script
cd $HOME/iRODS-RDM-HPC-course

rodscoll='/surfZone1/home/irods-user1/YOUR OUTPUT COLLECTION'

inputdir="$TMPDIR/inputdat$SLURM_JOBID"
outputdir="$TMPDIR/outputdat$SLURM_JOBID"
mkdir $inputdir
mkdir $outputdir

#search for data objects with iquest
#get these files from iRODS and store them under scratch
iquest "%s/%s" "select COLL_NAME, DATA_NAME where META_DATA_ATTR_NAME = 'author' and META_DATA_ATTR_VALUE = 'Lewis Carroll'" | parallel iget {} $inputdir

#perform the word count analysis
resultsfile=results$SLURM_JOBID.dat
cat $inputdir/* | tr '[:upper:]' '[:lower:]' | awk '{for(i=1;i<=NF;i++) count[$i]++} END {for(j in count) print j, count[j]}' > $outputdir/$resultsfile

#put results back into iRODS
iput $outputdir/$resultsfile $rodscoll

#add metadata provenance
#possible add a metadata annotation script call either locally via scipt file, rule file or server side via installed rules, where last is preferred but also difficult to implement.
imeta add -d $rodscoll/$resultsfile 'somekey' 'somevalue'
imeta ls -d $rodscoll/$resultsfile