diff options
| author | Matthijs van der Wild <matthijs.van-der-wild@durham.ac.uk> | 2024-09-30 16:19:51 +0100 |
|---|---|---|
| committer | Matthijs van der Wild <matthijs.van-der-wild@durham.ac.uk> | 2024-09-30 16:19:51 +0100 |
| commit | 9246d90121fb9beb87796ca5dc9b8758daaaeb45 (patch) | |
| tree | d8ac9bdcf3fc527150bd0b008e453be4da6b2a84 | |
Initialise repositories
| -rw-r--r-- | README.md | 44 | ||||
| -rw-r--r-- | pilot.sh | 142 |
2 files changed, 186 insertions, 0 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..ba7adb8 --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ +# LOFAR PILOT + +This is a small pipeline runner script that wraps Common Workflow Language ([CWL](https://www.commonwl.org/)) pipelines with [toil](https://toil.readthedocs.io). +It is compatible with [LINC](https://git.astron.nl/RD/LINC) and the [VLBI](https://git.astron.nl/RD/VLBI-cwl/) pipelines. +*This is a work in progress. +Issues should be reported to [Matthijs van der Wild](mailto:matthijs.van-der-wild@durham.ac.uk).* + +## Assumptions + +This script assumes the following: +* All relevant input data is available in either the `$HOME` directory or in a directory henceforth called `$BINDDIR`. + Targets of any links in these directories should be accessible to the compute directories, as these will be mounted during relevant jobs. +* The output will be written to a results directory in `$BINDDIR`. +* This script will be used with the SLURM queuing system on COSMA5 with the following options: `-p cosma5 -A durham -t 72:00:00`. + If these options are not appropriate or if this script is to be run on other SLURM-run clusters one must set `$TOIL_SLURM_ARGS` prior to running. +* `$CWL_SINGULARITY_CACHE` is set and the corresponding path contains (a link to) a singularity container `vlbi-cwl.sif`. + If it isn't set a suitable container can be specified as detailed below. + +## Execution + +The script can be run as follows: +``` +sh pilot.sh [options] <workflow name> $BINDDIR +``` +Options can be the following: +* `-h` prints the script usage with all available options (optional). +* `-r` restarts a failed pipeline, if this script was run before but the pipeline failed. +* `-c` allows the pipeline to use the specified container (optional, VLBI pipeline only). +* `-f` points to your input JSON file (it can be any appropriate JSON file, as long as it is located in either `$HOME` or `$BINDDIR`).
+* `-p` is a path to the pipeline repository (LINC and VLBI pipeline only). +* `--scratch` is a path to local scratch storage where temporary data can be written to (optional). + **`--scratch` must be local to the compute node. + Nonlocal scratch storage will likely cause the pipeline to fail.** +* `<workflow name>` is the workflow file name without extension, e.g. `delay-calibration` or `concatenate-flag` for the VLBI pipeline or `HBA_calibrator` or `HBA_target` for LINC. + +## Notes + +* Upon successful pipeline completion the results directory contains the following: + * The pipeline data products, + * the statistics gathered by toil. +* Jobstore files and intermediate pipeline data products are stored in a `toil` directory in `$BINDDIR`. +* Jobstore files can be removed by running `toil clean $BINDDIR/toil/<workflow>_job`. +* Toil may not clear temporary files after the pipeline has finished. + These have to be removed by hand. diff --git a/pilot.sh b/pilot.sh new file mode 100644 index 0000000..b41f6fd --- /dev/null +++ b/pilot.sh @@ -0,0 +1,142 @@ +#!/bin/sh -eu + +usage() { + echo "Usage: ${PROGRAM} [-r|--restart] [-h|--help] [-c|--container <singularity container>] [--scratch <scratch dir>] (-f <input file>) (-p <pipeline>) <Workflow> <input dir>" + exit 0 +} + +error() { + echo "Error: $@" >&2 + exit 1 +} + +set_container() { + [ -f "${1}" ] || error "Container ${1} does not exist." + CWL_SINGULARITY_CACHE="${1%/*}" + if [ ! 
"${1##*/}" == "vlbi-cwl.sif" ]; then + ln -s "${1}" "$CWL_SINGULARITY_CACHE/vlbi-cwl.sif" + fi +} + +opts=$(getopt -o rhf:c:p: --long restart,help,container:,scratch: \ + -n 'pilot' -- "$@") + +eval set -- "$opts" + +RESTART="" +SCRATCH="" +while true; do + case "$1" in + -c | --container) set_container "${2}"; shift 2 ;; + -f ) INPUT_FILE="${2}"; shift 2 ;; + -h | --help ) usage ;; + -p ) PIPELINE="${2}"; shift 2 ;; + -r | --restart) RESTART="--restart"; shift ;; + --scratch) SCRATCH="${2}"; shift 2 ;; + * ) shift; break ;; + esac +done + +PROGRAM="${0##*/}" + +# TODO: clean this up +WORKFLOW="${PIPELINE}/workflows/${1}.cwl" +WORKFLOW_NAME=$(basename ${WORKFLOW%.cwl}) +[ -f "${WORKFLOW}" ] || error "$(realpath ${WORKFLOW}) is invalid." +INPUT_DIR="${2}" +[ -d "${INPUT_DIR}" ] || error "Input directory does not exist." + +[ -n "${INPUT_FILE}" ] || error "Missing input file." +[ -f "${INPUT_FILE}" ] || error "Invalid input file." +[ ! -z "${CWL_SINGULARITY_CACHE}" ] || error "\$CWL_SINGULARITY_CACHE is not set or no container has been specified." + +PIPELINE_LOG="${HOME}/${WORKFLOW_NAME}.log" + +TMP_OUTDIR="${INPUT_DIR}/toil/tmp/tmp/" +JOB_LOG_DIR="${INPUT_DIR}/toil/logs/" +BATCH_LOG_DIR="${TOIL_BATCH_LOGS_DIR:-${INPUT_DIR}/toil/logs}" +WORK_DIR="${INPUT_DIR}/toil/work/" +# TODO: optionally decouple output directory from input directory? +OUTPUT_DIR="${INPUT_DIR}/${WORKFLOW_NAME}_results" +JOBSTORE_DIR="${INPUT_DIR}/toil/${WORKFLOW_NAME}_job/" +STATS_DIR="${OUTPUT_DIR}/stats" + +mkdir -p "$JOB_LOG_DIR" +mkdir -p "$BATCH_LOG_DIR" +mkdir -p "$WORK_DIR" +mkdir -p "$OUTPUT_DIR" +mkdir -p "$STATS_DIR" + +TMPDIR_PREFIX="" +if [ ! 
-z "${SCRATCH}" ]; then + TMPDIR_PREFIX="--tmpdir-prefix ${SCRATCH}/tmp_${WORKFLOW_NAME}/" +fi + +# Print information relevant for the run +cat << EOF +The following will be used in the run: + +Pipeline ┃ ${WORKFLOW} +Input file ┃ ${INPUT_FILE} +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━ +Output directory ┃ ${OUTPUT_DIR} +Log directory ┃ ${JOB_LOG_DIR} +Jobstore directory ┃ ${JOBSTORE_DIR} +Intermediate output directory ┃ ${TMP_OUTDIR} +Pipeline statistics directory ┃ ${STATS_DIR} +EOF + +# TODO: set dummy variables that potentially use pre-set values +export APPTAINERENV_PATH="\$PATH:$PIPELINE/scripts" +export APPTAINERENV_PYTHONPATH="\$PYTHONPATH:$PIPELINE/scripts" +export APPTAINER_BIND="$HOME,$INPUT_DIR,$OUTPUT_DIR" + +TOIL_COMMAND="toil-cwl-runner ${RESTART} ${TMPDIR_PREFIX} \ + --singularity \ + --clean never \ + --retryCount 0 \ + --disableCaching \ + --logFile ${PIPELINE_LOG} \ + --writeLogs ${JOB_LOG_DIR} \ + --stats \ + --clusterStats ${STATS_DIR} \ + --batchSystem slurm \ + --batchLogsDir ${BATCH_LOG_DIR} \ + --tmp-outdir-prefix ${TMP_OUTDIR} \ + --workDir ${WORK_DIR} \ + --outdir ${OUTPUT_DIR} \ + --jobStore ${JOBSTORE_DIR} \ + --bypass-file-store \ + ${WORKFLOW} \ + ${INPUT_FILE}" + +export TOIL_SLURM_ARGS="${TOIL_SLURM_ARGS:-"-p cosma5 -A durham -t 72:00:00"}" +# Note the meaning of these SLURM options: +# -N # number of nodes +# -c # number of cores; available memory is tied to this if not specified separately +# -p # partition (queue); +# -A # project +# -t # runtime in d-hh:mm:ss format + +echo -e "env APPTAINERENV_PATH="$PIPELINE/scripts:\$PATH" \ + APPTAINERENV_PYTHONPATH="$PIPELINE/scripts:\$PYTHONPATH" \ + APPTAINER_BIND="$HOME,$INPUT_DIR,${OUTPUT_DIR}" \ + TOIL_SLURM_ARGS="${TOIL_SLURM_ARGS:-"-p cosma5 -A durham -t 72:00:00"}" \ + ${TOIL_COMMAND}" + +env APPTAINERENV_PATH="\$PATH:$PIPELINE/scripts" \ + APPTAINERENV_PYTHONPATH="\$PYTHONPATH:$PIPELINE/scripts" \ + APPTAINER_BIND="$HOME,$INPUT_DIR,$OUTPUT_DIR" \ + 
TOIL_SLURM_ARGS="${TOIL_SLURM_ARGS:--p cosma5 -A durham -t 72:00:00}" \ + ${TOIL_COMMAND} > ${OUTPUT_DIR}/${WORKFLOW_NAME}.out && STATUS=${?} || STATUS=${?} + +toil stats --raw ${JOBSTORE_DIR} > ${STATS_DIR}/${WORKFLOW_NAME}.stats.json || true +toil stats --pretty ${JOBSTORE_DIR} > ${STATS_DIR}/${WORKFLOW_NAME}.stats.txt || true + +echo -e "\nThe pipeline was run using\n\n${TOIL_COMMAND}\n" +if [ ${STATUS} -eq 0 ]; then + echo -e "\nPipeline finished successfully.\n" +else + echo -e "\nPipeline failed with exit status ${STATUS}.\n" +fi +exit ${STATUS} |