diff options
| author | Matthijs van der Wild <matthijs.van-der-wild@durham.ac.uk> | 2024-09-30 16:19:51 +0100 |
|---|---|---|
| committer | Matthijs van der Wild <matthijs.van-der-wild@durham.ac.uk> | 2024-09-30 16:19:51 +0100 |
| commit | 9246d90121fb9beb87796ca5dc9b8758daaaeb45 (patch) | |
| tree | d8ac9bdcf3fc527150bd0b008e453be4da6b2a84 | |
Initialise repositories
| -rw-r--r-- | README.md | 44 | ||||
| -rw-r--r-- | pilot.sh | 142 |
2 files changed, 186 insertions, 0 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..ba7adb8 --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ +# LOFAR PILOT + +This is a small pipeline runner script that wraps Common Workflow Language ([CWL](https://www.commonwl.org/)) pipelines with [toil](https://toil.readthedocs.io). +It is compatible with [LINC](https://git.astron.nl/RD/LINC) and the [VLBI](https://git.astron.nl/RD/VLBI-cwl/) pipelines. +*This is a work in progress. +Issues should be reported to [Matthijs van der Wild](mailto:matthijs.van-der-wild@durham.ac.uk).* + +## Assumptions + +This script assumes the following: +* All relevant input data is available in either the `$HOME` directory or in a directory henceforth called `$BINDDIR`. + Targets of any links in these directories should be accessible to the compute directories, as these will be mounted during relevant jobs. +* The output will be written to a results directory in `$BINDDIR`. +* This script will be used with the SLURM queuing system on COSMA5 with the following options: `-p cosma5 -A durham -t 72:00:00`. + If these options are not appropriate or if this script is to be run on other SLURM-run clusters one must set `$TOIL_SLURM_ARGS` prior to running. +* `$CWL_SINGULARITY_CACHE` is set and the corresponding path contains (a link to) a singularity container `vlbi-cwl.sif`. + If it isn't set a suitable container can be specified as detailed below. + +## Execution + +The script can be run as follows: +``` +sh pilot.sh [options] <workflow name> $BINDDIR +``` +Options can be the following: +* `-h` prints the script usage with all available options (optional). +* `-r` restarts a failed pipeline, if this script was run before but the pipeline failed. +* `-c` allows the pipeline to use the specified container (optional, VLBI pipeline only). +* `-f` points to your input JSON file (it can be any appropriate JSON file, as long as it is located in either `$HOME` or `$BINDDIR`).
+* `-p` is a path to the pipeline repository (LINC and VLBI pipeline only). +* `--scratch` is a path to local scratch storage where temporary data can be written to (optional). + **`--scratch` must be local to the compute node. + Nonlocal scratch storage will likely cause the pipeline to fail.** +* `<workflow name>` is the workflow file name without extension, e.g. `delay-calibration` or `concatenate-flag` for the VLBI pipeline or `HBA_calibrator` or `HBA_target` for LINC. + +## Notes + +* Upon successful pipeline completion the results directory contains the following: + * The pipeline data products, + * the statistics gathered by toil. +* Jobstore files and intermediate pipeline data products are stored in a `toil` directory in `$BINDDIR`. +* Jobstore files can be removed by running `toil clean $BINDDIR/toil/<workflow>_job`. +* Toil may not clear temporary files after the pipeline has finished. + These have to be removed by hand. diff --git a/pilot.sh b/pilot.sh new file mode 100644 index 0000000..b41f6fd --- /dev/null +++ b/pilot.sh @@ -0,0 +1,142 @@ +#!/bin/sh -eu + +usage() { + echo "Usage: ${PROGRAM} [-r|--restart] [-h|--help] [-c|--container <singularity container>] [--scratch <scratch dir>] (-f <input file>) (-p <pipeline>) <Workflow> <input dir>" + exit 0 +} + +error() { + echo "Error: $@" >&2 + exit 1 +} + +set_container() { + [ -f "${1}" ] || error "Container ${1} does not exist." + CWL_SINGULARITY_CACHE="${1%/*}" + if [ ! 
"${1##*/}" == "vlbi-cwl.sif" ]; then + ln -s "${1}" "$CWL_SINGULARITY_CACHE/vlbi-cwl.sif" + fi +} + +opts=$(getopt -o rhf:c:p: --long restart,help,container:,scratch: \ + -n 'pilot' -- "$@") + +eval set -- "$opts" + +RESTART="" +SCRATCH="" +while true; do + case "$1" in + -c | --container) set_container "${2}"; shift 2 ;; + -f ) INPUT_FILE="${2}"; shift 2 ;; + -h | --help ) usage ;; + -p ) PIPELINE="${2}"; shift 2 ;; + -r | --restart) RESTART="--restart"; shift ;; + --scratch) SCRATCH="${2}"; shift 2 ;; + * ) shift; break ;; + esac +done + +PROGRAM="${0##*/}" + +# TODO: clean this up +WORKFLOW="${PIPELINE}/workflows/${1}.cwl" +WORKFLOW_NAME=$(basename ${WORKFLOW%.cwl}) +[ -f "${WORKFLOW}" ] || error "$(realpath ${WORKFLOW}) is invalid." +INPUT_DIR="${2}" +[ -d "${INPUT_DIR}" ] || error "Input directory does not exist." + +[ -n "${INPUT_FILE}" ] || error "Missing input file." +[ -f "${INPUT_FILE}" ] || error "Invalid input file." +[ ! -z "${CWL_SINGULARITY_CACHE}" ] || error "\$CWL_SINGULARITY_CACHE is not set or no container has been specified." + +PIPELINE_LOG="${HOME}/${WORKFLOW_NAME}.log" + +TMP_OUTDIR="${INPUT_DIR}/toil/tmp/tmp/" +JOB_LOG_DIR="${INPUT_DIR}/toil/logs/" +BATCH_LOG_DIR="${TOIL_BATCH_LOGS_DIR:-${INPUT_DIR}/toil/logs}" +WORK_DIR="${INPUT_DIR}/toil/work/" +# TODO: optionally decouple output directory from input directory? +OUTPUT_DIR="${INPUT_DIR}/${WORKFLOW_NAME}_results" +JOBSTORE_DIR="${INPUT_DIR}/toil/${WORKFLOW_NAME}_job/" +STATS_DIR="${OUTPUT_DIR}/stats" + +mkdir -p "$JOB_LOG_DIR" +mkdir -p "$BATCH_LOG_DIR" +mkdir -p "$WORK_DIR" +mkdir -p "$OUTPUT_DIR" +mkdir -p "$STATS_DIR" + +TMPDIR_PREFIX="" +if [ ! 
-z "${SCRATCH}" ]; then + TMPDIR_PREFIX="--tmpdir-prefix ${SCRATCH}/tmp_${WORKFLOW_NAME}/" +fi + +# Print information relevant for the run +cat << EOF +The following will be used in the run: + +Pipeline ┃ ${WORKFLOW} +Input file ┃ ${INPUT_FILE} +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━ +Output directory ┃ ${OUTPUT_DIR} +Log directory ┃ ${JOB_LOG_DIR} +Jobstore directory ┃ ${JOBSTORE_DIR} +Intermediate output directory ┃ ${TMP_OUTDIR} +Pipeline statistics directory ┃ ${STATS_DIR} +EOF + +# TODO: set dummy variables that potentially use pre-set values +export APPTAINERENV_PATH="\$PATH:$PIPELINE/scripts" +export APPTAINERENV_PYTHONPATH="\$PYTHONPATH:$PIPELINE/scripts" +export APPTAINER_BIND="$HOME,$INPUT_DIR,$OUTPUT_DIR" + +TOIL_COMMAND="toil-cwl-runner ${RESTART} ${TMPDIR_PREFIX} \ + --singularity \ + --clean never \ + --retryCount 0 \ + --disableCaching \ + --logFile ${PIPELINE_LOG} \ + --writeLogs ${JOB_LOG_DIR} \ + --stats \ + --clusterStats ${STATS_DIR} \ + --batchSystem slurm \ + --batchLogsDir ${BATCH_LOG_DIR} \ + --tmp-outdir-prefix ${TMP_OUTDIR} \ + --workDir ${WORK_DIR} \ + --outdir ${OUTPUT_DIR} \ + --jobStore ${JOBSTORE_DIR} \ + --bypass-file-store \ + ${WORKFLOW} \ + ${INPUT_FILE}" + +export TOIL_SLURM_ARGS="${TOIL_SLURM_ARGS:-"-p cosma5 -A durham -t 72:00:00"}" +# Note the meaning of these SLURM options: +# -N # number of nodes +# -c # number of cores; available memory is tied to this if not specified separately +# -p # partition (queue); +# -A # project +# -t # runtime in d-hh:mm:ss format + +echo -e "env APPTAINERENV_PATH="$PIPELINE/scripts:\$PATH" \ + APPTAINERENV_PYTHONPATH="$PIPELINE/scripts:\$PYTHONPATH" \ + APPTAINER_BIND="$HOME,$INPUT_DIR,${OUTPUT_DIR}" \ + TOIL_SLURM_ARGS="${TOIL_SLURM_ARGS:-"-p cosma5 -A durham -t 72:00:00"}" \ + ${TOIL_COMMAND}" + +env APPTAINERENV_PATH="\$PATH:$PIPELINE/scripts" \ + APPTAINERENV_PYTHONPATH="\$PYTHONPATH:$PIPELINE/scripts" \ + APPTAINER_BIND="$HOME,$INPUT_DIR,$OUTPUT_DIR" \ + 
TOIL_SLURM_ARGS="${TOIL_SLURM_ARGS:--p cosma5 -A durham -t 72:00:00}" \ + ${TOIL_COMMAND} > ${OUTPUT_DIR}/${WORKFLOW_NAME}.out && STATUS=${?} || STATUS=${?} + +toil stats --raw ${JOBSTORE_DIR} > ${STATS_DIR}/${WORKFLOW_NAME}.stats.json || true +toil stats --pretty ${JOBSTORE_DIR} > ${STATS_DIR}/${WORKFLOW_NAME}.stats.txt || true + +echo -e "\nThe pipeline was run using\n\n${TOIL_COMMAND}\n" +if [ ${STATUS} -eq 0 ]; then + echo -e "\nPipeline finished successfully.\n" +else + echo -e "\nPipeline failed with exit status ${STATUS}.\n" +fi +exit ${STATUS} |