# Copyright 2020 John Harwell, All rights reserved.
#
# SPDX-License-Identifier: MIT
"""
HPC plugin for running SIERRA on HPC clusters using the SLURM scheduler.
"""
# Core packages
import typing as tp
import argparse
import shutil
import pathlib
import os
# 3rd party packages
import implements
# Project packages
from sierra.core import types
from sierra.core.experiment import bindings
@implements.implements(bindings.IExpShellCmdsGenerator)
class ExpShellCmdsGenerator():
    """Generate the cmd to correctly invoke GNU Parallel on SLURM HPC.

    Only :meth:`exec_exp_cmds` produces commands; the pre- and
    post-experiment hooks return empty lists.
    """

    def __init__(self,
                 cmdopts: types.Cmdopts,
                 exp_num: int) -> None:
        """Store the cmdline options.

        Args:
            cmdopts: Options dictionary; kept on the instance as
                     ``self.cmdopts``.

            exp_num: Accepted as part of the constructor signature but not
                     used by this class.
        """
        self.cmdopts = cmdopts

    def pre_exp_cmds(self) -> tp.List[types.ShellCmdSpec]:
        """Return the cmds to run before an experiment: none."""
        return []

    def post_exp_cmds(self) -> tp.List[types.ShellCmdSpec]:
        """Return the cmds to run after an experiment: none."""
        return []

    def exec_exp_cmds(self, exec_opts: types.StrDict) -> tp.List[types.ShellCmdSpec]:
        """Build the shell cmds which run an experiment via GNU parallel.

        The returned cmds, in order:

        #. Write the hostnames in ``$SLURM_JOB_NODELIST`` to
           ``<exp_input_root>/<SLURM_JOB_ID>-nodelist.txt`` using
           ``scontrol``.

        #. Export ``PARALLEL_SHELL`` as the path to ``bash``. Omitted if
           ``bash`` cannot be found on ``PATH``.

        #. Invoke ``parallel``, reading the cmds to run from the cmdfile on
           stdin and using the nodelist file as the ``--sshloginfile``.

        Args:
            exec_opts: Must contain the keys ``exp_input_root``,
                       ``exec_resume``, ``scratch_dir``, ``n_jobs``,
                       ``cmdfile_stem_path``, and ``cmdfile_ext``.

        Returns:
            The cmd specs described above; each has ``shell=True`` and
            ``wait=True``.

        Raises:
            KeyError: If ``SLURM_JOB_ID`` is not set in the environment, or
                      if a required key is missing from ``exec_opts``.
        """
        jobid = os.environ['SLURM_JOB_ID']
        nodelist = pathlib.Path(exec_opts['exp_input_root'],
                                f"{jobid}-nodelist.txt")

        # This can't be --resume, because then GNU parallel looks at the
        # results directory, and if there is stuff in it, (apparently) assumes
        # that the job finished...
        resume = '--resume-failed' if exec_opts['exec_resume'] else ''

        cmds = [
            types.ShellCmdSpec(
                cmd=f'scontrol show hostnames $SLURM_JOB_NODELIST > {nodelist}',
                shell=True,
                wait=True)
        ]

        # Make sure GNU parallel uses the right shell, because it seems to
        # default to /bin/sh since all cmds are run in a python shell which
        # does not have $SHELL set.
        #
        # shutil.which() returns None if bash is not on PATH; in that case
        # skip the export instead of setting PARALLEL_SHELL to the literal
        # string "None".
        shell = shutil.which('bash')
        if shell is not None:
            cmds.append(types.ShellCmdSpec(cmd=f'export PARALLEL_SHELL={shell}',
                                           shell=True,
                                           wait=True))

        scratch_dir = exec_opts['scratch_dir']
        n_jobs = exec_opts['n_jobs']
        log = pathlib.Path(scratch_dir, "parallel.log")
        cmdfile = exec_opts['cmdfile_stem_path'] + exec_opts['cmdfile_ext']

        # The scratch dir is used both for the per-job results and as the
        # working directory for the jobs.
        parallel = (f'parallel {resume} '
                    f'--jobs {n_jobs} '
                    f'--results {scratch_dir} '
                    f'--joblog {log} '
                    f'--sshloginfile {nodelist} '
                    f'--workdir {scratch_dir} < "{cmdfile}"')

        cmds.append(types.ShellCmdSpec(cmd=parallel,
                                       shell=True,
                                       wait=True))
        return cmds
__api__ = [
'ParsedCmdlineConfigurer',
'ExpShellCmdsGenerator'
]