# Source code for sierra.plugins.hpc.slurm.plugin

# Copyright 2020 John Harwell, All rights reserved.
#
#  SPDX-License-Identifier: MIT
"""
HPC plugin for running SIERRA on HPC clusters using the SLURM scheduler.
"""

# Core packages
import typing as tp
import argparse
import shutil
import pathlib
import os

# 3rd party packages
import implements

# Project packages
from sierra.core import types
from sierra.core.experiment import bindings


@implements.implements(bindings.IParsedCmdlineConfigurer)
class ParsedCmdlineConfigurer():
    """Configure SIERRA for SLURM HPC.

    Uses the following environment variables (if any of them are not defined
    an assertion will be triggered):

    - ``SLURM_CPUS_PER_TASK``

    - ``SLURM_TASKS_PER_NODE``

    - ``SLURM_JOB_NODELIST``

    - ``SLURM_JOB_ID``

    """

    def __init__(self, exec_env: str) -> None:
        # No per-instance state needed; configuration happens in __call__().
        pass

    def __call__(self, args: argparse.Namespace) -> None:
        # Sanity check: all of these are set by SLURM inside an allocation,
        # so their absence means we are not actually running under SLURM.
        required = ('SLURM_CPUS_PER_TASK',
                    'SLURM_TASKS_PER_NODE',
                    'SLURM_JOB_NODELIST',
                    'SLURM_JOB_ID')
        for k in required:
            assert k in os.environ,\
                f"Non-SLURM environment detected: '{k}' not found"

        assert not args.platform_vc,\
            "Platform visual capture not supported on SLURM"

        # SLURM_TASKS_PER_NODE can be set to things like '1(x32),3', indicating
        # that not all nodes will run the same # of tasks. SIERRA expects all
        # nodes to have the same # tasks allocated to each (i.e., a homogeneous
        # allocation), so we check for this.
        assert "," not in os.environ['SLURM_TASKS_PER_NODE'], \
            "SLURM_TASKS_PER_NODE not homogeneous"
@implements.implements(bindings.IExpShellCmdsGenerator)
class ExpShellCmdsGenerator():
    """Generate the cmd to correctly invoke GNU Parallel on SLURM HPC.

    """

    def __init__(self, cmdopts: types.Cmdopts, exp_num: int) -> None:
        self.cmdopts = cmdopts

    def pre_exp_cmds(self) -> tp.List[types.ShellCmdSpec]:
        # Nothing to do before an experiment on SLURM.
        return []

    def post_exp_cmds(self) -> tp.List[types.ShellCmdSpec]:
        # Nothing to do after an experiment on SLURM.
        return []

    def exec_exp_cmds(self,
                      exec_opts: types.StrDict) -> tp.List[types.ShellCmdSpec]:
        jobid = os.environ['SLURM_JOB_ID']
        nodelist = pathlib.Path(exec_opts['exp_input_root'],
                                f"{jobid}-nodelist.txt")

        # This can't be --resume, because then GNU parallel looks at the
        # results directory, and if there is stuff in it, (apparently) assumes
        # that the job finished...
        resume = '--resume-failed' if exec_opts['exec_resume'] else ''

        # Expand the (possibly compressed) SLURM node list into one hostname
        # per line so GNU parallel can consume it via --sshloginfile.
        unique_nodes = types.ShellCmdSpec(
            cmd=f'scontrol show hostnames $SLURM_JOB_NODELIST > {nodelist}',
            shell=True,
            wait=True)

        # Make sure GNU parallel uses the right shell, because it seems to
        # default to /bin/sh since all cmds are run in a python shell which
        # does not have $SHELL set.
        shell = shutil.which('bash')
        use_bash = types.ShellCmdSpec(cmd=f'export PARALLEL_SHELL={shell}',
                                      shell=True,
                                      wait=True)

        log = pathlib.Path(exec_opts['scratch_dir'], "parallel.log")
        cmdfile = exec_opts['cmdfile_stem_path'] + exec_opts['cmdfile_ext']
        parallel = (f'parallel {resume} '
                    f'--jobs {exec_opts["n_jobs"]} '
                    f'--results {exec_opts["scratch_dir"]} '
                    f'--joblog {log} '
                    f'--sshloginfile {nodelist} '
                    f'--workdir {exec_opts["scratch_dir"]} < "{cmdfile}"')

        parallel_spec = types.ShellCmdSpec(cmd=parallel,
                                           shell=True,
                                           wait=True)

        return [unique_nodes, use_bash, parallel_spec]
__api__ = [ 'ParsedCmdlineConfigurer', 'ExpShellCmdsGenerator' ]