Source code for sierra.plugins.hpc.adhoc.plugin

# Copyright 2020 John Harwell, All rights reserved.
#
#  SPDX-License-Identifier: MIT
"""HPC plugin for running experiments with an ad-hoc set of compute nodes.

E.g., whatever computers you happen to have laying around in the lab.

"""

# Core packages
import os
import typing as tp
import argparse
import shutil
import pathlib

# 3rd party packages
import implements

# Project packages
from sierra.core import types, utils
from sierra.core.experiment import bindings


[docs]@implements.implements(bindings.IParsedCmdlineConfigurer) class ParsedCmdlineConfigurer(): """Configure SIERRA for ad-hoc HPC. May use the following environment variables: - ``SIERRA_NODEFILE`` - If this is not defined ``--nodefile`` must be passed. """
[docs] def __init__(self, exec_env: str) -> None: pass
[docs] def __call__(self, args: argparse.Namespace) -> None: if args.nodefile is None: assert 'SIERRA_NODEFILE' in os.environ,\ ("Non-hpc.adhoc environment detected: --nodefile not " "passed and 'SIERRA_NODEFILE' not found") args.nodefile = os.environ['SIERRA_NODEFILE'] assert utils.path_exists(args.nodefile), \ f"SIERRA_NODEFILE '{args.nodefile}' does not exist" assert not args.platform_vc,\ "Platform visual capture not supported on Adhoc"
[docs]@implements.implements(bindings.IExpShellCmdsGenerator) class ExpShellCmdsGenerator(): """Generate the cmd to invoke GNU Parallel in the ad-hoc HPC environment. """
[docs] def __init__(self, cmdopts: types.Cmdopts, exp_num: int) -> None: self.cmdopts = cmdopts
[docs] def pre_exp_cmds(self) -> tp.List[types.ShellCmdSpec]: return []
[docs] def post_exp_cmds(self) -> tp.List[types.ShellCmdSpec]: return []
[docs] def exec_exp_cmds(self, exec_opts: types.StrDict) -> tp.List[types.ShellCmdSpec]: jobid = os.getpid() # Even if we are passed --nodelist, we still make our own copy of it, so # that the user can safely modify it (if they want to) after running # stage 1. nodelist = pathlib.Path(exec_opts['exp_input_root'], f"{jobid}-nodelist.txt") resume = '' # This can't be --resume, because then GNU parallel looks at the results # directory, and if there is stuff in it, (apparently) assumes that the # job finished... if exec_opts['exec_resume']: resume = '--resume-failed' # Make sure there are no duplicate nodes unique_nodes = types.ShellCmdSpec( cmd='sort -u {0} > {1}'.format(exec_opts["nodefile"], nodelist), shell=True, wait=True) # Make sure GNU parallel uses the right shell, because it seems to # defaults to /bin/sh since all cmds are run in a python shell which # does not have $SHELL set. use_bash = types.ShellCmdSpec( cmd='export PARALLEL_SHELL={0}'.format(shutil.which('bash')), shell=True, wait=True, env=True) # GNU parallel cmd parallel = 'parallel {2} ' \ '--jobs {1} ' \ '--results {4} ' \ '--joblog {3} ' \ '--sshloginfile {0} ' \ '--workdir {4} < "{5}"' log = pathlib.Path(exec_opts['scratch_dir'], "parallel.log") parallel = parallel.format(nodelist, exec_opts['n_jobs'], resume, log, exec_opts['scratch_dir'], exec_opts['cmdfile_stem_path'] + exec_opts['cmdfile_ext']) parallel_spec = types.ShellCmdSpec(cmd=parallel, shell=True, wait=True) return [unique_nodes, use_bash, parallel_spec]
__api__ = [ 'ParsedCmdlineConfigurer', 'ExpShellCmdsGenerator', ]