diff --git a/cluster/cluster.py b/cluster/cluster.py index 484e259f3a86adede711c4bc59df68bc4068f212..c5176527fd875e6d1553573bcee5c39c57359af1 100644 --- a/cluster/cluster.py +++ b/cluster/cluster.py @@ -146,6 +146,7 @@ def is_local(hostname: str) -> bool: HOSTNAME) + def start_process( commands: List[str], hostname: Optional[str] = None, @@ -261,11 +262,16 @@ def kill_process(hostname: str, pid: int, signal="TERM"): logging.debug(f"Killing PGID {pid} on {hostname}") if not is_local(hostname): args = ["ssh", hostname, "--", "kill", f"-{signal}", f"-{pid}"] - res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if res.returncode != 0: logging.error( f"error: {res.returncode} {res.stdout.decode().strip()} {res.stderr.decode().strip()}") - return False + logging.info("Retrying SSH kill with shell=True") + res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + if res.returncode != 0: + logging.error( + f"error: {res.returncode} {res.stdout.decode().strip()} {res.stderr.decode().strip()}") + return False else: if signal == "TERM": signal = pysignal.SIGTERM