Source code for schrodinger.test.stu.joberrors

from schrodinger.infra.mmjob import mmjob_is_job_server_job
import os
import zipfile
from schrodinger.utils import subprocess
from schrodinger.utils import mmutil
import pathlib

from schrodinger.job import jobcontrol
from schrodinger.job import remote_command
from schrodinger.job import queue

from . import common

logger = common.logger


def run_postmortem(job, product_name):
    """
    Run postmortem for a job and gather follow-up diagnostics.

    When the job reports a job id, postmortem is run for that job.
    Without a job id, postmortem is run against the job database only
    (``-jobdbonly``), unless the JOB_SERVER feature flag is enabled, in
    which case nothing is run. Output goes to ``postmortem.log`` in the
    job's command directory.

    :type job: schrodinger.job.queue.BaseJob
    :param job: Any queue.BaseJob to investigate.

    :param product_name: Not used by this function.
    """
    job.infoStatus('running postmortem')

    _, jobid, _ = job.getStatusStrings()
    # The status strings use the literal "[none]" when there is no job id.
    if jobid and jobid.strip() == "[none]":
        jobid = None

    workdir = job.getCommandDir()
    if not workdir:
        return

    if jobid:
        target_args = [jobid]
    elif mmutil.feature_flag_is_enabled(mmutil.JOB_SERVER):
        return
    else:
        target_args = ["-jobdbonly"]

    log_path = pathlib.Path(workdir) / 'postmortem.log'
    with open(log_path, 'w') as log_handle:
        # NOTE(review): jobid is None on the -jobdbonly path; confirm that
        # mmjob_is_job_server_job tolerates None.
        if mmjob_is_job_server_job(jobid):
            launcher = ["jsc", "postmortem"]
        else:
            launcher = ["postmortem"]
        completed = subprocess.run(
            launcher + target_args,
            cwd=workdir,
            stderr=subprocess.STDOUT,
            stdout=log_handle)
        if completed.returncode:
            failure = (
                f'postmortem failed with return code: {completed.returncode}')
            # Record the failure in the log itself as well as in the logger.
            log_handle.write(failure + '\n')
            failure += f' while investigating {job}'
            logger.warning(failure)

    # Only a per-job postmortem is unpacked.
    if os.path.isfile(log_path) and jobid:
        unzip_postmortem(log_path)

    collect_queue_stats(job)
def unzip_postmortem(postmortem_log_path):
    """
    Find the postmortem archive named in the log file, then extract it.

    The last line of the log containing ``.zip`` is taken to name the
    archive: the text after the final ``': '`` on that line, with
    surrounding double quotes removed. A relative archive name is resolved
    against the directory holding the log. The archive is extracted into
    its own directory, after which the archive and its sibling ``.files``
    manifest (if present) are deleted.

    If the log names no archive, or the named archive cannot be opened or
    extracted, a warning is logged and nothing is deleted.

    :type postmortem_log_path: str or pathlib.Path
    :param postmortem_log_path: Path to the log written by postmortem.
    """
    # Accept plain strings as well as Path objects; the relative-archive
    # branch below needs Path semantics.
    log_path = pathlib.Path(postmortem_log_path)

    # The path to the postmortem zip should be at the end of the log, so
    # keep the last line that mentions a .zip file.
    archive_name = None
    with open(log_path) as log_file:
        for line in log_file:
            if '.zip' in line:
                candidate = line.strip().strip('"').split(': ')[-1]
                # Quotes may wrap only the path rather than the whole line.
                archive_name = candidate.strip().strip('"')

    if not archive_name:
        logger.warning('Postmortem file not found. See log at {}'.format(
            postmortem_log_path))
        return

    archive_path = pathlib.Path(archive_name)
    if not archive_path.is_absolute():
        archive_path = log_path.parent / archive_path

    try:
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(archive_path.parent)
    except (OSError, zipfile.BadZipFile) as err:
        # Best-effort diagnostics: keep whatever is on disk and report.
        logger.warning(
            'Could not extract postmortem archive {}: {}. See log at {}'.format(
                archive_path, err, postmortem_log_path))
        return

    # Remove zip archive and file manifest
    os.remove(archive_path)
    manifest_path = pathlib.Path(
        os.path.splitext(archive_path)[0] + '.files')
    if manifest_path.exists():
        os.remove(manifest_path)
def collect_queue_stats(job):
    """
    Run remote queue commands to gather stats for debugging a killed queue
    job.

    Does nothing unless `is_killed_queue_job` accepts the job. Otherwise
    each diagnostic command is run through a login bash shell on the job's
    submission host, with its stdout written to a log file in the job's
    command directory (``qstat.log`` and ``clusutil.log``). A warning is
    logged for any command that exits with a nonzero status.

    :type job: schrodinger.job.queue.JobControlJob
    :param job: The job whose queue stats should be collected.
    """
    if not is_killed_queue_job(job):
        return

    job_record = job.getJob()
    output_dir = job.getCommandDir()
    shell_template = "bash --login -c '{}'"

    host = jobcontrol.get_host(job_record.HostEntry).host
    user = jobcontrol.get_host(job_record.HostEntry).user
    remote_prefix = remote_command._rsh_cmd(host, remoteuser=user)

    clusutil_cmd = 'perl /nfs/working/sysmgr/sysmgr-repo/scripts/clustutil.pl -u -a'
    # (log file name, shell payload) for each diagnostic, in run order.
    diagnostics = (
        ('qstat.log', 'date && echo Running Cmd: qstat && qstat'),
        ('clusutil.log', 'date && echo Running Cmd: {} && {}'.format(
            clusutil_cmd, clusutil_cmd)),
    )

    for log_name, payload in diagnostics:
        with open(os.path.join(output_dir, log_name), 'w') as sink:
            bash_command = shell_template.format(payload)
            outcome = subprocess.run(remote_prefix + [bash_command],
                                     stdout=sink)
            if outcome.returncode != 0:
                logger.warning(f"{bash_command} exited abnormally")
def is_killed_queue_job(job):
    """
    Return True if the job was killed on the queueing system.

    :type job: schrodinger.job.queue.JobControlJob
    :param job: The job to inspect.

    :rtype: bool
    :return: True only when the job is a `queue.JobControlJob` with a job
        object that reports itself as queued and whose exit status is
        ``'killed'``.
    """
    # Subprocess jobs don't submit to the queue.
    if not isinstance(job, queue.JobControlJob):
        return False

    job_record = job.getJob()
    # No job object means the job failed to launch.
    if not job_record:
        return False

    # Must be a queued job that ended up killed.
    return bool(job_record.isQueued() and job.exit_status == 'killed')