"""check_arcce_monitor - Fetches ARC jobs and posts results."""

import asyncio
import os
import random
import re
import time
from typing import Optional, Tuple
from arcnagios import arcutils, nagutils, utils
from arcnagios.arcutils import Arcstat
from arcnagios.arcclients import arcget, arcstat
from arcnagios.nagutils import \
        ServiceReport, ServiceOk, ServiceUnknown, OK, WARNING, CRITICAL, \
        NagiosPerflogTimer
from arcnagios.utils import nth, counted_adjectives, host_of_uri
from arcnagios.ce.jobutils import JobInfo, JobNagiosPlugin

# Translation table for str.translate() which deletes the characters
# ' = [ ] , : from a middleware-specific job state before it is embedded in a
# performance data label.
_SPECIFIC_STATE_TR = str.maketrans('', '', "'=[],:")

# Lines of arcget output which are not considered a problem: empty lines, the
# notice about where results were stored, and the warning that some jobs were
# not removed.
_ARCGET_HARMLESS_RE = re.compile('|'.join([
    r'$',
    r'Results stored at:.*',
    r'Warning: Some jobs were not removed.*',
]))

def check_arcget_output(output: Optional[str]) -> Tuple[bool, bool]:
    """Classify the textual output of an arcget invocation.

    Every line is inspected, so a hint to run arcclean is detected even when
    it follows a line which is not recognised as harmless.

    :param output: The captured arcget output.  `None` is treated like an
        empty string.
    :returns: A pair `(is_harmless, need_arcclean)`.  `is_harmless` is true
        if every line is either an arcclean hint or matches one of the known
        harmless patterns.  `need_arcclean` is true if any line starts with
        'Use arcclean to remove'.
    """
    is_harmless = True
    need_arcclean = False
    for line in (output or '').split('\n'):
        line = line.strip()
        if line.startswith('Use arcclean to remove'):
            need_arcclean = True
        elif not _ARCGET_HARMLESS_RE.match(line):
            # Keep scanning: an arcclean hint may still follow this line.
            is_harmless = False
    return is_harmless, need_arcclean

class JobActionStats:
    """Counters describing what was done to the jobs in one monitoring run."""

    def __init__(self):
        # All counters start at zero and are incremented by the code which
        # processes the individual jobs.
        self.error_count = 0      # fetches where checking did not go well
        self.discarded_count = 0  # jobs which were given up on
        self.unseen_count = 0     # jobs absent from the arcstat listing
        self.running_count = 0    # jobs not yet in a final state
        self.cleaned_count = 0    # jobs fetched and cleaned up
        self.retry_count = 0      # jobs scheduled for another fetch attempt
        self.postponed_count = 0  # jobs skipped because time ran out

    def load(self) -> float:
        """Return the ratio of postponed jobs to finished-off jobs.

        The denominator is the number of discarded plus cleaned jobs, plus
        one so that the ratio is defined when nothing was processed.
        """
        processed = self.discarded_count + self.cleaned_count
        return self.postponed_count / float(processed + 1)

class Check_arcce_monitor(JobNagiosPlugin):
    def __init__(self):
        """Initialise the plugin and register its command-line options.

        The start time is recorded so that `time_left` can relate the
        elapsed time to the `--timeout` option.  The numeric limits
        `--max-sysinfo-lag`, `--max-check-attempts` and
        `--max-fetch-attempts` are converted by the argument parser, so that
        values given on the command line compare correctly with the numbers
        they are checked against.
        """
        JobNagiosPlugin.__init__(self)
        self.start_time = time.time()
        argp = self.argparser
        argp.add_argument('--ce', dest = 'ces',
                default = [], action = 'append',
                metavar = 'CE',
                help = 'Pass one or more times to restrict monitoring '
                       'to the given CEs.')
        argp.add_argument('--job-tag', dest = 'job_tags',
                default = [], action = 'append',
                metavar = 'TAG',
                help = 'Pass one or more times to restrict monitoring '
                       'to the given job tags. Use "default" to match '
                       'jobs started without a --job-tag option.')
        argp.add_argument('--termination-service', dest = 'termination_service',
                default = 'ARCCE Job Termination',
                help = 'Default service to submit result to if not specified '
                       'when submitting the job. '
                       'Deprecated: Should be passed on submission.')
        # Compared against a time difference in seconds, so it must be a
        # number even when given on the command line.
        argp.add_argument('--max-sysinfo-lag', dest = 'max_infosys_lag',
                type = float, default = 3600.0, metavar = 'T',
                help = 'The maximum time to wait for a job to turn up in '
                       'the arcstat listing before abandoning it.')
        # The attempt limits are compared against integer counters.
        argp.add_argument('--max-check-attempts', dest = 'max_check_attempts',
                type = int, default = 12, metavar = 'N',
                help = 'The maximum number of consecutive times a job in '
                       'post-SUBMITTED state is absent from arcstat listing '
                       'before it is abandoned.')
        argp.add_argument('--max-fetch-attempts', dest = 'max_fetch_attempts',
                type = int, default = 8, metavar = 'N',
                help = 'The maximum number of attempts to fetch a job before '
                       'abandoning it.')
        argp.add_argument('--keep-failed-jobdata', dest = 'keep_failed_jobdata',
                action = 'store_true', default = False,
                help = 'Keep the job descriptions and output directories for '
                       'failed jobs. These will not be removed automatically.')
        argp.add_argument('--keep-all-jobdata', dest = 'keep_all_jobdata',
                action = 'store_true', default = False,
                help = 'Keep the job descriptions and output directories for '
                       'all jobs. These will not be removed automatically.')
        argp.add_argument('--timeout', dest = 'timeout',
                type = int, default = 110,
                help = 'Approximate overall timeout.')
        argp.add_argument('--arcstat-timeout', dest = 'arcstat_timeout',
                type = int, default = 5, metavar = 'T',
                help = 'Passed to arcstat --timeout.')
        argp.add_argument('--arcget-timeout', dest = 'arcget_timeout',
                type = int, default = 10,
                help = 'Maximum value to pass to arcget --timeout.')
        argp.add_argument('--show-arcget-output-here',
                dest = 'show_arcget_output_here',
                action = 'store_true', default = False,
                help = 'Include output from failed arcget in the this service '
                       'in addition to posting it to the termination service.')
        argp.add_argument('-c', dest = 'critical_load',
                type = float, default = 20,
                help = 'Ratio of remaining work to processed work above which \
                        to issue a critical alert.')
        argp.add_argument('-w', dest = 'warning_load',
                type = float, default = 10,
                help = 'Ratio of remaining work to processed work above which \
                        to issue a warning alert.')

    def time_left(self) -> float:
        return self.opts.timeout - time.time() + self.start_time

    def parse_args(self, args) -> None:
        """Parse ARCCE-specific command-line options.

        The work is delegated to `JobNagiosPlugin.parse_args`; no further
        processing of the options is done here.

        :param args: The arguments, passed unchanged to the base class.
        """

        JobNagiosPlugin.parse_args(self, args)

    def _clean_output_dir(self, output_dir: str) -> None:
        conflict = '.conflict-%d' % int(time.time())
        for entry in os.listdir(output_dir):
            if not '.conflict-' in entry:
                subdir = os.path.join(output_dir, entry)
                self.log.warning('Moving away partially fetched output %s.',
                                 subdir)
                os.rename(subdir, subdir + conflict)

    def prepare_top_output_dir(self, jobinfo: JobInfo) -> str:
        """Return the directory under which the output of `jobinfo` is stored.

        The path is the `JOB_OUTPUT_DIRNAME` entry of the work directory for
        the job's host and tag.  If it already exists, its current content is
        first moved out of the way by `_clean_output_dir`.  The directory is
        not created here.
        """
        job_output_dir = os.path.join(
            self.workdir_for(jobinfo.host, jobinfo.job_tag),
            self.JOB_OUTPUT_DIRNAME)
        if not os.path.exists(job_output_dir):
            return job_output_dir
        self._clean_output_dir(job_output_dir)
        return job_output_dir

    def locate_output_dir(self, top_output_dir: str) -> Optional[str]:
        for subdir in os.listdir(top_output_dir):
            if not subdir in ['.', '..'] or '.conflict-' in subdir:
                return os.path.join(top_output_dir, subdir)
        return None

    async def fetch_job(self, jobinfo: JobInfo, job_error: Optional[str] =None):
        """Fetch the job described by `jobinfo : JobInfo`, submit passive
        results, and return a tuple `(did_fetch, check_ok, status_code)`, where

            `did_fetch`   indicates whether the job was fetched,
            `check_ok`    indicates whether checking went well, and
            `status_code` is the overall Nagios status reported to the passive
                          services for this job.

        The termination report goes to the service recorded in `jobinfo`,
        falling back to the `--termination-service` option.  If `job_error`
        is given, it is logged both here and in the termination report.
        """

        service_name = jobinfo.termination_service \
                    or self.opts.termination_service
        termination_report = self.nagios_report_for(jobinfo.host, service_name)
        termination_report.update_status(nagutils.OK, 'Job succeeded.')

        # Report the final job state if the job failed.
        if jobinfo.job_state != arcutils.J_FINISHED:
            termination_report.update_status(nagutils.CRITICAL,
                    'Job terminated as %s.' % jobinfo.job_state)

        if job_error:
            self.log.error(job_error)
            termination_report.log.error(job_error)

        # Try to fetch the job. Exit if no files were fetched.
        self.log.info('Fetching job %s in terminal state %s.',
                      jobinfo.job_id, jobinfo.job_state)
        top_output_dir = self.prepare_top_output_dir(jobinfo)
        async with self._arcclients_semaphore:
            with NagiosPerflogTimer(
                    self.perflog, "arcget_time", host_of_uri(jobinfo.job_id)):
                # Never give arcget more time than what is left overall.
                timeout = min(self.opts.arcget_timeout, self.time_left())
                arcget_result = await arcget(
                        jobinfo.job_id, top_output_dir = top_output_dir,
                        timeout = timeout, log = self.log)
        job_output_dir = self.locate_output_dir(top_output_dir)
        if job_output_dir is None:
            if arcget_result.is_ok():
                if termination_report.status_code == nagutils.OK:
                    self.log.error('Subdirectory from arcget not found, it '
                                   'should have been under %s.', top_output_dir)
                    termination_report.update_status(nagutils.UNKNOWN,
                            'Output directory from arcget not found.')
                termination_report.log.error('JID: %s', jobinfo.job_id)
                did_fetch = True
                # NOTE(review): if update_status() raises the status as its
                # name suggests, the status is never OK at this point, which
                # would make this always true -- confirm the intention.
                ok_check = termination_report.status_code != nagutils.OK
            else:
                self.log.error('Failed to fetch %s.', jobinfo.job_id)
                termination_report.update_status(nagutils.WARNING,
                        'Failed to fetch job.')
                if arcget_result.get_error().output:
                    details = 'Output from arcget:\n%s' \
                            % arcget_result.get_error().output
                    # The option is registered with
                    # dest = 'show_arcget_output_here'.
                    if self.opts.show_arcget_output_here:
                        self.log.error(details)
                    termination_report.log.error(details)
                termination_report.log.error('JID: %s', jobinfo.job_id)
                did_fetch = False
                ok_check = True
            return (did_fetch, ok_check, termination_report.status_code)

        # Check if arcget returned non-zero despite having fetched something.
        if arcget_result.is_error():
            is_harmless, need_arcclean \
                    = check_arcget_output(arcget_result.get_error().output)
            if need_arcclean:
                termination_report.log.warning('Separate arcclean needed.')
                await self.cleaner.call('arcclean', jobinfo.job_id)
            if not is_harmless:
                termination_report.update_status(nagutils.WARNING,
                    '%s: %s' % (jobinfo.job_id, arcget_result))

        # For a failed job, pass on its stderr.txt instead of running tests.
        if jobinfo.job_state != arcutils.J_FINISHED:
            errors = \
                utils.file_contents(os.path.join(job_output_dir, 'stderr.txt'))
            if errors is not None and errors.strip() != '':
                self.log.error('stderr.txt for %s:', jobinfo.job_id)
                for line in errors.strip().split('\n'):
                    self.log.error('.. %s', line)
                details = 'stderr.txt:\n%s' % errors
                termination_report.log.error(details)
            termination_report.log.error('JID: %s', jobinfo.job_id)
            return (True, True, termination_report.status_code)

        # Run check and publish results from job tests.
        termination_report.log.info('JID: %s', jobinfo.job_id)
        status_code = termination_report.status_code
        for test_name in jobinfo.tests:
            test = self.load_jobtest(test_name, hostname = jobinfo.host)
            # A test without its own service reports to this plugin's report.
            if test.service_description:
                report = self.nagios_report_for(jobinfo.host,
                                                test.service_description)
            else:
                report = self.nagios_report
            test.check(report, job_output_dir, jobinfo.stored_urls)
            # The overall status is the highest status seen.
            if report.status_code > status_code:
                status_code = report.status_code

        if status_code != nagutils.OK:
            termination_report.log.error('JID: %s', jobinfo.job_id)
        return (True, True, status_code)

    def check_job_progress(self, jobinfo: JobInfo) -> None:
        """Report to the progress service if the job is stuck in its state.

        Nothing is done unless the job has both a state time and a progress
        service.  Thresholds are read from the 'arcce.job-states'
        configuration section, where the option for the specific state is
        preferred over the one for the generic state name.  The option value
        is a whitespace-separated list of `key:value` items; the keys 'c' and
        'w' give the age in seconds after which the state is considered
        critical or warning, respectively.  A report is only submitted when
        the resulting status differs from `jobinfo.job_state_alert`, which is
        then updated.
        """
        if jobinfo.job_state_time is None or jobinfo.progress_service is None:
            return

        thresholds = {}
        for state_key in (jobinfo.job_specific_state, jobinfo.job_state.name):
            if not state_key:
                continue
            option = str(state_key)
            if not self.config.has_option('arcce.job-states', option):
                continue
            spec = self.config.get('arcce.job-states', option)
            thresholds = {
                key: value
                for key, value in (
                    item.split(':', 1) for item in spec.split() if ':' in item)
            }
            break

        age = time.time() - jobinfo.job_state_time

        def exceeded(key):
            # The threshold is only converted if it is configured.
            return key in thresholds and age > int(thresholds[key])

        if exceeded('c'):
            status, is_stuck = nagutils.CRITICAL, True
        elif exceeded('w'):
            status, is_stuck = nagutils.WARNING, True
        else:
            status, is_stuck = nagutils.OK, False

        if is_stuck:
            msg = 'Stuck in state %s (%s).' \
                % (jobinfo.job_specific_state, jobinfo.job_state.name)
        else:
            msg = 'Normal progress.'

        # This also triggers in the initial case when jobinfo.job_state_alert is
        # None, to clear any lingering alerts.
        if status != jobinfo.job_state_alert:
            progress_report = \
                self.nagios_report_for(jobinfo.host, jobinfo.progress_service)
            progress_report.update_status(status, msg)
            jobinfo.job_state_alert = status

    def report_job_state_time(self, jobinfo: JobInfo, t_now: float) -> None:
        """Add performance data for the time the job spent in its state.

        Does nothing unless the `granular_perfdata` option is set.  The value
        is `t_now` minus `jobinfo.job_state_time`, in seconds.  The label is
        indexed by host, job state and, if present, the specific job state
        with the characters in `_SPECIFIC_STATE_TR` removed.
        """
        if not self.opts.granular_perfdata:
            return
        assert jobinfo.job_state_time
        elapsed = t_now - jobinfo.job_state_time

        indices = ['host:%s' % jobinfo.host]
        indices.append('arc_internal_job_state:%s' % jobinfo.job_state)
        if jobinfo.job_specific_state is not None:
            cleaned = \
                str(jobinfo.job_specific_state).translate(_SPECIFIC_STATE_TR)
            indices.append('arc_middleware_job_state:%s' % cleaned)

        self.add_perfdata(
            'arc_job_state_time[%s]' % ','.join(indices),
            elapsed, uom = 's', limit_min = 0)

    async def process_missing_job(
            self, jobinfo: JobInfo, job_action_stats: JobActionStats
        ) -> None:
        """Handle a job which is absent from the arcstat listing.

        A job missing from the arcstat output can happen
          a) right after submission before it becomes available,
          b) temporarily if the CE infosys is unavailable, or
          c) if the job has been permanently removed.

        The matching counter in `job_action_stats` is incremented for the
        action taken.
        """
        jobinfo.check_attempts = jobinfo.check_attempts or 0

        # We hope it's case a) and give it more time.
        if jobinfo.job_state == arcutils.J_NOT_SEEN:
            time_since_submission = time.time() - jobinfo.submission_time
            if time_since_submission < self.opts.max_infosys_lag:
                self.log.info('Job %s of kind %s on %s not found yet.',
                              jobinfo.job_id, jobinfo.job_tag, jobinfo.host)
                job_action_stats.unseen_count += 1
                return

        # We hope it's case a) or b) and make a fixed number of attempts.
        if jobinfo.check_attempts < self.opts.max_check_attempts:
            jobinfo.check_attempts += 1
            self.log.info('Job %s of kind %s on %s missing for '
                          'the %s time in state %s, still checking.',
                          jobinfo.job_id, jobinfo.job_tag, jobinfo.host,
                          nth(jobinfo.check_attempts), jobinfo.job_state)
            self.save_active_job(jobinfo, jobinfo.host, jobinfo.job_tag)
            job_action_stats.unseen_count += 1
            return

        # Out of time, leave the job for a later run.
        if self.time_left() < 1:
            job_action_stats.postponed_count += 1
            return

        # We give up, assuming c) the job has been removed, but discard_job
        # schedules repeated attempts to remove the job and any staged files
        # while new jobs are run.
        self.log.info('Job %s of kind %s on %s disappeared in '
                      'state %s, removing active job info.',
                      jobinfo.job_id, jobinfo.job_tag, jobinfo.host,
                      jobinfo.job_state)
        await self.discard_job(jobinfo,
                archive = self.opts.keep_failed_jobdata)
        job_action_stats.discarded_count += 1

    async def process_found_job(
            self, jobinfo: JobInfo, jobstat: Arcstat,
            job_action_stats: JobActionStats
        ) -> None:
        """Handle a job which is present in the arcstat listing.

        The stored job state is updated from `jobstat` and the progress check
        is run.  A job which is not in a final state is saved for the next
        run.  A job in a final state is fetched, unless time has run out, and
        is then cleaned up, scheduled for another fetch attempt, or discarded.
        The matching counters in `job_action_stats` are incremented.
        """
        jobinfo.check_attempts = 0
        self.log.debug('Checking job on %s.', jobinfo.host)

        # Update job data.
        now = time.time()
        state_changed = jobinfo.job_state != jobstat.state \
                or jobinfo.job_specific_state != jobstat.specific_state
        if state_changed:
            # Report the time spent in the state which is being left.
            self.report_job_state_time(jobinfo, now)
            jobinfo.job_state = jobstat.state
            jobinfo.job_specific_state = jobstat.specific_state
            jobinfo.job_state_time = now
        jobinfo.check_time = now

        self.check_job_progress(jobinfo)

        # Still on its way, look at it again in the next run.
        if not jobinfo.job_state.is_final():
            self.save_active_job(jobinfo, jobinfo.host, jobinfo.job_tag)
            job_action_stats.running_count += 1
            return

        # Out of time, leave the fetching for a later run.
        if self.time_left() < 1:
            job_action_stats.postponed_count += 1
            return

        did_fetch, ok_check, passive_status_code = \
            await self.fetch_job(jobinfo, jobstat.job_error)
        if not ok_check:
            job_action_stats.error_count += 1

        job_failed = passive_status_code != nagutils.OK
        archive = (self.opts.keep_failed_jobdata and job_failed) \
               or self.opts.keep_all_jobdata

        if did_fetch:
            ok_rep = passive_status_code == nagutils.OK
            rep_label = 'good' if ok_rep else 'bad'
            for dist_name, choice_name in \
                    (jobinfo.reputation_choices or {}).items():
                self.log.debug('Reputation for %s choice %s is %s.',
                        dist_name, choice_name, rep_label)
                self._reputation_tracker.submit(
                        dist_name, choice_name, ok_rep)
            await self.cleanup_job(jobinfo, archive = archive)
            job_action_stats.cleaned_count += 1
            return

        attempts_so_far = jobinfo.fetch_attempts or 0
        if attempts_so_far < self.opts.max_fetch_attempts:
            jobinfo.fetch_attempts = attempts_so_far + 1
            self.log.info('Will retry fetching %s.', jobinfo.job_id)
            self.save_active_job(jobinfo, jobinfo.host, jobinfo.job_tag)
            job_action_stats.retry_count += 1
            return

        self.log.warning('Giving up on fetching %s.', jobinfo.job_id)
        await self.discard_job(jobinfo, archive = archive)
        job_action_stats.discarded_count += 1

    async def _check_async(self) -> ServiceReport:
        """Monitor submitted jobs.

        The active jobs are collected from the subdirectories of the top work
        directory, restricted by the `--ce` and `--job-tag` options, and
        their status is queried with a single arcstat call.  Each job is then
        processed concurrently, depending on whether it was found in the
        listing.

        Returns `ServiceOk` if the work directory does not exist or there are
        no jobs to query, `ServiceUnknown` if arcstat failed, and otherwise a
        `ServiceReport` which summarises the actions taken.
        """
        if not os.path.exists(self.top_workdir):
            self.log.info('The work directory is %s.', self.top_workdir)
            return ServiceOk('No jobs to monitor since the working directory '
                             'has not yet been created.')
        self.require_voms_proxy()

        # None means no restriction.
        ces = None if self.opts.ces == [] else set(self.opts.ces)
        job_tags = \
            None if self.opts.job_tags == [] else set(self.opts.job_tags)

        # Collect the list of active jobs.  The subdirectories are named
        # either HOST or HOST#TAG.
        jobinfo_by_id = {}
        for svc_dir in os.listdir(self.top_workdir):
            if not os.path.isdir(os.path.join(self.top_workdir, svc_dir)):
                continue
            if '#' in svc_dir:
                host, job_tag = svc_dir.split('#', 1)
            else:
                host, job_tag = svc_dir, None
            if ces is not None and host not in ces:
                continue
            if job_tags is not None \
                    and (job_tag or 'default') not in job_tags:
                continue

            jobinfo = self.load_active_job(host, job_tag)
            if jobinfo is None:
                self.log.debug('No active job info for %s.', host)
                self.cleanup_job_files(
                    host, job_tag, archive=self.opts.keep_failed_jobdata)
                continue
            jobinfo.host = host
            jobinfo.job_tag = job_tag
            jobinfo_by_id[jobinfo.job_id] = jobinfo

        # The jobs are handled in random order.
        query_jobids = [info.job_id for info in jobinfo_by_id.values()]
        random.shuffle(query_jobids)
        if query_jobids == []:
            return ServiceOk(
                'No jobs to query, found %d in terminal states.'
                % len(jobinfo_by_id))

        # Obtains information from CEs about the active jobs.
        self.log.debug('Querying job IDs %s', ', '.join(query_jobids))
        arcstat_result = await arcstat(query_jobids,
                timeout = self.opts.arcstat_timeout, log = self.log)
        if arcstat_result.is_error():
            return ServiceUnknown(
                "Failed to query status of jobs: %s"
                % arcstat_result.get_error())
        arcstat_response = arcstat_result.get()
        self.log.info('Queried %d jobs, found %d.',
                      len(query_jobids), len(arcstat_response.jobs))

        # Process jobs.
        job_action_stats = JobActionStats()
        tasks = []
        for jobid in query_jobids:
            jobinfo = jobinfo_by_id[jobid]
            jobstat = arcstat_response.jobs.get(jobid)
            if jobstat:
                handler = \
                    self.process_found_job(jobinfo, jobstat, job_action_stats)
            else:
                handler = self.process_missing_job(jobinfo, job_action_stats)
            tasks.append(asyncio.ensure_future(handler))
        if tasks:
            await asyncio.gather(*tasks)
        error_count = job_action_stats.error_count

        # Summary and report.
        status_code = OK

        counts = [
            (job_action_stats.discarded_count, "discarded"),
            (job_action_stats.unseen_count, "unseen"),
            (job_action_stats.running_count, "running"),
            (job_action_stats.cleaned_count, "cleaned"),
            (job_action_stats.retry_count, "to retry"),
            (job_action_stats.postponed_count, "postponed"),
        ]
        msg = counted_adjectives(counts, if_empty = 'Nothing to do')

        if error_count > 0:
            msg += ', %d errors' % error_count
            status_code = CRITICAL

        load = job_action_stats.load()
        if load > self.opts.critical_load:
            msg += ', critical load!'
            status_code = CRITICAL
        elif load > self.opts.warning_load:
            msg += ', high load!'
            status_code = max(status_code, WARNING)
        else:
            msg += '.'

        self.log.info('')
        self.log.info('Summary:')
        ordered_jobinfos = list(jobinfo_by_id.values())
        ordered_jobinfos.sort()
        for jobinfo in ordered_jobinfos:
            self.log.info('- %s: %s', jobinfo.host_and_tag, jobinfo.job_state)

        return ServiceReport(status_code, msg)

    def check(self):
        return asyncio.run(self._check_async())
