#!/bin/env python
#
# check_js - job submission/wn testing based on jess
#
# Author: Marian Babik
# Copyright CERN 2017
#
from __future__ import print_function
import os
import sys
import time
import logging
import shutil
import re
import tarfile
import json
import datetime
import traceback
import io as stream_io
from distutils.file_util import copy_file
from distutils.dir_util import copy_tree

import jess
import jess.api
import jess.jobs

import nap.core

# Module-level logger shared with the jess library.
log = logging.getLogger('jess')

# Plugin definition: nap.core wires argument parsing, metric registration
# and nagios-style output around the metric functions decorated below.
app = nap.core.Plugin(description='This plugin tests grid job submission with configurable payload.')
# --- generic probe options ---
app.add_argument('--namespace', help='Metric prefix', dest='namespace', default='org.sam')
app.add_argument('-m', '--metric', help='Name of a metric to be collected.')
app.add_argument('-v', '--verbose', help='Verbosity.')
# --- credentials (at least one of proxy/token is required at runtime) ---
app.add_argument('-x', '--proxy', help='VOMS Proxy used for submitting the job')
app.add_argument('-T', '--token', help='Token to use for submitting the job')
# --- payload / job definition ---
app.add_argument('--executable', help='Script/binary to execute on the WN',
                 default='/usr/libexec/grid-monitoring/probes/emi.cream/wnjob_wlcg/nagrun.sh')
app.add_argument('--exec-args', dest='exec_args', help='Command line arguments to be passed on to the executable')
app.add_argument('--job-schedule', default=60, help='Interval (in minutes) to submit new job if previous one has '
                                                    'already finished')
app.add_argument('--vo', help='Virtual Organization.', required=True)
app.add_argument('--vo-fqan', help='VOMS primary attribute as FQAN. If given, will be used along with --vo',
                 default='')
app.add_argument('--work-dir', help='Working directory for storing job meta-data, logs, output, etc.',
                 default='/var/lib/gridprobes')
app.add_argument('--web-dir', help='Web directory for storing view of the job meta-data, logs, output, etc.')
# --- submission backend selection and CE resource ---
app.add_argument('--backend', help='Job submission backend to be used, options are scondor, condor, cream, arc',
                 required=True)
app.add_argument('--pool', help='Job submission to remote HTCondor master')
app.add_argument('--schedd', help='Job submission to remote HTCondor schedd (needs --pool)')
app.add_argument('--resource',
                 help='CE to send job to. Format : <type>://<host>[:<port>]/<schedd>/<lrms-system-name>/<queue-name> '
                      'If not given - resource discovery via BDII will be performed.')
app.add_argument('--jdl-ads', help='Classads to add to the JDL')
app.add_argument('--ldap-uri', help='Format [ldap://]hostname[:port[/]]')
# --- WN tarball content controls ---
app.add_argument('--zero-payload', help='Generate zero bytes file as payload/tarball and pass it as input',
                 action='store_true', default=False)
app.add_argument('--add-payload',
                 help='Comma-separated list of top level directories with Nagios compliant directories structure to '
                      'be added to tarball to be sent to WN.')
app.add_argument('--add-wntar-nag-nosam',
                 help='Do not include standard SAM WN probes and their Nagios config to WN tarball.',
                 action='store_true', default=False)
app.add_argument('--add-wntar-nag-nosamcfg',
                 help='Do not include Nagios configuration for SAM WN probes to WN tarball. The probes themselves and '
                      'respective Python packages, however, will be included.',
                 action='store_true', default=False)
app.add_argument('--timeout-limits', help='Comma separated list of timeouts in minutes per job status.'
                                          ' Also support global timeout, e.g. global:3600,idle:3000,running:15')
app.add_argument('-e', '--env', dest='env_var', help='Environment variable to set on the worker node', action='append')
app.add_argument('--env-file', dest='env_file', help='Environment file to be transferred to the worker node')
# --- ARC backend specific options ---
app.add_argument('--arc-debug', dest='arc_debug', default='INFO',
                 help='ARC backend: arcsub debug flag (defaults to INFO)')
app.add_argument('--arc-gmlog', action='store_true', default=False,
                 help='ARC backend: request gmlog')
app.add_argument('--arc-rsl', dest='arc_rsl',
                 help='ARC backend: add-ons for nordugrid_rsl')
app.add_argument('--arc-ce', dest='arc_ce',
                 help='ARC backend: arcsub computing element endpoint (arc6 client only)')
app.add_argument('--arc-sub-type', dest='arc_sub_type',
                 help='ARC backend: arcsub submission endpoint type (arc6 client only)')
# NOTE(review): help text below is missing its closing parenthesis — cosmetic typo
app.add_argument('--arc-info-type', dest='arc_info_type',
                 help='ARC backend: arcsub information endpoint type (arc6 client only')
app.add_argument('--arc-registry', dest='arc_registry',
                 help='ARC backend: arcsub registry (arc6 client only)')
# --- ETF WN-qFM (worker-node framework) options ---
app.add_argument('--wnfm-config', dest='wnfm_config', help='ETF WN qFM: configuration file (json)')
app.add_argument('--wnfm-static', dest='wnfm_static',
                 help='ETF WN qFM: Path to the statically compiled version of ETF WN qFM')
app.add_argument('--wnfm-pool', default=2, type=int,
                 help='ETF WN qFM:  number of threads to run on WN (tests concurrency)')
app.add_argument('--wnfm-global-timeout', default=600, type=int,
                 help='ETF WN qFM:  global timeout (to run all tests, in seconds)')
app.add_argument('--wnfm-test-timeout', default=550, type=int,
                 help='ETF WN qFM:  test timeout (to run a single test, in seconds)')


def jdl_factory(wd, out_dir, args):
    """Build a backend-specific job description (JDL) for the probe job.

    :param wd: working directory holding the payload tarball and executable
    :param out_dir: directory where the job's stdout/stderr/log are written
    :param args: parsed command-line arguments (backend, resource, proxy, ...)
    :return: a jess.api.JDL instance, or False for an unsupported backend
    """
    if args.backend == 'arc':
        gr, rsl = __get_resource(args)
        jdl = jess.api.JDL({'executable': os.path.basename(args.executable),
                            'stdout': 'arc.out',
                            'join': 'yes',
                            })
        if args.arc_gmlog:
            jdl.update({'gmlog': 'gmlog'})
        if not args.zero_payload:
            jdl.update({'arguments': args.exec_args,
                        'inputFiles': ['gridjob.tgz'],
                        'outputFiles': ['wnlogs.tgz']})
        if rsl:
            # todo: fix this after __get_resource refactoring
            # strip the (key=value) wrapping and shorten the 1800s limits to 30
            arc_rsl = rsl['nordugrid_rsl'].replace('(', '').replace(')', ' ').replace('1800', '30')
            arc_rsl = arc_rsl.strip().split(' ')
            jdl.update(dict([i.split('=') for i in arc_rsl]))
        return jdl
    elif args.backend == 'cream':
        jdl = jess.api.JDL({'Type': 'Job', 'JobType': 'Normal', 'Executable': os.path.basename(args.executable),
                            'StdOutput': 'cream.out',
                            'StdError': 'cream.out',
                            'InputSandBox': ['{}/{}'.format(wd, os.path.basename(args.executable)),
                                             '{}/gridjob.tgz'.format(wd)],
                            'OutputSandBox': ['cream.out', 'wnlogs.tgz'],
                            'OutputSandBoxBaseDestUri': 'gsiftp://localhost'
                            })
        if not args.zero_payload:
            jdl.update({'Arguments': args.exec_args})
        return jdl
    elif 'condor' in args.backend and 'dodas:' in args.resource:
        # direct HTCondor submission to a DODAS resource (vanilla-style submit)
        jdl = jess.api.JDL([('executable', os.path.basename(args.executable)),
                            ('transfer_executable', 'true'),
                            ('output', '{}/gridjob.out'.format(out_dir)),
                            ('error', '{}/gridjob.err'.format(out_dir)),
                            ('log', '{}/gridjob.log'.format(out_dir)), ('log_xml', 'true'),
                            ('when_to_transfer_output', 'ON_EXIT'),
                            ('leave_in_queue', '(JobStatus != 5)')])
        if args.proxy:
            jdl.update([('use_x509userproxy', 'true')])
        if args.token:
            jdl.update([('+SciTokensFile', "\"{}\"".format(args.token))])
        if not args.zero_payload:
            jdl.update([('arguments', args.exec_args),
                        ('transfer_input_files', ['{}/wnlogs.tgz'.format(wd), '{}/gridjob.tgz'.format(wd)]),
                        ('transfer_output_files', 'wnlogs.tgz'),
                        ('transfer_output_remaps', '\"wnlogs.tgz={}/wnlogs.tgz\"'.format(out_dir)),
                        ('should_transfer_files', 'YES')])
        # create an empty wnlogs.tgz placeholder in the working dir
        open(os.path.join(wd, 'wnlogs.tgz'), 'w').close()
        return jdl
    elif 'condor' in args.backend and 'condor-ce:' in args.resource:
        # vanilla-universe submission through an HTCondor-CE
        jdl = jess.api.JDL([('universe', 'vanilla'), ('executable', os.path.basename(args.executable)),
                            ('transfer_executable', 'true'),
                            ('output', '{}/gridjob.out'.format(out_dir)),
                            ('error', '{}/gridjob.err'.format(out_dir)),
                            ('log', '{}/gridjob.log'.format(out_dir)), ('log_xml', 'true'),
                            ('should_transfer_files', 'YES'),
                            ('when_to_transfer_output', 'ON_EXIT')])
        if args.proxy:
            jdl.update([('use_x509userproxy', 'true')])
        if args.token:
            jdl.update([('+SciTokensFile', "\"{}\"".format(args.token))])
        if not args.zero_payload:
            # NOTE(review): 'arguments' appears twice in this update — the
            # second entry repeats/overrides the first depending on how
            # jess.api.JDL handles duplicate keys; confirm which is intended
            jdl.update([('arguments', args.exec_args),
                        ('transfer_input_files', ['{}/wnlogs.tgz'.format(wd), '{}/gridjob.tgz'.format(wd)]),
                        ('transfer_output_files', 'wnlogs.tgz'),
                        ('transfer_output_remaps', '\"wnlogs.tgz={}/wnlogs.tgz\"'.format(out_dir)),
                        ('arguments', ' -v {} -f {} -m -c {} '
                                      '-t 600 -w 2'.format(args.vo, args.vo_fqan, args.resource)),
                        ])
        # create an empty wnlogs.tgz placeholder in the working dir
        open(os.path.join(wd, 'wnlogs.tgz'), 'w').close()
        return jdl
    elif 'condor' in args.backend:
        # grid-universe submission; grid_resource/rsl derived from --resource
        gr, rsl = __get_resource(args)
        jdl = jess.api.JDL([('universe', 'grid'), ('executable', os.path.basename(args.executable)),
                            ('transfer_executable', 'true'),
                            ('output', '{}/gridjob.out'.format(out_dir)),
                            ('error', '{}/gridjob.err'.format(out_dir)),
                            ('log', '{}/gridjob.log'.format(out_dir)), ('log_xml', 'true'),
                            ('when_to_transfer_output', 'ON_EXIT'),
                            ('leave_in_queue', '(JobStatus != 5)'),
                            ('grid_resource', gr),
                            ])
        if args.proxy:
            jdl.update([('use_x509userproxy', 'true')])
        if args.token:
            jdl.update([('+SciTokensFile', "\"{}\"".format(args.token))])
        if not args.zero_payload:
            jdl.update([('arguments', args.exec_args),
                        ('transfer_input_files', ['{}/wnlogs.tgz'.format(wd), '{}/gridjob.tgz'.format(wd)]),
                        ('transfer_output_files', 'wnlogs.tgz'),
                        ('transfer_output_remaps', '\"wnlogs.tgz={}/wnlogs.tgz\"'.format(out_dir)),
                        ('should_transfer_files', 'YES')])
        # create empty wnlogs.tgz
        open(os.path.join(wd, 'wnlogs.tgz'), 'w').close()
        if rsl:
            jdl.update(rsl)
        return jdl
    else:
        return False


def __get_resource(args):
    """Derive the HTCondor ``grid_resource`` string and backend-specific RSL
    from ``args.resource``.

    Expected URI format: ``<type>://<host>[:<port>]/<sched>/<jms>/<queue>``.

    :return: ``(grid_resource, rsl)`` on success, where ``rsl`` is a dict of
             extra submit attributes or None; ``(False, <error message>)``
             when the resource URI cannot be parsed.
    """
    # todo: rewrite as add_resource supporting other backends
    # cream -> alex4.nipne.ro:8443/cream-pbs-alice + wsdl
    # arc   -> arc2.farm.particle.cz:2811/nordugrid-Condor-grid + xrsl
    port_map = {'arc': '2811', 'cream': '8443',
                'gt': '2119', 'htcondor': '9619',
                'condor': '9619', 'gt5': '2119',
                'nordugrid': '2811'}
    host = args.hostname
    res = args.resource
    grid_resource = ''
    rsl = None

    # sanity
    if '://' not in res:  # check if type is there
        return False, 'Invalid resource format, missing type (arc, cream, gt, condor, etc)'

    if ('condor://' in res or 'htcondor://' in res) and not re.search(r'(.*)://(.*?)/(.*)', res):
        # adding default path to condor resources (sched defaults to hostname)
        res += '/%s/htcondor/noqueue' % host

    if not re.search(r'(.*)://(.*?)/(.*)', res):
        return False, 'Invalid resource format (missing sched, jms and queue)'

    # add default port if none specified
    port_match = re.search(r'.*:(\d+)/.*$', res)
    if not port_match:
        prefix_match = re.search(r'(.*)://(.*?)/(.*)', res)
        if not prefix_match:
            return False, 'Invalid resource format'
        tmp_res = prefix_match.groups()
        if tmp_res[0] not in port_map:
            # fix: format the message (was returning a 3-tuple by mistake)
            return False, 'Unknown ce type %s' % tmp_res[0]
        res = "%s://%s:%s/%s" % (tmp_res[0], tmp_res[1], port_map[tmp_res[0]], tmp_res[2])

    match = re.search(r'(.+?)://.*:(\d+)/(.*?)/(.+?)/(.*)$', res)  # full URI match
    if not match:
        return False, 'Invalid resource format'
    (ce_type, port, sched_name, jm, queue) = match.groups()

    if ce_type == 'cream':
        # gr = 'cream https://' + res
        if queue != 'noqueue':
            grid_resource = 'cream https://' + host + ':' + str(port) + '/ce-cream/services/CREAM2 ' + jm + ' ' + queue
        else:
            grid_resource = 'cream https://' + host + ':' + str(port) + '/ce-cream/services/CREAM2 ' + jm + ' '
    elif ce_type == 'condor' or ce_type == 'htcondor':
        grid_resource = 'condor ' + sched_name + ' ' + host + ':' + str(port)
        if queue != 'noqueue':
            rsl = {'+remote_queue': '\"%s\"' % queue}
    elif ce_type == 'arc' or ce_type == 'nordugrid':
        grid_resource = 'nordugrid ' + host
        if queue != 'noqueue':
            rsl = {'nordugrid_rsl': '(walltime=1800)(cputime=1800)(queue=' + queue + ')' +
                                    '(runtimeenvironment=ENV/PROXY)'}
        else:
            rsl = {'nordugrid_rsl': '(walltime=1800)(cputime=1800)(runtimeenvironment=ENV/PROXY)'}
        if args.arc_rsl:
            rsl['nordugrid_rsl'] = rsl['nordugrid_rsl'] + args.arc_rsl
    elif ce_type in ('gt', 'gt5'):
        # fix: original compared the literal string 'ce_type' to 'gt5',
        # which was always False, so gt5 resources never matched here
        grid_resource = 'gt2 ' + host + ':' + str(port) + '/jobmanager'
        if jm:
            grid_resource += '-' + jm
        if queue != 'noqueue':
            rsl = {'globus_rsl': '(jobtype=single)(queue=' + queue + ')'}
        else:
            rsl = {'globus_rsl': '(jobtype=single)'}

    return grid_resource, rsl


def _traverse_dir(d, match=None):
    """Yield paths of files under directory *d* whose base name contains
    *match*.

    Fix: with the default ``match=None`` the original raised ``TypeError``
    (``None in f``); ``None`` now means "yield every file", which is
    backward compatible for all existing callers (they pass match='json').
    """
    for root, _dirs, files in os.walk(d):
        for f in files:
            if match is None or match in f:
                yield os.path.join(root, f)


def _tar_filter(members):
    """Select only tar members whose base file name contains 'json'
    (the WN result messages)."""
    return (member for member in members
            if 'json' in os.path.basename(member.name))


def _publish_wn_results(out_dir, io, args, wnfm_config=None):
    """Extract the worker-node results tarball and publish every JSON result
    message as a passive check result.

    :param out_dir: job output directory containing wnlogs.tgz
    :param io: nap plugin io object (batch_passive_out is used for publishing)
    :param args: parsed command-line arguments; when args.web_dir is set the
                 full details are also written to the web directory
    :param wnfm_config: optional WN-qFM config dict; its 'wn_metric_map'
                        remaps metric names before publishing
    :raises jess.JessError: when the tarball is missing or unreadable
    """
    wnlogs_file = os.path.join(out_dir, 'wnlogs.tgz')
    print("Worker node results tarball at: %s" % wnlogs_file)
    if not os.path.isfile(wnlogs_file):
        raise jess.JessError('Unhandled exception while trying to access wn tarball')
    # untar only the json result messages (see _tar_filter);
    # fix: use a context manager so the tarfile handle is always closed
    try:
        with tarfile.open(wnlogs_file, 'r:gz') as tar:
            tar.extractall(path=out_dir, members=_tar_filter(tar))
    except Exception as e:
        raise jess.JessError('Unhandled exception while processing tarball {}'.format(e))

    # publish
    for wn_msg_f in _traverse_dir(out_dir, match='json'):
        with open(wn_msg_f, 'r') as wn_msg_h:
            msg = json.load(wn_msg_h)
        # timestamp = msg.get('timestamp', time.time())
        # fix: raw string for the regex ('\d' is an invalid escape otherwise)
        match = re.match(r'([a-zA-Z0-9_]*://)?([^/:$]*):?(\d+)?/?', msg.get('service_uri', 'UNDEFINED'))
        hostname = match.group(2)
        service = msg.get('nagios_name', 'UNDEFINED')
        if wnfm_config and 'wn_metric_map' in wnfm_config.keys():
            # remap the WN metric name according to the qFM configuration
            if msg.get('nagios_name', 'UNDEFINED') in wnfm_config['wn_metric_map']:
                service = wnfm_config['wn_metric_map'][msg.get('nagios_name')]
        status = msg.get('status', 'UNDEFINED')
        output = msg.get('summary', 'UNDEFINED')
        details = msg.get('details', 'UNDEFINED')
        if args.web_dir:
            # store full details in the web dir and link to them from the
            # published result
            with stream_io.open(os.path.join(args.web_dir, msg.get('nagios_name', 'UNDEFINED')), 'w',
                                encoding='utf-8', errors='ignore') as wn_o:
                wn_o.write(details)
            details = '### Full plugin output can be found at <a href="{}">{}</a>\n'.format(
                      os.path.join(args.web_url, msg.get('nagios_name', 'UNDEFINED')),
                      os.path.join(args.web_url, msg.get('nagios_name', 'UNDEFINED'))) + details
        gathered_at = msg.get('gatheredAt', '')
        sum_out = "%s: %s" % (gathered_at, output)
        print("Submitting worker node result: %s, %s, %s (%s)" % (hostname, service, status,
                                                                  sum_out.encode('ascii', errors='ignore')))
        io.batch_passive_out(hostname, service, nap.core.get_code(status), sum_out, details)
    print('All WN results successfully submitted.')


def payload_factory(wd, args):
    """Assemble the gridjob.tgz payload tarball inside working dir *wd*."""
    payload = jess.api.Payload(os.path.join(wd, 'gridjob.tgz'))
    if args.zero_payload:
        # probe configured to ship a zero-byte payload only
        payload.zero_file()
        return payload
    # environment file plus the WN-qFM framework itself
    payload.add(args.env_file, prefix='./etf-env.sh')
    payload.add('/usr/libexec/grid-monitoring/wnfm', prefix='./')
    if args.wnfm_static:
        payload.add('/usr/libexec/grid-monitoring/wnfm-static', prefix='./')
    if args.add_payload:
        # extra top-level directories requested on the command line
        for extra_dir in args.add_payload.split(','):
            payload.add(extra_dir, prefix='./')
    return payload


def etf_log():
    """Print the ETF job timing log: configured timeout limits and the
    timestamps tracked for the current job (all rendered as UTC)."""
    time_fmt = '%Y-%m-%d %H:%M:%S'

    def _as_utc(ts):
        # render a unix timestamp as a UTC date-time string
        return datetime.datetime.utcfromtimestamp(ts).strftime(time_fmt)

    if hasattr(app, 'limits'):
        print('=== ETF job log:')
        print('Timeout limits configured were:')
        for state, minutes in app.limits.items():
            print('    {} -> {} minutes'.format(state, minutes))
    if hasattr(app.job, 'timer'):
        print('Current time: {}'.format(_as_utc(time.time())))
        print('Job started: {}'.format(_as_utc(app.job.ts_job_start)))
        print('Job finished: {}'.format(_as_utc(app.job.ts_job_end)))
        print('Job tracking times (entered):')
        for state, entered_ts in app.job.timer.items():
            print('    {} -> {} '.format(state, _as_utc(entered_ts)))


def etf_job_details():
    """Print a human-readable report of the tracked job: credentials, job
    description, submission command/output, log and last known status."""
    job = app.job
    if job.ts_job_end:
        # job already finished — include the timing log first
        etf_log()
    print('=== Credentials:')
    for label, cred in (('x509:', job.proxy_details), ('scitoken:', job.token_details)):
        if cred:
            print(label)
            print(cred)
    print('=== Job description:')
    print(job.jdl)
    print('=== Job submission command:')
    print(job.submit_cmd)
    print(job.submit_out)
    if job.log:
        print('=== Job log:')
        print(job.log)
    if job.status_out:
        print('=== Last job status:')
        print(job.status_out)
    if job.verbose_status:
        print(job.verbose_status)

def etf_web_header(args):
    """Print the HTML header linking to the job artifacts in the web dir."""
    print("### Job details (please follow links below to get additional debug information)")
    # (link label, file name) pairs rendered as anchors under args.web_url
    artifacts = (('jdl', 'gridjob.jdl'), ('stdout', 'job.out'),
                 ('log', 'job.log'), ('etf_log', 'etf.log'),
                 ('env', 'job.env'))
    anchors = ['<a href={}>{}</a>'.format(os.path.join(args.web_url, fname), label)
               for label, fname in artifacts]
    header = '### Job: ' + ' '.join(anchors)
    if app.job.backend == 'arc':
        # ARC jobs additionally expose the grid-manager log
        header += ' <a href={}>gmlog</a>'.format(os.path.join(args.web_url, 'gmlog'))
    print(header + '<br>')


def etf_out_to_file(args, io):
    """Persist the captured plugin output to <web_dir>/etf.log, dropping the
    web-header lines ('### Job ...' / '### Full ...') that only make sense
    inline in the web view."""
    lines = io.getvalue().split('\n')
    kept = [line for line in lines
            if '### Job' not in line and '### Full' not in line]
    with open(os.path.join(args.web_dir, 'etf.log'), 'w') as log_file:
        log_file.write('\n'.join(kept))


def etf_web_copy_files(args):
    """Copy job artifacts (JDL, env file, stdout/err/log) from the working
    directory into the web directory and make them world-readable.

    NOTE(review): copy_file/copy_tree come from distutils, which was removed
    in Python 3.12 — consider migrating to shutil; verify ``update=1``
    (skip when destination is newer) semantics are preserved when doing so.
    """
    if os.path.isfile(os.path.join(args.wd, 'gridjob.jdl')):
        copy_file(os.path.join(args.wd, 'gridjob.jdl'), os.path.join(args.web_dir, 'gridjob.jdl'), update=1)
    if os.path.isfile(os.path.join(args.wd, 'etf-env.sh')):
        copy_file(os.path.join(args.wd, 'etf-env.sh'), os.path.join(args.web_dir, 'job.env'), update=1)
    # ARC
    if os.path.isfile(os.path.join(args.wd, 'out', 'arc.out')):
        copy_file(os.path.join(args.wd, 'out', 'arc.out'), os.path.join(args.web_dir, 'job.out'), update=1)
    if app.job.backend == 'arc' and app.job.output and os.path.isdir(os.path.join(app.job.output, 'gmlog')):
        copy_tree(os.path.join(app.job.output, 'gmlog'), os.path.join(args.web_dir, 'gmlog'), update=1)
        copy_file(os.path.join(app.job.output, 'gmlog/errors'), os.path.join(args.web_dir, 'job.log'), update=1)
    # HT-Condor
    if os.path.isfile(os.path.join(args.wd, 'out', 'gridjob.log')):
        copy_file(os.path.join(args.wd, 'out', 'gridjob.log'), os.path.join(args.web_dir, 'job.log'), update=1)
    if os.path.isfile(os.path.join(args.wd, 'out', 'gridjob.out')):
        copy_file(os.path.join(args.wd, 'out', 'gridjob.out'), os.path.join(args.web_dir, 'job.out'), update=1)
    if os.path.isfile(os.path.join(args.wd, 'out', 'gridjob.err')):
        copy_file(os.path.join(args.wd, 'out', 'gridjob.err'), os.path.join(args.web_dir, 'job.err'), update=1)

    # fix permissions so the web server can serve the copied files
    for root, dirs, files in os.walk(args.web_dir):
        for d in dirs:
            os.chmod(os.path.join(root, d), 0o755)
        for f in files:
            os.chmod(os.path.join(root, f), 0o644)


@app.metric(metric_name="JobState", seq=1)
def test_js_state(args, io):
    """Active metric: track the lifecycle of a grid job on the configured CE.

    Prepares working/web directories, the payload tarball and the JDL, then
    runs the jess tracker, which either submits a new job or polls the
    currently tracked one.  This metric reports whether the tracking step
    itself worked; the job outcome is published by the passive JobSubmit
    metric using the ``app.job`` state stored here.
    """
    # derive a filesystem-safe path component from the FQAN, if one was given
    fqan_path = None
    if args.vo_fqan and ('/' in args.vo_fqan or '=' in args.vo_fqan):
        fqan_path = args.vo_fqan.replace("/", ".").replace("=", ".")[1:]
    wd = os.path.join(args.work_dir, fqan_path or args.vo, args.backend, args.hostname)
    args.wd = wd
    if not os.path.exists(wd):
        os.makedirs(wd)
    out_dir = os.path.join(wd, 'out')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    os.chdir(wd)
    log.debug('Working directory: {}'.format(wd))

    if not args.proxy and not args.token:
        io.status = nap.UNKNOWN
        io.summary = 'ETF internal error: neither proxy nor token was specified'
        return

    if args.web_dir:
        # mirror the working-dir layout under the web dir and remember the URL
        webd = os.path.join(args.web_dir, fqan_path or args.vo, args.backend, args.hostname)
        args.web_dir = webd
        args.web_url = os.path.join('/etf-raw/', fqan_path or args.vo, args.backend, args.hostname)
        if not os.path.exists(webd):
            os.makedirs(webd)
        log.debug('Web directory: {}'.format(webd))

    if args.env_file and os.path.exists(args.env_file) and os.access(args.env_file, os.R_OK):
        shutil.copy(args.env_file, os.path.join(wd, 'etf-env.sh'))
    else:
        # NOTE(review): execution deliberately continues after setting UNKNOWN —
        # an (empty) env file is still created below in all cases; confirm the
        # missing 'return' here is intentional
        io.status = nap.UNKNOWN
        io.summary = 'ETF internal error: failed to access env file {}'.format(args.env_file)

    args.env_file = os.path.join(wd, 'etf-env.sh')
    with open(args.env_file, 'a') as env_f:  # we want an empty file in all cases
        if args.env_var:
            for e in args.env_var:
                env_f.write(e + '\n')

    # ETF WN-qFM config
    if args.wnfm_config:
        with open(args.wnfm_config, 'r') as wn_cf:
            wnfm_config = json.load(wn_cf)
    else:
        wnfm_config = None

    if args.wnfm_static and not os.path.isdir(args.wnfm_static):
        io.status = nap.UNKNOWN
        io.summary = 'ETF internal error: static ETF WN-qFM path not found'
        return

    # ETF WN-qFM run counter
    etf_wnfm_counter = 0
    if wnfm_config and 'counter_enabled' in wnfm_config.keys() and wnfm_config['counter_enabled']:
        # etf_wnfm.json persists internal WN qFM state over multiple executions of check_js
        # e.g. keeps a counter of completed runs (job executions)
        if os.path.isfile(os.path.join(wd, 'etf_wnfm.json')):
            with open(os.path.join(wd, 'etf_wnfm.json'), 'r') as wnfm_f:
                wnfm_config_struct = json.load(wnfm_f)
                if 'run_counter' in wnfm_config_struct.keys():
                    etf_wnfm_counter = int(wnfm_config_struct['run_counter'])
        else:
            with open(os.path.join(wd, 'etf_wnfm.json'), 'w') as wnfm_f:
                etf_wnfm_meta = {'run_counter': etf_wnfm_counter}
                json.dump(etf_wnfm_meta, wnfm_f)

    # parse per-state timeout limits (e.g. global:3600,idle:3000,running:15)
    limits = dict()
    if args.timeout_limits:
        l_split = args.timeout_limits.split(',')
        for li in l_split:
            st, to = li.split(':')
            limits[st.strip()] = int(to)
        log.debug('Limits configured: {}'.format(limits))

    log.debug('Checking/copying executable {} ...'.format(args.executable))
    if os.path.exists(args.executable):
        shutil.copy(args.executable, wd)

    if not args.zero_payload and 'etf_run.sh' in args.executable:
        log.debug('Compiling arguments for ETF WN-qFM ...')
        # extract the bare host from the resource URI for the -c option
        if '://' in args.resource:
            res = args.resource.split('/')[2]
        else:
            res = args.resource.split('/')[0]
        args.exec_args = '-v {} -c {} -p {} -t {} -T {} -d'.format(args.vo, res, args.wnfm_pool,
                                                                   args.wnfm_global_timeout,
                                                                   args.wnfm_test_timeout)
        if args.wnfm_static:
            args.exec_args = args.exec_args + ' -s'

    log.debug('Generating JDL ...')
    jdl = jdl_factory(wd, out_dir, args)
    # fix: bail out BEFORE serializing — jdl_factory returns False for an
    # unsupported backend, and the original called False.serialize() first,
    # raising AttributeError instead of the graceful CRITICAL below
    if not jdl:
        io.status = nap.CRITICAL
        io.summary = 'ETF internal error: Failed to generate a valid JDL'
        return
    jdl.serialize(os.path.join(wd, 'gridjob.jdl'), backend=args.backend, jdl_ads=args.jdl_ads)

    log.debug('Creating payload tarball ...')
    tb = payload_factory(wd, args)

    log.debug('Running tracker ...')
    timer = jess.jobs.Timer(limits)
    if args.backend == 'arc':
        os.environ['ARC_LOCATION'] = '/usr'
        jmi = jess.jobs.JMI(wd, args.backend, jdl, tb, proxy=args.proxy, token=args.token,
                            resource=args.resource, arc_debug=args.arc_debug,
                            arc_sub_type=args.arc_sub_type, arc_info_type=args.arc_info_type,
                            arc_registry=args.arc_registry, arc_ce=args.arc_ce)
        tr = jess.jobs.Tracker(wd, jmi, job_schedule=args.job_schedule)
    elif args.backend == 'cream':
        jmi = jess.jobs.JMI(wd, args.backend, args.proxy, jdl, tb, resource=args.resource)
        tr = jess.jobs.Tracker(wd, jmi, job_schedule=args.job_schedule)
    elif 'condor' in args.backend:
        jmi = jess.jobs.JMI(wd, args.backend, jdl, tb, proxy=args.proxy, token=args.token,
                            resource=args.resource, pool=args.pool,
                            schedd=args.schedd)
        tr = jess.jobs.Tracker(wd, jmi, job_schedule=args.job_schedule)
    else:
        io.status = nap.WARNING
        io.summary = 'ETF unknown backend specified ({})'.format(args.backend)
        return

    try:
        tr.attach(timer)
        job = tr.run()
        # stash results on the app for the passive JobSubmit metric
        app.job = job
        app.limits = limits
    except jess.JessConfigurationError as e:
        io.status = nap.WARNING
        io.summary = 'ETF internal configuration issue'
        print(e)
        return
    except jess.JessPayloadException as e:
        io.status = nap.WARNING
        io.summary = 'ETF payload configuration issue'
        print(e)
        return
    except jess.JobSubmitError as e:
        io.status = nap.CRITICAL
        io.summary = e.msg
        # pass failure to submit if direct submission
        if 'condor:' not in args.resource:
            job = jess.jobs.Job()
            job.state = 'failed'
            job.job_id = 0
            job.verbose_status = e.msg
            job.log = ''
            job.submit_cmd = e.cmd
            job.submit_out = e.details
            job.jdl = jdl
            job.proxy = args.proxy
            job.token = args.token
            app.job = job
        print(e.ret_code, e.cmd)
        print(e.details)
        print(jdl)
        return
    except jess.JobStatusError as e:
        io.status = nap.WARNING
        io.summary = 'Failed to get status of a job'
        print(e.ret_code, e.cmd)
        print(e.details)
        return
    except jess.JessError as e:
        io.status = nap.CRITICAL
        io.summary = 'ETF internal exception was thrown'
        print(e)
        return

    # check timer
    timed_out = timer.check_limits(job)
    if timed_out:
        job.state = 'timeout'
        job.ts_job_end = time.time()
        jess.jobs.Job.save(job)
        io.status = nap.OK
        io.summary = 'Job timed out ({}) {}'.format(job.job_id, job.status_out)
        etf_job_details()
        jmi.purge(job.job_id)
        return

    if job.state == 'submit':
        io.status = nap.OK
        io.summary = 'Job was successfully submitted ({})'.format(job.job_id)
        etf_job_details()
        return
    elif job.state == 'idle':
        io.status = nap.OK
        io.summary = 'Existing job ({}) was found in status {}'.format(job.job_id, job.verbose_status)
        etf_job_details()
        return
    elif job.state == 'waiting':
        io.status = nap.OK
        io.summary = 'Previous job finished, waiting {:.2f} mins to submit a new one'.format(
                                int(args.job_schedule) - ((time.time() - job.ts_job_start) / 60))
        print('Job details for job that already terminated:')
        etf_job_details()
        return
    elif job.state == 'failed':
        io.status = nap.OK
        io.summary = 'Job ({}) has failed with status:{}'.format(job.job_id, job.status)
        etf_job_details()
        return
    elif job.state == 'completed':
        io.status = nap.OK
        io.summary = 'Job successfully completed (status:{}, id:{})'.format(job.verbose_status, job.job_id)
        etf_job_details()
        try:
            if not args.zero_payload and 'etf_run.sh' in args.executable:
                _publish_wn_results(out_dir, io, args, wnfm_config=wnfm_config)
            if wnfm_config and 'counter_enabled' in wnfm_config.keys() and wnfm_config['counter_enabled']:
                # increment ETF WN qFM run counter
                with open(os.path.join(wd, 'etf_wnfm.json'), 'w') as wnfm_f:
                    etf_wnfm_meta = {'run_counter': etf_wnfm_counter + 1}
                    json.dump(etf_wnfm_meta, wnfm_f)
        except jess.JessError as e:
            io.status = nap.WARNING
            io.summary = 'ETF internal error while publishing WN results'
            print(e)
            traceback.print_exc(file=sys.stdout)
        except Exception as e:
            io.status = nap.WARNING
            io.summary = 'ETF internal error; exception was thrown'
            print(e)
            traceback.print_exc(file=sys.stdout)
        return


@app.metric(metric_name="JobSubmit", passive=True)
def test_js_submit(args, io):
    """Passive metric: report the final outcome of the job tracked by
    test_js_state (stored on ``app.job``).

    Only acts once the job reached a terminal state (timeout/failed/
    completed); otherwise the passive result is left unset.  Also publishes
    job artifacts to the web directory and rotates out -> out_prev.
    """
    if not hasattr(app, "job"):
        # tracking metric did not run or failed before storing a job
        return
    if app.job.state not in ['timeout', 'failed', 'completed']:
        return
    if args.web_dir:
        print('### Full plugin output can be found at <a href="{}">{}</a>\n'.format(
            os.path.join(args.web_url, 'etf.log'), os.path.join(args.web_url, 'etf.log')))
        etf_web_header(args)
        etf_web_copy_files(args)
    # out -> out_prev
    if os.path.isdir(os.path.join(args.wd, 'out_prev')):
        shutil.rmtree(os.path.join(args.wd, 'out_prev'))
    if os.path.isdir(os.path.join(args.wd, 'out')):
        os.rename(os.path.join(args.wd, 'out'), os.path.join(args.wd, 'out_prev'))

    if app.job.state == 'timeout':
        io.status = nap.WARNING
        io.summary = 'Job ({}) has timed out while in status: {}'.format(app.job.job_id, app.job.verbose_status)
        etf_job_details()
        if args.web_dir:
            etf_out_to_file(args, io)
        return
    if app.job.state == 'failed':
        io.status = nap.CRITICAL
        io.summary = 'Job ({}) has failed with status: {}'.format(app.job.job_id, app.job.verbose_status)
        etf_job_details()
        # special cases: downgrade well-understood failures to WARNING
        if app.job.backend == 'cream':
            if 'Maximum number of jobs already in queue' in app.job.status_out:
                io.status = nap.WARNING
                io.summary = 'Job failed due to maximum number of jobs in queue'
        if app.job.backend == 'arc':
            if 'Job failed with exit code 1' in app.job.status_out:
                io.status = nap.WARNING
                io.summary = 'Job completed with exit code 1'
            if 'PeriodicRemove evaluated to TRUE' in app.job.status_out:
                io.status = nap.WARNING
                io.summary = 'Status failed due to periodic removal'
        if args.web_dir:
            etf_out_to_file(args, io)
        return
    elif app.job.state == 'completed':
        io.status = nap.OK
        io.summary = 'Job successfully completed'
        etf_job_details()
        if args.web_dir:
            etf_out_to_file(args, io)
        return


if __name__ == '__main__':
    # Entry point: hand control to the nap plugin framework, which runs the
    # registered metric functions (test_js_state, test_js_submit).
    app.run()
