# This is a command line utility that can dump lists of warnings in a codesonar analysis.

# Run me with:
#  codesonar dump_warnings.py ...args...

# To see help:
#  codesonar dump_warnings.py help

import argparse
import csv
import json
import os
import subprocess
import sys
import tempfile
import urllib.parse

import gtr
import gtr.json
import collections
import io

gtr.util.ensure_sys_path(os.path.join(gtr.gthome(), 'third-party', 'github-sarif-integration'))
import sarif_parser
from sarif_state import SarifState


class CommandLineException(Exception):
    """Error caused by the way the tool was invoked.

    main() catches this exception, prints the usage text followed by the
    message, and returns a non-zero exit status.
    """

class GtrCsvWriter:
    """Small CSV row writer that encodes every cell with gtr.csv_field.

    outfile must provide a write() method accepting strings.
    """

    def __init__(self, outfile, delimiter=',', lineterminator='\n'):
        self.outfile = outfile
        self.delimiter = delimiter
        self.lineterminator = lineterminator

    def writerow(self, row):
        """Write the cells of row as one delimited, terminated line."""
        encoded_cells = (gtr.csv_field(cell) for cell in row)
        self.outfile.write(self.delimiter.join(encoded_cells))
        self.outfile.write(self.lineterminator)


class DumpWarningsSarifState(SarifState):
    """SarifState that only counts results.

    Every hook except results_item_array_element_end is a deliberate
    no-op; the count is left in result_count.
    """

    def __init__(self):
        super().__init__()
        self.result_count = 0

    def run_object_member_end(self, tool_name, message_strings):
        """Ignored: only results are counted."""

    def run_object_start(self, parser):
        """Ignored: only results are counted."""

    def original_uri_base_id_add(self, uri, uriBaseId, key):
        """Ignored: only results are counted."""

    def resources_object_member_end(self, parser, key):
        """Ignored: only results are counted."""

    def file_item_add(self, file_item):
        """Ignored: only results are counted."""

    def rules_v1_object_member_end(self, parser, key):
        """Ignored: only results are counted."""

    def rules_item_array_element_end(self, parser, idx):
        """Ignored: only results are counted."""

    def results_item_array_element_end(self, parser, idx):
        """Count one more finished result element."""
        self.result_count += 1


def count_sarif_warnings(sarif_path):
    """Return the number of results found in the SARIF file at sarif_path.

    While the parser runs, sys.stdout is pointed at sys.stderr and is
    restored afterwards even if parsing fails -- presumably so that nothing
    the parser prints ends up in the dumped output.
    """
    state = DumpWarningsSarifState()
    original_stdout = sys.stdout
    sys.stdout = sys.stderr
    try:
        sarif_parser.process_sarif(sarif_path, state)
    finally:
        sys.stdout = original_stdout
    return state.result_count


def pipe_file(infile, outfile, buflen=4096):
    """Copy infile to outfile in reads of at most buflen, then flush outfile.

    Copying stops at the first empty read.  Neither stream is closed.
    """
    while True:
        block = infile.read(buflen)
        if not block:
            break
        outfile.write(block)
    outfile.flush()


def assemble_columnopts(parser, args):
    """Build the JSON text describing the requested columns and ordering.

    For --json/--csv/--sarif the columns are the --show-column values (the
    "columns" key is omitted when there are none).  Otherwise the columns
    are the mapping keys referenced by the --format string.  Each --sort
    value must start with "ascending:" or "descending:"; anything else
    prints the usage text plus an error to stderr and exits with status 1.
    """
    raw_options = {}
    if args.json or args.csv or args.sarif:
        if args.show_column:
            raw_options['columns'] = list(args.show_column)
    else:
        # Discover the keys used by the format string: keep formatting
        # against a probe mapping, adding each key that is reported missing,
        # until the format succeeds.
        columns = []
        probe = {}
        while True:
            try:
                args.format % probe
            except KeyError as missing:
                column = missing.args[0]
                probe[column] = 0  # 0 is compatible with every format specifier
                columns.append(column)
            else:
                break
        raw_options['columns'] = columns

    directions = (('ascending:', 'ASCENDING'), ('descending:', 'DESCENDING'))
    for spec in args.sort:
        for prefix, direction in directions:
            if spec.startswith(prefix):
                column = spec[len(prefix):]
                raw_options.setdefault('orderBy', []).append({column: direction})
                break
        else:
            parser.print_help(file=sys.stderr)
            print('invalid --sort: %s' % spec, file=sys.stderr)
            sys.exit(1)
    return json.dumps(raw_options)

def parse_options(parser, args):
    """Return the dict of query variables derived from the parsed arguments.

    The same column/ordering JSON is supplied under two keys, and the
    --visible-warnings name is JSON-encoded as "filter".  "query" is only
    present when --search was given.
    """
    gridopts = assemble_columnopts(parser, args)
    options = {
        # For warning search url
        'swarnings_json': gridopts,
        # For analysis home page url
        'awarnings_json': gridopts,
        'filter': json.dumps(args.visible_warnings),
    }
    if args.search:
        options['query'] = args.search
    return options

def warnings_search_url(url_format, args, options, hub, aid):
    """Return the hub URL from which the warnings are fetched.

    options is updated in place: SARIF-only settings are added when
    url_format is 'sarif', and a "scope" entry is added for plain searches
    restricted to an analysis.  With --gained/--lost-since-previous-analysis
    the analysis comparison URL is used and options is forwarded as a single
    JSON-encoded "forward_qvars" variable; otherwise options becomes the
    query string of the search URL.
    """
    assert isinstance(options, dict)
    if url_format == 'sarif':
        for key, value in (('srcroot', args.src_root), ('detail', args.sarif_detail)):
            if value:
                options[key] = value

    if args.gained_since_previous_analysis:
        compare_mode = 'gained'
    elif args.lost_since_previous_analysis:
        compare_mode = 'lost'
    else:
        compare_mode = None

    if compare_mode is not None:
        url = f'{hub}/analysis/{aid}-compare.{url_format}?compare_mode={compare_mode}&compare_target=previous'
        if options:
            url += '&forward_qvars=' + gtr.urlencode(json.dumps(options))
        return url

    if aid:
        options['scope'] = 'aid:' + aid
    if not options:
        return f'{hub}/search.{url_format}'
    query = '&'.join(f'{gtr.urlencode(k)}={gtr.urlencode(v)}' for k, v in options.items())
    return f'{hub}/search.{url_format}?' + query

def add_analysis_args(parser):
    """Register the options that select an analysis and the hub to query."""
    plain_options = (
        ('--project-file', '.prj file name'),
        ('--analysis-id', 'analysis ID on hub'),
        ('--analysis-url', 'analysis URL'),
        ('--project-name', 'name of the project on the hub--find most recent successful analysis (introduces a race condition)'),
    )
    for flag, description in plain_options:
        parser.add_argument(flag, help=description)
    parser.add_argument('--hub', help='hub address', default='[::1]:7340')

def add_filtering_args(parser):
    """Register --search and the repeatable --show-column option."""
    add = parser.add_argument
    add('--search', help='filter warnings using this search query')
    add('--show-column', metavar='"COLUMN NAME"', default=[], action='append', help='show column')

def add_network_args(parser):
    """Register the timeout and hub authentication options.

    These are the options that invoke_codesonar_get passes on to the
    'codesonar get' command line.
    """
    parser.add_argument('-t', metavar='TIMEOUT_SEC', help='time in seconds to wait for data to be retrieved')
    auth_options = (
        ('-auth', 'hub authentication mode'),
        ('-hubuser', 'hub username'),
        ('-hubpwfile', 'hub password file'),
        ('-hubbearerfile', 'hub bearer token file'),
        ('-hubcert', 'hub authentication certificate'),
        ('-hubkey', 'hub authentication private key'),
    )
    for flag, description in auth_options:
        parser.add_argument(flag, help=description)

def process_analysis_args(args, extra_spec_args=()):
    """Work out which hub and analysis the command line refers to.

    Returns a (hub, aid) tuple; aid is None when no analysis was selected
    and only extra_spec_args (e.g. a search query) narrowed the request.

    Raises CommandLineException when more than one analysis selector is
    given, when nothing at all is specified, when neither a hub nor an
    analysis URL is available, or when the selected project files or URL
    cannot be interpreted.
    """
    hub = args.hub
    selectors = (args.project_file, args.analysis_id, args.analysis_url, args.project_name)
    given = sum(1 for selector in selectors if selector)
    if given > 1:
        raise CommandLineException('Must specify no more than one of --project-file, --project-name, --analysis-id, or --analysis-url\n')
    if given == 0 and not any(extra_spec_args):
        raise CommandLineException('Must specify at least one of --project-file, --project-name, --analysis-id, --analysis-url, or --search\n')
    if not hub and not args.analysis_url:
        raise CommandLineException('Must specify at least one of --hub or --analysis-url')

    if args.project_file:
        # Accept foo, foo.prj or foo.prj_files; the analysis id is stored in
        # aid.txt inside the .prj_files directory.
        project_dir = args.project_file
        if project_dir.endswith('.prj'):
            project_dir += '_files'
        elif not project_dir.endswith('.prj_files'):
            project_dir += '.prj_files'
        aid_txt = os.path.join(project_dir, 'aid.txt')
        for required in (project_dir, aid_txt):
            if not os.path.exists(required):
                raise CommandLineException('Cannot find %s' % required)
        with open(aid_txt, 'r') as f:
            return hub, f.read().strip()

    if args.analysis_id:
        return hub, args.analysis_id

    if args.analysis_url:
        import re
        m = re.match(r'^(https?://[^/]+)/analysis/([0-9]+)[.]((html)|(xml)|(csv))$', args.analysis_url)
        if not m:
            raise CommandLineException('--analysis-url must be of the form http://server:port/analysis/123.html')
        # The URL supplies both the hub address and the analysis id.
        hub, aid = m.group(1, 2)
        return hub, aid

    if args.project_name:
        # Assume that most_recent_analysis prints any diagnostics
        # necessary in case of failure, such as the project is not
        # found or there are no successful analyses.
        return hub, most_recent_analysis(args)

    return hub, None

def main(argv):
    """Command line entry point: dump warnings (and optionally metrics).

    argv is the full argument vector including the program name.  Returns
    the process exit status: 0 on success, 1 for command line errors or
    when --fail-if-more-warnings-than is exceeded, otherwise the failing
    status of the 'codesonar get' subprocess or of the metrics retrieval.
    """
    parser = argparse.ArgumentParser(description='''Dump warnings from a hub.  Examples:
    codesonar dump_warnings.py --hub somecomputer:7340 --project-file path/to/foo.prj
    codesonar dump_warnings.py --hub somecomputer:7340 --project-name /A/Bc
    codesonar dump_warnings.py --hub somecomputer:7340 --search "aid:123 \\"class:Division By Zero\\" file:foo.c"
    codesonar dump_warnings.py --analysis-url http://snail:7340/analysis/123.html
    codesonar dump_warnings.py --analysis-url http://snail:7340/analysis/123.html --sort ascending:file --sort "ascending:line number"
    codesonar dump_warnings.py --analysis-url http://snail:7340/analysis/123.html --csv
    codesonar dump_warnings.py --analysis-url http://snail:7340/analysis/123.html --json --show-column "file path"
    codesonar dump_warnings.py --analysis-url http://snail:7340/analysis/123.html --json -o foo.json
    codesonar dump_warnings.py --analysis-url http://snail:7340/analysis/123.html --format "%(class)s in %(procedure)s on line %(line number)s"
    codesonar dump_warnings.py --analysis-url http://snail:7340/analysis/123.html --gained-since-previous-analysis --fail-if-more-warnings-than 0
''',
                                     formatter_class=argparse.RawTextHelpFormatter,
                                     prog='codesonar dump_warnings.py')
    add_analysis_args(parser)
    add_filtering_args(parser)
    parser.add_argument('--sort', action='append', help='result ordering', default=[], metavar='"ascending:COLUMN NAME" or "descending:COLUMN NAME"')

    default_format = '%(file)s:%(lineNumber)s:%(class)s'
    parser.add_argument('--format', default=default_format, help='python format string using column names; default is ' + default_format.replace('%', '%%'), metavar='"FORMAT"')
    parser.add_argument('--csv', action='store_true', help='dump in csv format')
    parser.add_argument('--json', action='store_true', help='dump in json format')
    parser.add_argument('--sarif', action='store_true', help='dump in SARIF format')
    parser.add_argument('--fail-if-more-warnings-than', type=int, help='exit with non-zero if there are more than this many warnings', metavar='LIMIT')
    parser.add_argument('--gained-since-previous-analysis', action='store_true', help='dump only warnings that did not exist in the previous analysis of the same project')
    parser.add_argument('--lost-since-previous-analysis', action='store_true', help='dump only warnings that exist in the previous analysis of the same project but not the current one')
    parser.add_argument('--visible-warnings', help='saved search name to use (defaults to "all")', default='all', metavar='NAME')
    parser.add_argument('--src-root', help='base path to use for relative source code uris in SARIF output')
    parser.add_argument('--sarif-detail', help='reduce detail of SARIF document while increasing download speed',
                        metavar='brief')
    parser.add_argument('--metrics-csv', help='dump metrics in csv format')
    parser.add_argument('--metrics-xml', help='dump metrics in xml format')
    parser.add_argument('--verbose', action='store_true', help='emit fetched urls to stderr')
    add_network_args(parser)
    parser.add_argument('-o', help='output filename', metavar='OUTPUT_FILENAME')
    args = parser.parse_args(argv[1:])
    # Each formatted warning is written as its own line.
    args.format += '\n'

    options = parse_options(parser, args)

    try:
        # At most one output format may be selected; a non-default --format
        # counts as a selection.
        fmt_count = 0
        if args.sarif:
            fmt_count += 1
        if args.csv:
            fmt_count += 1
        if args.json:
            fmt_count += 1
        if args.format.strip() != default_format.strip():
            if args.show_column:
                raise CommandLineException('--format and --show-column are mutually exclusive')
            fmt_count += 1
        if fmt_count > 1:
            raise CommandLineException('--format, --csv, --json, and --sarif are mutually exclusive')
        if args.sarif:
            if args.sort:
                raise CommandLineException('--sarif and --sort are mutually exclusive')
            if args.show_column:
                raise CommandLineException('--sarif and --show-column are mutually exclusive')
        else:
            if args.src_root:
                raise CommandLineException('--src-root can only be used with --sarif')
            if args.sarif_detail:
                raise CommandLineException('--sarif-detail can only be used with --sarif')
        hub, aid = process_analysis_args(args, extra_spec_args=(args.search,))
        if args.metrics_csv and not aid:
            raise CommandLineException('--metrics-csv can only be used when specifying a specific analysis')
        if args.metrics_xml and not aid:
            raise CommandLineException('--metrics-xml can only be used when specifying a specific analysis')
    except CommandLineException as e:
        # Keep the whole diagnostic on stderr so that stdout, which may be
        # piped or redirected, carries nothing but dumped warnings.
        parser.print_help(file=sys.stderr)
        print(file=sys.stderr)
        print('ERROR:', e, file=sys.stderr)
        return 1

    # The plain --format mode is rendered locally from the JSON download.
    if args.csv:
        url_format = 'csv'
    elif args.sarif:
        url_format = 'sarif'
    else:
        url_format = 'json'

    url = warnings_search_url(url_format, args, options, hub, aid)

    # '-o -' and '-o ""' mean standard output, just like omitting -o.
    outfilepath = args.o if args.o not in ('', '-') else None
    rv = 1
    with invoke_codesonar_get(args, '-', url) as p:
        if outfilepath is not None:
            outfile = open(outfilepath, 'wb')
        else:
            sys.stdout.flush() # Not sure whether this is needed to
                               # ensure sys.stdout flushes to
                               # sys.stdout.buffer, but perhaps.
            outfile = sys.stdout.buffer
        try:
            rv = dump_warnings(args, p.stdout, outfile, outfilepath)
            if not rv:
                rv = p.wait()
        finally:
            # dump_warnings may already have closed a file it was given;
            # standard output is never closed here.
            if outfilepath is not None and outfile is not None and not outfile.closed:
                outfile.close()

    # Finally, retrieve metrics if requested, but if retrieving the warnings
    # failed, then there's no point even attempting to do so because that
    # will fail too. We need a valid aid too, but that's checked in dump_metrics()
    if rv == 0:
        if args.metrics_csv:
            rv = dump_metrics(args, aid, args.metrics_csv, 'csv')
        if rv == 0 and args.metrics_xml:
            rv = dump_metrics(args, aid, args.metrics_xml, 'xml')
    return rv

def dump_metrics(args, aid, outfile, fmt):
    """Download the metrics of analysis aid into the file named outfile.

    fmt is the URL extension of the metrics page (main() passes 'csv' or
    'xml').  Returns 1 when aid is None, otherwise the exit status of the
    'codesonar get' subprocess; a diagnostic is printed to stderr on
    failure.
    """
    if aid is None:
        print("Failed to dump metrics because a valid analysis id was not found.", file=sys.stderr)
        return 1
    # NOTE(review): this uses args.hub, not the hub that
    # process_analysis_args may have extracted from --analysis-url --
    # confirm that is intended.
    url = f'{args.hub}/metrics/{aid}.{fmt}'
    rv = None
    with invoke_codesonar_get(args, outfile, url) as p:
        p.wait()
        rv = p.returncode
    if rv != 0:
        print(f"Failed to retrieve metrics. See '{outfile}' for diagnostics", file=sys.stderr)
    return rv


def invoke_codesonar_get(args, outfile, url, cmd_kwargs=None):
    """Start 'codesonar get' for url and return the subprocess.Popen object.

    args        parsed arguments; the timeout and hub authentication
                options that are set are forwarded to the command.  When
                no timeout was given, args.t is set to '3600'.
    outfile     value passed to the command's -o option (callers in this
                file pass '-' and then read the data from the pipe).
    url         URL to fetch; echoed to stderr when args.verbose is set.
    cmd_kwargs  optional dict of extra keyword arguments for
                subprocess.Popen, e.g. {'encoding': 'utf-8'}.

    The child's stdout is always a pipe.  The caller is responsible for
    waiting on the process, typically by using it as a context manager.
    """
    if getattr(args, 'verbose', False):
        print(url, file=sys.stderr)
    if args.t is None:
        args.t = '3600'
    cmd = [os.path.join(gtr.gthome(), 'codesonar', 'bin', 'codesonar'),
           'get', url, '-o', outfile,
           '-follow-redirect',
           ]
    for x in ('t', 'auth', 'hubcert', 'hubuser', 'hubkey', 'hubpwfile', 'hubbearerfile'):
        value = getattr(args, x)
        if value:
            cmd += ['-' + x, value]
    # A None default avoids sharing one mutable dict between all calls.
    popen_kwargs = {} if cmd_kwargs is None else cmd_kwargs
    return subprocess.Popen(cmd, stdout=subprocess.PIPE, **popen_kwargs)
    

def most_recent_analysis(args):
    """Return the id of the newest finished analysis of args.project_name.

    The name is either the full path in the project tree (if it begins
    with '/'), or it is treated as a suffix of that path.  The first row
    of the hub's search result that matches is used.

    Raises CommandLineException when the hub returns no data, when the
    project cannot be found, or when it has no analysis in state
    "Finished".
    """
    # Unfortunately there is no way to search the project names for an
    # exact match, so we have to search the project paths to be more
    # precise. Using 'project=name' helps, but that is case
    # insensitive. 'ptree_path=xxx' doesn't give me the suffix
    # semantics we want either. Nevertheless, we try to invoke the
    # search with those keywords anyway just to cut down on the number
    # of items we then have to winnow through.

    # First retrieve the project id 'pid' by searching for the project
    # then going through the resulting table to do the precise match.
    pid = None
    is_absolute = args.project_name.startswith('/')
    is_path = '/' in args.project_name
    escaped_name = args.project_name.replace('\\', '\\\\').replace('"', '\\"')
    escaped_name = urllib.parse.quote(escaped_name.encode('utf8'))
    search_kw = "ptree_path=" if is_absolute else "project=" if not is_path else "ptree_path=~~%%"
    gridopts = gtr.urlencode(json.dumps(dict(columns=['path', 'projectId'])))
    queryval = gtr.urlencode(f'{search_kw}"{escaped_name}"')
    url = f"{args.hub}/project_search.csv?query={queryval}&sprjgrid_json={gridopts}"
    with invoke_codesonar_get(args, '-', url, {'encoding': 'utf-8'}) as p:
        reader = csv.reader(p.stdout)
        # No output at all (e.g. the request failed) must not surface as a
        # bare StopIteration.
        hdr = next(reader, None)
        if hdr is None:
            raise CommandLineException(f'Received no data from the hub while searching for project with name "{args.project_name}"')
        path_pos = get_hdr_pos(hdr, 'path')
        pid_pos = get_hdr_pos(hdr, 'project id')
        for line in reader:
            if not line:
                # csv.reader yields an empty list for a blank line.
                continue
            path = line[path_pos]
            # If the project name begins with a slash, insist on a full path
            # match. Otherwise match against the suffix.
            if is_absolute:
                if path == args.project_name:
                    pid = line[pid_pos]
                    break
            elif path.endswith(args.project_name):
                pid = line[pid_pos]
                break
        if pid is None:
            raise CommandLineException(f'Could not find project with name "{args.project_name}"')

    # Now get the analyses for this project.
    gridopts = gtr.urlencode(json.dumps(dict(columns=['analysisId', 'state'])))
    url = f"{args.hub}/project/{pid}.csv?anlgrid_json={gridopts}"
    with invoke_codesonar_get(args, '-', url, {'encoding': 'utf-8'}) as p:
        reader = csv.reader(p.stdout)
        hdr = next(reader, None)
        if hdr is None:
            raise CommandLineException(f'Received no data from the hub while listing analyses for project with name "{args.project_name}"')
        aid_pos = get_hdr_pos(hdr, 'analysis id')
        state_pos = get_hdr_pos(hdr, 'state')
        # MAYBE TODO: This assumes the list is in reverse chronological order. This is the default
        # but if it ever should change, we'll have to issue a query that specifies the sort
        # order.
        for line in reader:
            if not line:
                continue
            if line[state_pos] == "Finished":
                return line[aid_pos]

    raise CommandLineException(f'Could not find a successful analysis for project with name "{args.project_name}"')

def get_hdr_pos(hdr, name):
    """Return the index of column name in the csv header row hdr.

    Raises CommandLineException when the column is not present.
    """
    try:
        return hdr.index(name)
    except ValueError:
        raise CommandLineException(f"Error: cannot find {name} in csv header: {hdr}") from None

class JsonRowCounter(gtr.json.PathfulJsonParser):
    """Parser that counts the array elements completed under the path ['rows']."""

    # Class-level starting value; incrementing through self stores the
    # running count on the instance.
    rowcount = 0

    def array_element_end(self, idx):
        super().array_element_end(idx)
        if self.path != ['rows']:
            return
        self.rowcount += 1

def missing_field_factory():
    """Return the placeholder used for a field that a warning lacks.

    A field can be missing because it does not apply to a particular
    warning; it is then rendered as the empty string.
    """
    return str()

def warning_json_to_code(x):
    """Flatten a JSON line-content node into plain text.

    A string is returned unchanged.  A dict whose 'type' is 'expansion' or
    'msg' contributes nothing; any other dict contributes the concatenated
    text of its 'children'.  Any other value raises TypeError.
    """
    if isinstance(x, str):
        return x
    if not isinstance(x, dict):
        raise TypeError(f'Unexpected type {type(x)}')
    if x.get('type') in ('expansion', 'msg'):
        return ''
    pieces = [warning_json_to_code(child) for child in x.get('children', ())]
    return ''.join(pieces)

class WarningFormatter(JsonRowCounter, gtr.json.DOMBuilderJsonParser):
    """Parser that writes one formatted line per completed row.

    format is a %-style format string applied to a mapping of the row's
    fields; fields a row does not have are rendered as the empty string.
    """

    def __init__(self, outfile, format):
        super().__init__()
        self.format = format
        self.outfile = outfile

    def array_element_end(self, idx):
        super().array_element_end(idx)
        if self.path != ['rows']:
            return
        # dbstack is maintained by DOMBuilderJsonParser.  Removing each
        # finished row from "rows" right away keeps the DOM held in memory
        # from growing with the number of rows.
        row = self.dbstack[-1].pop()
        # Reduce the structured fields to plain strings before formatting.
        if 'lineContent' in row:
            code = warning_json_to_code(row['lineContent'])
            row['lineContent'] = code.rstrip()
        if 'id' in row:
            warning_id = row['id']
            row['id'] = f'{warning_id["groupId"]}.{warning_id["instanceId"]}'
        if 'categories' in row:
            row['categories'] = ';'.join(category['link'] for category in row['categories'])
        if 'notes' in row:
            row['notes'] = ';'.join(note['comment'].replace('\n', ' ') for note in row['notes'])
        fields = collections.defaultdict(missing_field_factory, row)
        self.outfile.write(self.format % fields)

    def object_member_end(self, key):
        super().object_member_end(key)
        # Only a top-level "error" member is reported.
        if self.path or key != 'error':
            return
        print(self.dbstack[-1]['error'], file=sys.stderr)

# This wraps a read pipe so that read() never returns more than the
# requested number of bytes.  Python's built in file objects can
# return more than the requested number of bytes/characters on some
# occasions, which json_stream_parse_all does not cope with.
class RegulatingPipe:
    """Wrapper whose read(limit) never returns more than limit items.

    Whatever instream.read() returns beyond the limit is kept in buffer
    and handed out by later reads before instream is read again.
    """

    def __init__(self, instream):
        self.instream = instream
        self.buffer = None

    def read(self, limit):
        """Return at most limit bytes/characters; empty at end of input."""
        pending = self.buffer
        if not pending:
            pending = self.instream.read(limit)
        if len(pending) > limit:
            # Hand out the head and keep the tail for the next call.
            self.buffer = pending[limit:]
            return pending[:limit]
        # Everything fits: leave an empty buffer of the same type behind.
        self.buffer = type(pending)()
        return pending

# Tee a byte input stream into a byte output stream and otherwise
# act as a pass through.
class Tee(io.RawIOBase):
    """Readable stream that copies everything read from it to outstream.

    All other operations are forwarded to instream; flush() flushes both
    streams.
    """

    def __init__(self, instream, outstream):
        self.instream = instream
        self.outstream = outstream

    def read(self, limit):
        """Read up to limit from instream, echo it to outstream, return it."""
        data = self.instream.read(limit)
        assert len(data) <= limit
        self.outstream.write(data)
        return data

    def close(self):
        """Close instream; outstream is left open."""
        self.instream.close()

    @property
    def closed(self):
        """Whether instream has been closed."""
        return self.instream.closed

    def fileno(self):
        """Return the file descriptor of instream."""
        return self.instream.fileno()

    def flush(self):
        """Flush instream and then outstream."""
        self.instream.flush()
        self.outstream.flush()

    def isatty(self):
        """Report whether instream is a terminal."""
        return self.instream.isatty()

    def readable(self):
        """Report whether instream is readable."""
        return self.instream.readable()

def dump_warnings(args, infile, outfile, outfilepath):
    """Copy the downloaded warnings from infile to outfile and check the limit.

    args         parsed command line arguments (sarif/json/csv, format and
                 fail_if_more_warnings_than are read).
    infile       binary stream carrying the data downloaded from the hub.
    outfile      binary stream that receives the dump.
    outfilepath  path of outfile, or None when writing to standard output.

    The data is only counted when --fail-if-more-warnings-than was given
    (or when the plain --format mode is in use); otherwise it is copied
    through untouched.  Returns 1 when a limit was given and the count
    exceeds it, otherwise 0.
    """
    max_count = args.fail_if_more_warnings_than
    count = 0
    if args.sarif and max_count is None:
        pipe_file(infile, outfile)
    elif args.sarif and outfilepath is not None:
        # The SARIF counter works on a file path, so finish and close the
        # output file first and then count what was written to it.
        pipe_file(infile, outfile)
        outfile.close()
        count = count_sarif_warnings(outfilepath)
    elif args.sarif:
        # Writing to standard output: spool the document to a temporary
        # file, count its results, then copy the file to outfile.
        tmpfd = None
        tmpfile = None
        tmpfilepath = None
        try:
            tmpfd, tmpfilepath = tempfile.mkstemp()
            tmpfile = os.fdopen(tmpfd, 'wb')
            # tmpfile now owns the descriptor; clear tmpfd so the cleanup
            # below does not close it a second time.
            tmpfd = None
            pipe_file(infile, tmpfile)
            tmpfile.close()
            tmpfile = None
            count = count_sarif_warnings(tmpfilepath)
            with open(tmpfilepath, 'rb') as tmpfile2:
                pipe_file(tmpfile2, outfile)
        finally:
            if tmpfile is not None:
                tmpfile.close()
            if tmpfd is not None:
                os.close(tmpfd)
            if tmpfilepath is not None:
                os.remove(tmpfilepath)
    elif args.json and max_count is None:
        pipe_file(infile, outfile)
    elif args.json:
        counter = JsonRowCounter()
        # json_stream_parse_all requires a byte input stream that
        # never returns more than the requested number of bytes, so we
        # must wrap the pipe with Tee and RegulatingPipe.
        gtr.json.json_stream_parse_all(Tee(RegulatingPipe(infile), outfile), counter)
        count = counter.rowcount
    elif args.csv and max_count is None:
        pipe_file(infile, outfile)
    elif args.csv:
        # Tee copies the raw bytes to outfile while the csv reader parses
        # the decoded text to count rows.
        textio = io.TextIOWrapper(Tee(infile, outfile))
        try:
            csv_reader = csv.reader(textio)
            # NOTE(review): every row is counted; if the hub's CSV starts
            # with a header row it is included in the count -- confirm.
            for row in csv_reader:
                count += 1
        finally:
            # We don't own infile, so do not close it
            textio.detach()
    else:
        # Plain --format mode: the JSON rows are rendered through
        # WarningFormatter as text on top of the binary outfile.
        textio = io.TextIOWrapper(outfile)
        try:
            formatter = WarningFormatter(textio, args.format)
            gtr.json.json_stream_parse_all(RegulatingPipe(infile), formatter)
        finally:
            # We don't own outfile, so do not close it
            textio.detach()
        count = formatter.rowcount
    if max_count is not None and count > max_count:
        return 1
    return 0


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv))