# autogen.py

#!/usr/bin/env python3

import argparse
import git
import os
import io
import shutil
import re
import csv
import sys

import diff_function as df

from glob import glob

# Module-wide verbosity switch; get_arguments() turns it on for '--verbose'.
DEBUG = False

##################### DEBUG FUNCTIONS ######################

def PRINT(x):
    """Print 'x' to stdout, but only while the module-level DEBUG flag is set."""
    if not DEBUG:
        return
    print(x)

############################################################

DEFAULT_ARR_SIZE = 8

# Header names emitted as '#include' lines at the top of every test driver.
C_HEADERS = ['<stdio.h>', '<string.h>', '<stdlib.h>', '<stdint.h>']
BOILERPLATE_HEADERS = ['utils.h']
KLEE_HEADERS = ['klee/klee.h']

HEADER_COMMENT = """/* This file was automatically generated by autogen.py */\n\n"""
TD_PREFIX = "autotd"        # test drivers are written to TD_PREFIX + <id> + '.c'
RENAME_SUFFIX = "_renamed"  # appended to the function name for the second call

# TODO: Have to translate into regex, since data type definitions allow for spaces
PRIMITIVE_C_DATATYPES = [
    "char", "signed char", "unsigned char",
    "short", "short int", "signed short", "signed short int",
    "unsigned short", "unsigned short int",
    "int", "signed", "signed int", "unsigned", "unsigned int",
    "long", "long int", "signed long", "signed long int",
    "unsigned long", "unsigned long int",
    "long long", "long long int", "signed long long", "signed long long int",
    "unsigned long long", "unsigned long long int",
    "float", "double", "long double",
]

STDLIB_H_DATATYPES = ["size_t", "wchar_t", "div_t", "ldiv_t"]
# Fixed-width integer types from <stdint.h> ('uint32_t' was previously misspelled).
STDINT_H_DATATYPES = ["int8_t", "int16_t", "int32_t", "int64_t",
                      "uint8_t", "uint16_t", "uint32_t", "uint64_t"]
STDBOOL_H_DATATYPES = ["bool"]
COMMON_DATATYPES = (PRIMITIVE_C_DATATYPES + STDLIB_H_DATATYPES
                    + STDINT_H_DATATYPES + STDBOOL_H_DATATYPES + ["void"])

def create_outputstream():
    """Return a new in-memory text stream that already contains HEADER_COMMENT.

    The header is written (not passed to the constructor) so the stream
    position ends up after it and later writes are appended.
    """
    stream = io.StringIO()
    stream.write(HEADER_COMMENT)
    return stream

def is_valid_outputstream(outputstream):
    """Abort the program unless 'outputstream' is an io.StringIO.

    Subclasses of io.StringIO are accepted as well. On any other type an
    explanatory message is written to stderr and the process exits with
    status 1. Returns None when the stream is valid.
    """
    # isinstance (rather than an exact type comparison) so that StringIO
    # subclasses are not rejected.
    if not isinstance(outputstream, io.StringIO):
        sys.stderr.write("Expected type: io.StringIO\nActual type: {}\n".format(type(outputstream)))
        sys.exit(1)

def read_csv(csv_file):
    """Read a ':'-delimited CSV file and return its rows.

    Each row is returned as a list of strings, in file order.
    """
    with open(csv_file, newline='') as handle:
        return list(csv.reader(handle, delimiter=':'))

def write_with_tabs(outputstream, output_str, tabs=0):
    """Write 'output_str' to 'outputstream', preceded by 'tabs' tab characters.

    The stream is validated first (the program exits if it is not a StringIO).
    """
    is_valid_outputstream(outputstream)
    indent = "\t" * tabs
    if indent:
        outputstream.write(indent)
    outputstream.write(output_str)

def output_newline(outputstream):
    """Append a single newline character to 'outputstream'."""
    print(file=outputstream)

def output_tab(outputstream):
    """Append a single tab character to 'outputstream'."""
    tab_char = "\t"
    outputstream.write(tab_char)

def output_boilerplate_headers(outputstream):
    """Emit one quoted '#include' line per entry of BOILERPLATE_HEADERS."""
    outputstream.writelines(
        "#include \"{}\"\n".format(name) for name in BOILERPLATE_HEADERS
    )

def output_stdc_headers(outputstream):
    """Emit one '#include' line per entry of C_HEADERS.

    The entries already carry their angle brackets, so they are inserted as is.
    """
    outputstream.writelines(
        "#include {}\n".format(name) for name in C_HEADERS
    )

#TODO: Generify to work for any repository, not just upb.
def get_lib_headers(repo_dir):
    """Return the paths of the header files directly under '<repo_dir>/upb'.

    For the 'upb' repository, the public C/C++ API is defined by all header
    (.h) files under "upb/", except those ending with '.int.h', which are
    internal-only (see 'upb' repository README). Only files ending in
    '-inl.h' are filtered out here; '.int.h' files are currently kept.
    """
    upb_dir = os.path.join(repo_dir, 'upb')
    candidates = glob(os.path.join(upb_dir, '*.h'))
    # Alternative filter, disabled while waiting for confirmation that
    # '-inl.h' files should also count as internal ('upb' issue):
    #   path.endswith('.int.h') or path.endswith('-inl.h')
    return [path for path in candidates if not path.endswith('-inl.h')]

def output_lib_headers(headers, outputstream):
    """Emit a quoted '#include' line for every path in 'headers', in order."""
    outputstream.writelines(
        "#include \"{}\"\n".format(path) for path in headers
    )

def get_structs_from_headers(header_files):
    """
    Collect the tags of all structs defined ('struct NAME { ... };') in the
    given header files and return them as one list, in file order.

    Some structs in the upb repo are private: declared and defined within .c files,
    and so are not accessible externally. The public C/C++ API for the upb repo
    is specified by the header files in 'upb/' directory, relative to the upb repo root,
    that do not end in '.int.h' (see upb README).
    NOTE: The paths specified in 'header_files' will be used as is. No path extension
    will be performed. If a file cannot be opened or read, the error is printed
    and that file is skipped.
    """
    struct_pattern = re.compile(r"""
                     \bstruct\s+     # the keyword as a whole word, then whitespace
                     (\w+)\s*        # struct tag (captured)
                     \{.*?\}\s*;     # body up to the NEAREST closing brace + ';'
                     """, re.VERBOSE | re.DOTALL)
    all_structs = []

    for header_file in header_files:
        # Both open() and read() are inside the 'try' so a bad path is reported
        # instead of aborting the whole run.
        try:
            with open(header_file, 'r') as content_file:
                content = content_file.read()
        except OSError as err:
            print("OS Error: {0}".format(err))
            continue

        # The body match is non-greedy: a greedy '.*' under DOTALL would run to
        # the last '};' of the file and hide every struct after the first one.
        found_structs = [m.group(1) for m in struct_pattern.finditer(content)]
        all_structs.extend(found_structs)

        PRINT("Structs declared in header file '{}':".format(header_file))
        for s in found_structs:
            PRINT("  '{}'".format(s))
    return all_structs

class TDContext:
    """Describes one function for which a test driver is generated.

    Every instance receives a fresh, increasing 'id' taken from the class-wide
    counter '_id'; 'dump_to_cfile' uses it to number the output file.
    """
    _id = 0

    # Splits a C parameter declaration into type and name. The name is the LAST
    # identifier; the type is everything before it and must end in whitespace
    # or '*', so multi-word types ("unsigned int x") and qualifiers are kept
    # whole. A trailing array suffix ("buf[8]") is tolerated and ignored.
    _PARAM_PATTERN = re.compile(
        r"^([A-Za-z_].*[\s*])\s*([A-Za-z_]\w*)\s*(?:\[[^\]]*\]\s*)*$",
        re.DOTALL)

    def __init__(self, funcname, functype, params):
        """Store the function name, its return type and its parameter strings."""
        TDContext._id += 1

        self.id       = TDContext._id # Used to number output files (i.e: test drivers)
        self.funcname = funcname
        self.functype = functype
        self.params   = params

    def get_funcname(self):
        """Return the name of the function under test."""
        return self.funcname

    def get_functype(self):
        """Return the return type of the function under test."""
        return self.functype

    def get_params(self):
        """Return the list of raw parameter declaration strings."""
        return self.params

    def to_string(self):
        """Return a multi-line, human-readable dump of this context."""
        return """
          TDContext:
            ID: {}
            Name: {}
            Type: {}
            Params: {}
          """.format(self.id, self.funcname, self.functype, self.params)

    @staticmethod
    def _split_param(param):
        """Split one parameter declaration into a '(type, name)' tuple.

        Both parts are stripped of surrounding whitespace. Returns None when
        'param' does not consist of a type followed by a name (e.g. "void").
        """
        m = TDContext._PARAM_PATTERN.match(param.strip())
        if not m:
            return None
        return m.group(1).strip(), m.group(2)

    def get_vars_from_params(self):
        """Creates list of 'Variable' objects from 'self''s list of 'params'.

        Empty parameters and a lone "void" declare no variable and are skipped,
        so a function without parameters yields an empty list. Returns None
        (after writing a message to stderr) if any other parameter cannot be
        split into a type and a name.
        """
        variables = []
        for param in self.params:
            declaration = param.strip()
            if declaration in ('', 'void'):
                continue
            parts = self._split_param(declaration)
            if parts is None:
                sys.stderr.write("Could not extract variable name from function parameter: {}\n".format(param))
                return None
            var_type, var_name = parts
            # '*' anywhere in the type marks a pointer; this also covers
            # declarations such as "char *const p".
            var_isPtr = '*' in var_type
            var_arrSize = 0 #TODO
            variables.append(Variable(var_name, var_type, var_isPtr, var_arrSize))
        return variables

    def output_funccalls(self, outputstream):
        """Emits function call code to 'outputstream' and also returns the return variables of said function calls.

        Two calls are emitted with identical arguments: one to 'funcname' and
        one to 'funcname' + RENAME_SUFFIX. Their results are stored in the C
        variables 'res1' and 'res2', which are returned as 'Variable' objects.

        Raises ValueError if the parameters cannot be parsed.
        """
        isPtr = self.functype.endswith('*')
        var1  = Variable("res1", self.functype, isPtr, 0) ## TODO: arrSize
        var2  = Variable("res2", self.functype, isPtr, 0) ## TODO: arrSize

        param_vars = self.get_vars_from_params()
        if param_vars is None:
            # Fail with a clear message instead of a TypeError from iterating None.
            raise ValueError("Could not parse the parameters of function '{}'".format(self.funcname))
        arguments = ', '.join(var.get_name() for var in param_vars)

        output_str1 = "{} {} = {}({});\n".format(var1.get_type(), var1.get_name(), self.funcname, arguments)
        output_str2 = "{} {} = {}{}({});\n".format(var2.get_type(), var2.get_name(), self.funcname, RENAME_SUFFIX, arguments)

        outputstream.write(output_str1)
        outputstream.write(output_str2)
        return var1, var2

    def dump_to_cfile(self, outputstream):
        """Write the whole content of 'outputstream' to TD_PREFIX + '<id>.c'.

        The file is created in the current working directory and overwritten
        if it already exists.
        """
        filename = TD_PREFIX + '{}.c'.format(self.id)
        with open(filename, 'w') as f:
            # Rewind first: the stream position is at the end after writing.
            outputstream.seek(0)
            shutil.copyfileobj(outputstream, f)

class Variable:
    """A C variable that is declared in a generated test driver."""

    # Pointer variables for which 'output_mk_var' emitted a malloc call; shared
    # by all instances and emptied again by 'output_free_vars'.
    vars_to_free = []

    def __init__(self, name, _type, isPtr=False, arrSize=0):
        """Store the C name, the C type string, the pointer flag and the array size."""
        self.name    = name
        self.type    = _type
        self.isPtr   = isPtr
        self.arrSize = arrSize

    def to_string(self):
        """Return a multi-line, human-readable dump of this variable."""
        return """
          Variable:
            Name: {}
            Type: {}
            IsPtr: {}
            Array Size: {}
              """.format(self.name, self.type, self.isPtr, self.arrSize)

    def get_name(self):
        """Return the C identifier of the variable."""
        return self.name

    def get_id(self):
        # NOTE(review): no 'id' attribute is assigned anywhere in this class, so
        # this raises AttributeError unless a caller sets one - confirm intent.
        return self.id

    def get_type(self):
        """Return the C type string (including any '*' for pointers)."""
        return self.type

    def is_ptr(self):
        """Return True if the variable was created as a pointer."""
        return self.isPtr

    def get_arr_size(self):
        """Return the array size given at construction (0 by default)."""
        return self.arrSize

    def output_mk_var(self, outputstream):
        """Emit C code that declares this variable and makes it symbolic.

        Pointers are allocated with malloc(sizeof(*name)), recorded in
        'vars_to_free' and followed by a check that calls 'malloc_fail(-1)'
        when the allocation failed. Non-pointers are declared plainly. In both
        cases 'Klee.output_symbolic' then emits the klee_make_symbolic call.
        """
        is_valid_outputstream(outputstream)

        if self.isPtr:
            # self.type should already contain '*'s if var is pointer
            output_str = "{} {} = malloc(sizeof(*{}));\n".format(self.type, self.name, self.name)
            outputstream.write(output_str)
            Variable.vars_to_free.append(self)
            output_str = "if (!{})\n\tmalloc_fail(-1);\n".format(self.name)
            outputstream.write(output_str)
        else:
            output_str = "{} {};\n".format(self.type, self.name)
            outputstream.write(output_str)
        Klee.output_symbolic(self, outputstream)
        return

    @staticmethod
    def output_free_vars(outputstream):
        """Emit a 'free(...)' call for every variable in 'vars_to_free'.

        The calls are emitted from the last recorded variable to the first.
        'vars_to_free' is emptied in the process, so variables recorded for one
        test driver are not released again in the next one.
        """
        is_valid_outputstream(outputstream)
        while Variable.vars_to_free:
            var = Variable.vars_to_free.pop()
            outputstream.write("free({});\n".format(var.get_name()))
        return

class Klee:
    """Namespace of helpers that emit KLEE-specific C code.

    Every helper validates 'outputstream' first; the program exits if it is
    not an io.StringIO.
    """

    @staticmethod
    def output_headers(outputstream):
        """Emit one quoted '#include' line per entry of KLEE_HEADERS."""
        is_valid_outputstream(outputstream)
        for header in KLEE_HEADERS:
            output_str = "#include \"{}\"\n".format(header)
            outputstream.write(output_str)

    @staticmethod
    def output_init(outputstream):
        """Emit the opening of the test driver's 'main' function."""
        is_valid_outputstream(outputstream)
        output_str = "int main()\n{\n"
        outputstream.write(output_str)

    @staticmethod
    def output_footer(outputstream):
        """Emit 'return 0;' and the closing brace of 'main'."""
        is_valid_outputstream(outputstream)
        output_str = "return 0;\n}"
        outputstream.write(output_str)
        return

    @staticmethod
    def output_symbolic(var, outputstream):
        """Emit the klee_make_symbolic call for 'var'.

        For a pointer the pointed-to object is made symbolic; its size is
        'sizeof(*name)', the same expression 'Variable.output_mk_var' passes to
        malloc. For a non-pointer the variable itself is made symbolic.
        """
        is_valid_outputstream(outputstream)
        if var.is_ptr():
            # Always a single dereference: the size has to describe the
            # allocated object, also for multi-level pointers such as 'char **'.
            output_str = "klee_make_symbolic({}, sizeof(*{}), \"{}\");\n".format(var.get_name(), var.get_name(), var.get_name())
            outputstream.write(output_str)
        else:
            output_str = "klee_make_symbolic(&{}, sizeof({}), \"{}\");\n".format(var.get_name(), var.get_name(), var.get_name())
            outputstream.write(output_str)

    @staticmethod
    def output_assume(var1, var2, outputstream):
        """Emit klee_assume code constraining 'var1' and 'var2' to be equal.

        If 'var1' is a pointer, an element-wise loop over 'var1.get_arr_size()'
        elements is emitted; otherwise a single comparison of the two values.
        """
        is_valid_outputstream(outputstream)
        if var1.is_ptr():
            _str1 = "int i;\n"
            # NOTE(review): arrSize is still 0 for every variable created in
            # this file (see the TODOs), which makes this loop a no-op.
            _str2 = "for (i=0; i < {}; i++)\n".format(var1.get_arr_size())
            _str3 = "{\n"
            _str4 = "klee_assume({}[i] == {}[i]);\n".format(var1.get_name(), var2.get_name())
            _str5 = "}\n\n"
            write_with_tabs(outputstream, _str1, 1)
            write_with_tabs(outputstream, _str2, 1)
            write_with_tabs(outputstream, _str3, 1)
            write_with_tabs(outputstream, _str4, 2)
            write_with_tabs(outputstream, _str5, 1)
        else:
            _str1 = "klee_assume({} == {});\n\n".format(var1.get_name(), var2.get_name())
            write_with_tabs(outputstream, _str1, 1)

    @staticmethod
    def output_assert(var1, var2, outputstream):
        """Emit a klee_assert that 'var1' and 'var2' compare equal with '=='."""
        is_valid_outputstream(outputstream)
        # NOTE(review): for pointer-typed results this compares the addresses,
        # not the pointed-to data - confirm this is intended.
        output_str = "klee_assert({} == {});\n".format(var1.get_name(), var2.get_name())
        outputstream.write(output_str)

def get_arguments():
    """Parse the command line, validate it and return the argparse namespace.

    '--verbose' switches the module-level DEBUG flag on. The repository path
    is checked with 'is_valid_repo', which exits the program if it is invalid.
    """
    global DEBUG

    sort_keys = ['lines-changed', 'lines-added',
                 'lines-removed', 'functions-changed', 'functions-added', 'functions-removed']
    sortby_help = """
                        Optional comparator to sort data from CSV file. Use with the
                        '--depth N' so that test drivers are generated for the top N
                        inputs as specified by the comparator.
                        """

    cli = argparse.ArgumentParser()
    cli.add_argument("-v", "--verbose", action="store_true", required=False, help="Verbose output")
    cli.add_argument("-r", "--repository", type=str, required=True, help="Path to git repository")
    cli.add_argument("--commits", type=str, nargs=2, required=True,
                     help="Commits (exactly 2) which are used to generate test drivers")
    cli.add_argument("--sortby", choices=sort_keys, default='functions-changed',
                     required=False, help=sortby_help)
    cli.add_argument("-d", "--depth", type=int, default=10, required=False,
                     help="Number of test drivers to generate (default 10)")
    parsed = cli.parse_args()

    if parsed.verbose:
        DEBUG = True

    # Validate arguments
    is_valid_repo(parsed.repository)
    return parsed

def is_valid_csv(input_file):
    """Exit with status 1 unless the name 'input_file' ends in '.csv'.

    Only the file name is inspected; the file itself is not opened.
    """
    if input_file.endswith('.csv'):
        return
    sys.stderr.write("The file {} is not a valid CSV file\n".format(input_file))
    sys.exit(1)

def is_valid_repo(path):
    """Exit with status 1 unless 'path' contains a '.git' entry."""
    git_entry = os.path.join(path, '.git')
    if os.path.exists(git_entry):
        return
    sys.stderr.write("The path '{}' does not point to a valid Git Repository\n".format(path))
    sys.exit(1)

def transpose(xs):
    """Transposes a list of lists.

    The number of result rows is the length of the first input row. An empty
    input yields an empty list. Raises IndexError if a later row is shorter
    than the first one.
    """
    if not xs:
        # Nothing to transpose; avoids indexing into an empty list.
        return []
    width = len(xs[0])
    return [[row[i] for row in xs] for i in range(width)]

def candidate_funcs(funcs):
    """
    Return the entries of 'funcs' for which test drivers can be generated.

    'funcs' is a list of 2-tuples: the first element is matched against the
    exclusion pattern and the second holds the parameters. Entries whose first
    element starts (after optional whitespace) with 'static' or 'void' are
    dropped; all others are kept in their original order.
    """
    PRINT("Candidate Functions:")
    for entry in funcs:
        PRINT("  Function: {}".format(entry))
    PRINT("\n")

    excluded = re.compile(r"(\s*static\s*|\s*void\s*).*")
    kept = []
    for name, params in funcs:
        if excluded.match(name):
            continue
        kept.append((name, params))
    return kept

def parse_funcname(fdecl):
    """Return the function name contained in 'fdecl'.

    'fdecl' refers to the function declaration without parameters, e.g.
    "static const char *foo". The name is the last identifier; everything
    before it is the return type, which may consist of several words
    ("unsigned int", "struct foo *", macro qualifiers, ...).

    Returns None (after writing a message to stderr) if 'fdecl' does not
    consist of a type followed by a name.
    """
    # The type must start with an identifier character and end in whitespace
    # or '*', so the name can never be cut out of the middle of a word.
    pattern = r"^\s*([A-Za-z_].*[\s*])\s*([A-Za-z_]\w*)\s*$"
    m = re.match(pattern, fdecl, re.DOTALL)
    if not m:
        sys.stderr.write("Could not extract name from function {}\n".format(fdecl))
        return None
    return m.group(2)

def parse_functype(fdecl):
    """Return the return type contained in 'fdecl'.

    'fdecl' refers to the function declaration without parameters, e.g.
    "static const char *foo". The type is everything before the last
    identifier (the function name), stripped of surrounding whitespace; it may
    consist of several words and includes any qualifiers and '*'.

    Returns None (after writing a message to stderr) if 'fdecl' does not
    consist of a type followed by a name.
    """
    # The type must start with an identifier character and end in whitespace
    # or '*', so multi-word types such as "unsigned int" are kept whole.
    pattern = r"^\s*([A-Za-z_].*[\s*])\s*([A-Za-z_]\w*)\s*$"
    m = re.match(pattern, fdecl, re.DOTALL)
    if not m:
        sys.stderr.write("Could not extract type from function {}\n".format(fdecl))
        return None
    return m.group(1).strip()

def main():
    """Generate one KLEE test driver file per supported modified function.

    The functions are taken from the change record that 'df.changes' returns
    for the two commits given on the command line. A function is skipped when
    its declaration or one of its types cannot be parsed, or when no variables
    can be derived from its parameters.
    """
    arguments = get_arguments()
    #TODO: Get list of records of form: (funcname, functype, list of params (incl. types))
    # The records retrieved should be indexed by pairs of revisions: (rev1, rev2)
    # So we need to extract this info from the verbose CSV file, not the one given as input.

    res, verbose_res = df.changes(arguments.repository, arguments.commits)

    # Explicit check instead of 'assert', which is stripped when running with -O.
    if len(res) != 1 or len(verbose_res) != 1:
        sys.stderr.write("ERROR: Expected exactly one change record for the commit pair, got {} and {}\n".format(len(res), len(verbose_res)))
        sys.exit(1)

    lib_headers = get_lib_headers(arguments.repository)
    # Only consumed by the disabled type check further down.
    all_structs = get_structs_from_headers(lib_headers)

    record = verbose_res[0]
    print("Generating Test Drivers for revision pair: ({}, {})\n".format(record[0], record[1]))
    # NOTE(review): index 7 presumably holds the modified functions as
    # (declaration, parameter string) pairs - confirm against df.changes.
    f_modified = record[7]
    funcs = candidate_funcs(f_modified)

    # Compiled once; both are applied to every type in the loop below.
    cv_pattern     = re.compile(r"\s*(const|volatile)\s*(\**)")
    struct_pattern = re.compile(r"\s*struct\s*(\**)\s*")

    for f in funcs:
        unsuccessful = False # Important to set this before anything else
        print("Generating Test Driver for function '{}'...".format(f))
        fname  = parse_funcname(f[0])
        ftype  = parse_functype(f[0])
        if fname is None or ftype is None:
            # The parse helpers already reported the problem on stderr.
            continue
        params = re.split(r"\s*,\s*", f[1])

        for typ in params + [ftype]:
            m = cv_pattern.sub('', typ)
            m = struct_pattern.sub('', m)
            try:
                typ_begin = m.split()[0]
                typ_begin = typ_begin.rstrip('* ') # Remove pointer asterisks and whitespace
                typ_begin = typ_begin.lstrip() # Just to be sure
                #if typ_begin not in COMMON_DATATYPES and typ_begin not in all_structs:
                #    print("FAILURE: Unsuccessful test driver generation for function '{}':".format(f))
                #    PRINT("  Function Name: '{}'".format(fname))
                #    PRINT("  Function Return Type: '{}'".format(ftype))
                #    if typ_begin not in COMMON_DATATYPES:
                #        PRINT("  Type (no pointer asterisk) '{}' not in COMMON_DATATYPES".format(typ_begin))
                #    if typ_begin not in all_structs:
                #        PRINT("  Type (no pointer asterisk) '{}' not defined in public repository API".format(typ_begin))
                #    print("  Function '{}' requires type (no pointer asterisk) '{}', which is outside public C/C++ interface of upb repository\n".format(fname, typ_begin))
                #    unsuccessful = True
                #    break
            except IndexError as err:
                print("FAILURE: Unsuccessful test driver generation for function '{}':".format(f))
                print("  Regex Object: '{}'".format(str(m)))
                print("  Function Name: '{}'".format(fname))
                print("  Failed on Type: '{}'".format(typ))
                print('  Index Error: {}\n'.format(err))
                # A reported failure must also stop generation for this function.
                unsuccessful = True
                break
        if unsuccessful:
            continue

        outputstream = create_outputstream()

        output_stdc_headers(outputstream)
        output_lib_headers(lib_headers, outputstream)
        output_boilerplate_headers(outputstream)
        Klee.output_headers(outputstream)

        ctx = TDContext(fname, ftype, params)
        PRINT(ctx.to_string())

        variables = ctx.get_vars_from_params()

        # None (unparsable parameter) or an empty list: nothing to generate.
        if not variables:
            continue

        Klee.output_init(outputstream)

        for var in variables:
            PRINT(var.to_string())
            var.output_mk_var(outputstream)
            output_newline(outputstream)

        res1, res2 = ctx.output_funccalls(outputstream)
        Klee.output_assert(res1, res2, outputstream)

        Klee.output_footer(outputstream)

        ctx.dump_to_cfile(outputstream)
        print("Test Driver for function '{}' generated successfully\n".format(f))


if __name__ == "__main__":
    main()