# xiosim/config/N.cfg

##############################################################################################
# This is our "best guess" knob file for a Nehalem-class (45nm) Intel Core i7
# processor.  We make no claims as to the accuracy or correctness of these
# settings.  There is no support for modeling SMT cores, LLC cache inclusion,
# or a variety of other microarchitectural features, so use this at your own
# risk.  It is **your** responsibility to understand what you are modeling and
# simulating!
##############################################################################################

# Global settings about the system and the simulation.
system_cfg {
  # Random number generator seed.
  seed = 1
  # Number of cores in the system.
  num_cores = 1
  # Print out simulator heartbeat every x cycles.
  heartbeat_interval = 10000
  # Zesto trace filename prefix.
  ztrace_file_prefix = "ztrace"
  # Simulate power.
  simulate_power = false
  # Uncore cycles between power computations.
  power_rtp_interval = 0
  # Interval between sampling cache misses.
  cache_miss_sample_parameter = 0
  # Runtime power file.
  power_rtp_file = ""
  # Redirect simulator output.
  output_redir = "sim.out"

  # DVFS controller.
  dvfs_cfg {
    config = "none"  # DVFS controller configuration.
    interval = 0     # Re-evaluate voltage/freq choice every X cycles.
  }

  # OS scheduler and core allocator.
  scheduler_cfg {
    # Scheduler refresh in cycles.
    scheduler_tick = 0
    # Core allocation algorithm.
    allocator = "gang:1"
    # Core allocation optimization target.
    allocator_opt_target = "throughput"
    # Core allocation speedup model.
    speedup_model = "linear"
  }

  # Profiling of selected code regions.
  profiling_cfg {
    file_prefix = ""  # File with profiling results.
    start = {}        # Symbol/instruction to start profiling (format is symbol_name(+offset)).
    stop = {}         # Symbol/instruction to stop profiling (if empty, exit points of @profiling_start).
  }

  # Functions and instructions to replace or ignore.
  ignore_cfg {
    funcs = {}  # Names of functions to replace.
    # Individual instructions to ignore. Each entry is either an exact PC in
    # hex or symbol_name(+offset), like the profiling start parameters.
    pcs = {}
  }
}

# Core configuration.
core_cfg {
  # Pipeline model.
  pipeline_model = "DPM"

  # CPU clock frequency.
  # NOTE(review): units are presumably MHz, like the LLC and FSB clock settings
  # in uncore_cfg - confirm.
  core_clock = 3200.0

  # Instruction fetch settings.
  fetch_cfg {
    # Size of instruction queue (macro ops), placed between predecode and
    # decode.
    instruction_queue_size = 18

    # Caches consist of the cache itself, a TLB, a prefetcher, and a coherency
    # controller.
    icache_cfg icache {
      # General cache settings - size, associativity, line size, etc.
      # NOTE(review): if the leading numeric fields are sets:ways:line size
      # (bytes), this is 128 x 4 x 64 B = 32 KB - confirm the field order
      # against the cache config parser.
      config = "IL1:128:4:64:4:64:2:C:8"
      # Cache coherency controller configuration.
      coherency_controller = "none"
      # Enable cache miss sampling.
      sample_misses = false

      iprefetch_cfg inst_pf {
        config = {"nextline"}        # 1st-level icache prefetcher configuration (list of prefetchers)
        on_miss_only = true          # icache prefetch on miss only
        fifosize = 8                 # Prefetch FIFO size (TODO: units?)
        buffer = 0                   # Prefetch buffer size.
        filter = 0                   # Prefetch filter size.
        filter_reset = 65536         # Prefetch filter reset interval (cycles).
        # Prefetch threshold - only prefetch if MSHR occupancy is less than
        # this.
        threshold = 4
        # Maximum instruction prefetch requests in the MSHR.
        max_outstanding_requests = 2
        # Sampling interval (cycles) for prefetch control. 0 = no PF controller.
        watermark_sampling_interval = 100
        # Minimum watermark - always prefetch if lower than this.
        watermark_min = 0.1
        # Maximum watermark - never prefetch if above this.
        watermark_max = 0.3
      }

      itlb_cfg itlb {
        # Instruction TLB configuration.
        config = "ITLB:128:4:1:2:L:5"
        # Coherency controller.
        coherency_controller = "none"
      }
    }

    branch_pred_cfg {
      # Branch predictor configuration(s); one list element per predictor.
      type = {"tage:TAGE5:5:2048:512:9:6:75"}
      # Fusion algorithm for hybrid 2nd-level bpred.
      fusion = "none"
      # Branch target buffer configuration.
      btb = "btac:BTB:512:4:8:l"
      # Indirect branch target buffer configuration.
      ibtb = "2levbtac:iBTB:1:8:1:128:4:8:l"
      # Return address stack predictor configuration.
      ras = "multistack:RAS:8:8"
      # Additional latency from branch-exec to jeclear.
      jump_exec_delay = 1
    }

    byte_queue_cfg {
      # Number of entries.
      size = 3
      # Bytes per line.
      line_size = 16
    }

    predecode_cfg {
      # Number of stages in the predecode pipe.
      depth = 2
      # Width of predecode pipeline (macro-ops).
      width = 6
    }
  }

  decode_cfg {
    # Pipeline depth in stages.
    depth = 2
    # Width of pipeline in macro-ops.
    width = 4
    # Stage of branch address generation ("targetstage").
    branch_agen_stage = 1
    # Maximum branches decoded per cycle.
    branch_decode_limit = 1
    # Maximum uops generated for each decoder (e.g., 4 1 1).  This list has
    # four entries, the same number as the decode width above.
    decoder_max_uops = {4, 1, 1, 1}
    # Latency to access micro-code sequencer.
    ucode_sequencer_latency = 0
    # Number of entries in uop queue.
    uop_queue_size = 24

    # Enable/disable uop fusion rules.
    uop_fusion_cfg {
      # Fuse the load op with the next computation op.
      load_comp_op = true
      # Fuse the load op with the next fp op.
      fpload_comp_op = true
      # Fuse store address generate with the store op.
      sta_std = true
      # Load-store op fusion.
      load_op_store = false
    }
  }

  # Alloc = dispatch
  alloc_cfg {
    # Pipeline depth (stages).
    depth = 1
    # Pipeline width (uops).
    width = 4
    # Use drain-flush after misprediction.
    use_drain_flush = true
  }

  exec_cfg {
    # Maximum issues from RS per cycle (equal to num exec ports).  The exeu
    # port_binding lists in this section use ports 0 through 5.
    width = 6
    # Number of cycles for payload RAM access (schedule to exec delay).
    payload_depth = 2
    # Enable heuristic tornado breaker.
    enable_tornado_breaker = true
    # Enable load issue throttling on partial matches.
    enable_partial_throttle = true
    # Latency to forward results to FP cluster (cycles).
    fp_forward_penalty = 0
    # Memory dependence predictor configuration.
    mem_dep_pred_config = "lwt:LWT:8192:999999"

    # Number of reservation station entries.
    rs_size = 36
    # Number of load queue entries.
    loadq_size = 36
    # Number of store queue entries.
    storeq_size = 24

    dcache_cfg dcache {
      # General cache settings - size, associativity, line size, etc.
      config = "DL1:64:8:64:8:64:2:C:W:B:16:8:C"
      # MSHR configuration.
      mshr_cmd = "RWPB"
      # Cache coherency controller configuration.
      coherency_controller = "none"
      # Enable cache miss sampling.
      sample_misses = false

      dtlb_cfg dtlb {
        # Data TLB configuration.
        config = "DTLB:256:4:1:2:L:8"
        # Coherency controller.
        coherency_controller = "none"
      }

      # NOTE(review): presumably a second-level data TLB, left unconfigured
      # here ("none") - confirm.
      d2tlb_cfg d2tlb {
        config = "none"
        coherency_controller = "none"
      }

      dprefetch_cfg data_pf {
        # 1st-level dcache prefetcher configuration; one list element per
        # prefetcher.
        config = {"IP:256:12:13:6", "nextline"}
        on_miss_only = true          # dcache prefetch on miss only
        fifosize = 8                 # Prefetch FIFO size (TODO: units?)
        buffer = 0                   # Prefetch buffer size.
        filter = 0                   # Prefetch filter size.
        filter_reset = 65536         # Prefetch filter reset interval (cycles).
        threshold = 4                # Prefetch threshold (MSHR occupancy).
        # Maximum data prefetch requests in the MSHR.
        max_outstanding_requests = 2
        # Sampling interval (cycles) for prefetch control. 0 = no PF controller.
        watermark_sampling_interval = 100
        # Minimum watermark - always prefetch if lower than this.
        watermark_min = 0.1
        # Maximum watermark - never prefetch if above this.
        watermark_max = 0.3
      }
    }

    l2cache_cfg L2 {
      # General cache settings - size, associativity, line size, etc.
      config = "DL2:512:8:64:8:64:2:C:W:B:16:8:C"
      # MSHR configuration.
      mshr_cmd = "RPWB"
      # Cache coherency controller configuration.
      coherency_controller = "const:75"
      # Enable cache miss sampling.
      sample_misses = false

      l2prefetch_cfg l2_pf {
        # L2 cache prefetcher configuration; one list element per prefetcher.
        config = {"IP:256:12:13:6", "nextline"}
        on_miss_only = true          # L2 prefetch on miss only
        fifosize = 8                 # Prefetch FIFO size (TODO: units?)
        buffer = 0                   # Prefetch buffer size.
        filter = 0                   # Prefetch filter size.
        filter_reset = 65536         # Prefetch filter reset interval (cycles).
        threshold = 4                # Prefetch threshold (MSHR occupancy).
        # Maximum L2 prefetch requests in the MSHR.
        max_outstanding_requests = 2
        # Sampling interval (cycles) for prefetch control. 0 = no PF controller.
        watermark_sampling_interval = 100
        # Minimum watermark - always prefetch if lower than this.
        watermark_min = 0.1
        # Maximum watermark - never prefetch if above this.
        watermark_max = 0.3
      }
    }

    # RingCache settings.
    repeater_cfg {
      # RingCache configuration (originally in zesto-repeater).
      # NOTE(review): "none" presumably means no repeater is modeled - confirm.
      config = "none"
      # Send request to L1 in parallel with the repeater.
      request_dl1 = false
    }

    # Execution units.  Each exeu block gives the unit's execution latency,
    # its issue rate, and the issue ports it is bound to.
    # NOTE(review): latency and rate are presumably in core cycles, with rate
    # being the number of cycles between successive issues to the unit
    # (e.g., int_div: latency 24, rate 16) - confirm.
    exeu int_alu {
      latency = 1  # Execution latency.
      rate = 1  # Issue rate.
      port_binding = {0, 1, 5}  # Port bindings.
    }

    # Jumps/branches.
    exeu jump {
      latency = 1
      rate = 1
      port_binding = {5}
    }

    # Integer multiply.
    exeu int_mul {
      latency = 3
      rate = 1
      port_binding = {1}
    }

    # Integer divide.
    exeu int_div {
      latency = 24
      rate = 16
      port_binding = {0}
    }

    # Shifts.
    exeu shift {
      latency = 1
      rate = 1
      port_binding = {0, 5}
    }

    # Floating-point ALU.
    exeu fp_alu {
      latency = 3
      rate = 1
      port_binding = {1}
    }

    # Floating-point multiply.
    exeu fp_mul {
      latency = 5
      rate = 2
      port_binding = {0}
    }

    # Floating-point divide.
    exeu fp_div {
      latency = 32
      rate = 32
      port_binding = {0}
    }

    # NOTE(review): presumably complex floating-point operations - confirm
    # which instructions map to this unit.
    exeu fp_cplx {
      latency = 58
      rate = 58
      port_binding = {0}
    }

    # Loads.
    exeu ld {
      latency = 1
      rate = 1
      port_binding = {2}
    }

    # Store address generation.
    exeu st_agen {
      latency = 1
      rate = 1
      port_binding = {3}
    }

    # Store data.
    exeu st_data {
      latency = 1
      rate = 1
      port_binding = {4}
    }

    # LEA = load effective address.
    exeu lea {
      latency = 1
      rate = 1
      port_binding = {1}
    }

    # NOTE(review): purpose of the "magic" unit is not documented in this
    # file - confirm which operations it handles.
    exeu magic {
      latency = 1
      rate = 1
      port_binding = {0}
    }
  }

  # Commit stage.
  commit_cfg {
    rob_size = 128          # Number of ROB entries.
    commit_width = 4        # Maximum uops committed per cycle.
    # NOTE(review): 0 presumably means "no limit" on committed branches rather
    # than "never commit a branch" - confirm against the commit stage.
    commit_branches = 0     # Maximum branches committed per cycle.
  }

}  # End of core cfg.

# Last level cache, FSB, DRAM, etc.
uncore_cfg {
  llccache_cfg llc {
    # General cache settings - size, associativity, line size, etc.
    config = "LLC:8192:16:64:16:64:9:L:W:B:16:1:8:C"
    # Cache coherency controller configuration.
    coherency_controller = "const:75"
    mshr_cmd = "RPWB"            # MSHR configuration.
    clock = 1600                 # Cache clock frequency (MHz).
    sample_misses = false        # Enable cache miss sampling.

    llcprefetch_cfg llc_pf {
      # Last-level cache prefetcher configuration.  Each prefetcher is its own
      # list element, in the same form as the L1 data and L2 prefetcher lists
      # in core_cfg; packing two specs into one string hides the second one
      # from the list.
      config = {"IP:256:12:13:6", "stream:12:4"}
      on_miss_only = false       # LLC prefetch on miss only
      fifosize = 8               # Prefetch FIFO size (TODO: units?)
      buffer = 0                 # Prefetch buffer size.
      filter = 0                 # Prefetch filter size.
      filter_reset = 65536       # Prefetch filter reset interval (cycles).
      # Prefetch threshold - only prefetch if MSHR occupancy is less than
      # this.
      threshold = 4
      # Maximum LLC prefetch requests in the MSHR.
      max_outstanding_requests = 2
      # Sampling interval (cycles) for prefetch control. 0 = no PF controller.
      watermark_sampling_interval = 2000
      # Minimum watermark - always prefetch if lower than this.
      watermark_min = 0.1
      # Maximum watermark - never prefetch if above this.
      watermark_max = 0.4
    }
  }

  fsb_cfg {
    width = 8           # FSB bus width (bytes).
    ddr = true          # FSB double-pumped data.
    clock = 800.0       # FSB bus clock frequency (MHz).
    magic = false       # FSB unlimited bandwidth.
  }

  dram_cfg {
    memory_controller_config = "simple:16:1"
    dram_config = "simplesdram:4:4:35:11.25:11.25:11.25:11.25:64"
    # Based on Samsung K4B510446E-ZCH0
    # 512-Mb, DDR3-1600 9-9-9
    #
    # NOTE(review): the timings listed below do not match the values in
    # dram_config above (35 and 11.25).  9 cycles at 1.25 ns per cycle
    # (DDR3-1600, 800 MHz clock) is 11.25 ns, so this list looks stale -
    # confirm the field order of dram_config and reconcile the two.
    #
    # t_RAS = 45.0ns
    # t_RCD = 15.0ns
    # t_CAS = 15.0ns
    # t_WR  = 15.0ns
    # t_RP  = 15.0ns
  }
}  # End of uncore configs.