diff --git a/LICENSE b/LICENSE index 22fbe5db..d159169d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,7 @@ -GNU GENERAL PUBLIC LICENSE + GNU GENERAL PUBLIC LICENSE Version 2, June 1991 - Copyright (C) 1989, 1991 Free Software Foundation, Inc., + Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. @@ -290,8 +290,8 @@ to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. - {description} - Copyright (C) {year} {fullname} + + Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -329,11 +329,11 @@ necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. - {signature of Ty Coon}, 1 April 1989 + , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. \ No newline at end of file +Public License instead of this License. diff --git a/README.md b/README.md index 542aef9e..73979d4a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,206 @@ zsim ==== -A fast and scalable x86-64 multicore simulator +zsim is a fast x86-64 simulator. It was originally written to evaluate ZCache +(Sanchez and Kozyrakis, MICRO-44, Dec 2010), hence the name, but it has since +outgrown its purpose. +zsim's main goals are to be fast, simple, and accurate, with a focus on +simulating memory hierarchies and large, heterogeneous systems. It is parallel +and uses DBT extensively, resulting in speeds of hundreds of millions of +instructions/second in a modern multicore host. Unlike conventional simulators, +zsim is organized to scale well (almost linearly) with simulated core count. + +You can find more details about zsim in our ISCA 2013 paper: +http://people.csail.mit.edu/sanchez/papers/2013.zsim.isca.pdf. + + +License & Copyright +------------------- + +zsim is free software; you can redistribute it and/or modify it under the terms +of the GNU General Public License as published by the Free Software Foundation, +version 2. + +zsim was originally written by Daniel Sanchez at Stanford University, and per +Stanford University policy, the copyright of this original code remains with +Stanford (specifically, the Board of Trustees of Leland Stanford Junior +University). Since then, zsim has been substantially modified and enhanced at +MIT by Daniel Sanchez, Nathan Beckmann, and Harshad Kasture. zsim also +incorporates contributions on main memory performance models from Krishna +Malladi, Makoto Takami, and Kenta Yasufuku. + +zsim was also modified and enhanced while Daniel Sanchez was an intern at +Google. Google graciously agreed to share these modifications under a GPLv2 +license. This code is (C) 2011 Google Inc. Files containing code developed at +Google have a different license header with the correct copyright attribution. 
+ +Additionally, if you use this software in your research, we request that you +reference the zsim paper ("ZSim: Fast and Accurate Microarchitectural +Simulation of Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June +2013) as the source of the simulator in any publications that use this +software, and that you send us a citation of your work. + + +Setup +----- + +External dependencies: `gcc >=4.6, pin, scons, libconfig, libhdf5` + +1. Clone a fresh copy of the git zsim repository (`git clone `). + +2. Download Pin, http://www.pintool.org . Tested with Pin 2.8+ on an x86-64 + architecture. Compiler flags are set up for Pin 2.9 on x86-64. To get flags + for other versions, examine the Pin makefile or derive from sample pintools. + Set the PINPATH environment variable to Pin's base directory. + + NOTE: Linux 3.0+ systems require Pin 2.10+, just because Pin does a kernel + version check that 3.0 fails. + + NOTE 2: Use Pin 2.12 with Sandy/Ivy Bridge systems, earlier Pin versions + have strange performance regressions on this machine (extremely low IPC). + +3. zsim requires some additional libraries. If they are not installed in your + system, you will need to download and build them: + + 3.1 libconfig, http://www.hyperrealm.com/libconfig . To install locally, + untar, run `./configure --prefix= && make install`. + Then define the env var `LIBCONFIGPATH=`. + + 3.2 libhdf5, http://www.hdfgroup.org (v1.8.4 path 1 or higher). The + SConstruct file assumes it is installed in the system. + + 3.3 (OPTIONAL) polarssl (currently used just for their SHA-1 hash function), + http://www.polarssl.org Install locally as in 3.1 and define the env var + `POLARSSLPATH=` + + NOTE: You may need to add `-fPIC` to the Makefile's C(PP/XX)FLAGS depending + on the version. + + 3.4 (OPTIONAL) DRAMSim2 for main memory simulation. Build locally and define + the env var DRAMSIMPATH as in 3.1 and 3.3. + +4. Compile zsim: `scons -j16` + +5. Launch a test run: `./build/opt/zsim tests/simple.cfg` + +For more compilation options, run scons --help. You can build debug, optimized +and release variants of the simulator (--d, --o, --r options). Optimized (opt) +is the default. You can build profile-guided optimized (PGO) versions of the +code with --p. These improve simulation performance with OOO cores by about +30%. + +NOTE: zsim uses C++11 features available in `gcc >=4.6` (such as range-based for +loops, strictly typed enums, lambdas, and type inference). Older version of gcc +will not work. zsim can also be built with `icc` (see the `SConstruct` file). + + +Notes +----- + +**Accuracy:** While we have validated zsim against a real system, you should be +aware that we sometimes sacrifice some accuracy for speed and simplicity. The +ISCA 2013 paper details the possible sources of inaccuracy. Despite our +validation efforts, if you are using zsim with workloads or architectures that +are significantly different from ours, you should not blindly trust these +results. Also, zsim can be configured with varying degrees of accuracy, which +may be OK in some cases but not others (e.g., longer bound phases to reduce +overheads are often OK if your application has little communication, but not +with fine-grained parallelism and synchronization). Finally, in some cases, you +will need to modify the code, and for some purposes, zsim is just not the right +tool. In any case, we strongly recommend validating your baseline configuration +and workloads against a real machine. 
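+
+**Setup quick reference:** The setup steps above boil down to a short shell
+sequence. The following is only a sketch: every path is a placeholder for
+wherever you installed each dependency, and the optional variables are only
+needed if you built those libraries locally (see steps 3.3 and 3.4).
+
+    export PINPATH=/path/to/pin              # step 2
+    export LIBCONFIGPATH=/path/to/libconfig  # step 3.1, if not system-installed
+    #export POLARSSLPATH=/path/to/polarssl   # step 3.3, optional
+    #export DRAMSIMPATH=/path/to/DRAMSim2    # step 3.4, optional
+    scons -j16                               # step 4, builds build/opt by default
+    ./build/opt/zsim tests/simple.cfg        # step 5, quick sanity run
+
+Use `scons --d`, `--r`, or `--p` to build the debug, release, or PGO variants
+described above.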
+ +**Memory Management:** zsim can simulate multiple processes, which introduces some +complexities in memory management. Each Pin process uses SysV IPC shared +memory to communicate through a global heap. Be aware that Pin processes have a +global and a process-local heap, and all simulator objects should be allocated +in the global heap. A global heap allocator is implemented (galloc.c and g\_heap +folder) using Doug Lea's malloc. The global heap allocator functions are as the +usual ones, with the gm\_ prefix (e.g. gm\_malloc, gm\_calloc, gm\_free). Objects +can be allocated in the global heap automatically by making them inherit from +GlobAlloc, which redefines the new and delete operators. STL classes use their +own internal allocators, so they cannot be members of globally visible objects. +To ease this, the g\_stl folder has template specializations of commonly used +STL classes that are changed to use our own STL-compliant allocator that +allocates from the global heap. Use these classes as drop-in replacements when +you need a globally visible STL class, e.g. substitute std::vector with +g\_vector, etc. + +**Harness:** While most of zsim is implemented as a pintool (`libzsim.so`), a harness +process (`zsim`) is used to control the simulation: set up the shared memory +segment, launch pin processes, check for deadlock, and ensure termination of +the whole process tree when it is killed. In prior revisions of the simulator, +you could launch the pintool directly, but now you should use the harness. + +**Transparency & I/O:** To maintain transparency w.r.t instrumented +applications, zsim does all logging through info/warn/panic methods. With the +sim.logToFile option, these dump to per-process log files instead of the +console. *You should never use cout/cerr or printf in simulator code* --- +simple applications will work, but more complex setups, e.g., anything that +uses pipes, will break. + +**Interfacing with applications:** You can use special instruction sequences to +control the simulation from the application (e.g., fast-forward to the region +you want to simulate). `misc/hooks` has wrappers for C/C++, Fortran, and Java, +and extending this to other languages should be easy. + +**Host Configuration:** The system configuration may need some tweaks to support +zsim. First, it needs to allow for large shared memory segments. Second, for +Pin to work, it must allow a process to attach to any other from the user, not +just to a child. Use sysctl to ensure that `kernel.shmmax=1073741824` (or larger) +and `kernel.yama.ptrace_scope=0`. zsim has mainly been used in +Ubuntu 11.10, 12.04, 12.10, 13.04, and 13.10, but it should work in other Linux +distributions. Using it in OSs other than Linux (e.g,, OS X, Windows) will be +non-trivial, since the user-level virtualization subsystem has deep ties into +the Linux syscall interface. + +**Stats:** The simulator outputs periodic, eventual and end-of-sim stats files. +Stats can be output in both HDF5 and plain text. Read the README.stats file +and the associated scripts repository to see how to use these stats. + +**Configuration & Getting Started:** A detailed use guide is out of the scope of +this README, because the simulator options change fairly often. In general, +*the documentation is the source code*. You should be willing to occasionally +read the source code to see how different zsim features work. To get familiar +with the way to configure the simulator, the following three steps usually work +well when getting started: + +1. 
Check the examples in the `tests/` folder, play around with the settings, and + launch a few runs. Config files have three sections, sys (configures the + simulated system, e.g., core and cache parameters), sim (configures simulation + parameters, e.g., how frequent are periodic stats output, phase length, etc.), + and process{0, 1, 2, ...} entries (what processes to run). + +2. Most parameters have implicit defaults. zsim produces an out.cfg file that + includes all the default choices (and we recommend that your analysis scripts + automatically parse this file to check that what you are simulating makes + sense). Inspecting the out.cfg file reveals more configuration options to play + with, as well as their defaults. + +3. Finally, check the source code for more info on options. The whole system is + configured in the init.cpp (sys and sim sections) and process\_tree.cpp + (processX sections) files, so there is no need to grok the whole simulator + source to find out all the configuration options. + +**Hacking & Style Guidelines:** zsim is mostly consistent with Google's C++ style +guide. You can use cpplint.py to check rule violations. We depart from these +guidelines on a couple of aspects: + +- 4-space indentation instead of 2 spaces + +- 120-character lines instead of 80-char (though you'll see a clear disregard + for strict line length limits every now and then) + +You can use cpplint.py (included in misc/ with slight modifications) to check +your changes. misc/ also has a script to tidy includes, which should be in +alphabetical order within each type (own, system, and project headers). + +vim will indent the code just fine with the following options: +`set cindent shiftwidth=4 expandtab smarttab` + +Finally, as Google's style guidelines say, please be consistent with the +current style used elsewhere. For example, the parts of code that deal with Pin +follow a style consistent with pintools. + +Happy hacking, and hope you find zsim useful! + diff --git a/README.stats b/README.stats new file mode 100644 index 00000000..01f28281 --- /dev/null +++ b/README.stats @@ -0,0 +1,71 @@ +#!/usr/bin/python +# zsim stats README +# Author: Daniel Sanchez +# Date: May 3 2011 +# +# Stats are now saved in HDF5, and you should never need to write a stats +# parser. This README explains how to access them in python using h5py. It +# doubles as a python script, so you can just execute it with "python +# README.stats" and see how everything works (after you have generated a stats +# file). +# + +import h5py # presents HDF5 files as numpy arrays +import numpy as np + +# Open stats file +f = h5py.File('zsim-ev.h5', 'r') + +# Get the single dataset in the file +dset = f["stats"]["root"] + +# Each dataset is first indexed by record. A record is a snapshot of all the +# stats taken at a specific time. All stats files have at least two records, +# at beginning (dest[0])and end of simulation (dset[-1]). Inside each record, +# the format follows the structure of the simulated objects. A few examples: + +# Phase count at end of simulation +endPhase = dset[-1]['phase'] +print endPhase + +# If your L2 has a single bank, this is all the L2 hits. 
Otherwise it's the +# hits of the first L2 bank +l2_0_hits = dset[-1]['l2'][0]['hGETS'] + dset[-1]['l2'][0]['hGETX'] +print l2_0_hits + +# Hits into all L2s +l2_hits = np.sum(dset[-1]['l2']['hGETS'] + dset[-1]['l2']['hGETX']) +print l2_hits + +# Total number of instructions executed, counted by adding per-core counts +# (you could also look at procInstrs) +totalInstrs = np.sum(dset[-1]['simpleCore']['instrs']) +print totalInstrs + +# You can also focus on one sample, or index over multiple steps, e.g., +lastSample = dset[-1] +allHitsS = lastSample['l2']['hGETS'] +firstL2HitsS = allHitsS[0] +print firstL2HitsS + +# There is a certain slack in the positions of numeric and non-numeric indices, +# so the following are equivalent: +print dset[-1]['l2'][0]['hGETS'] +#print dset[-1][0]['l2']['hGETS'] # can't do +print dset[-1]['l2']['hGETS'][0] +print dset['l2']['hGETS'][-1,0] +print dset['l2'][-1,0]['hGETS'] +print dset['l2']['hGETS'][-1,0] + +# However, you can't do things like dset[-1][0]['l2']['hGETS'], because the [0] +# indexes a specific element in array 'l2'. The rule of thumb seems to be that +# numeric indices can "flow up", i.e., you can index them later than you should. +# This introduces no ambiguities. + +# Slicing works as in numpy, e.g., +print dset['l2']['hGETS'] # a 2D array with samples*per-cache data +print dset['l2']['hGETS'][-1] # a 1D array with per-cache numbers, for the last sample +print dset['l2']['hGETS'][:,0] # 1D array with all samples, for the first L2 cache + +# OK, now go bananas! + diff --git a/SConstruct b/SConstruct new file mode 100644 index 00000000..f4afd209 --- /dev/null +++ b/SConstruct @@ -0,0 +1,198 @@ +import os, sys +from os.path import join as joinpath + +useIcc = False +#useIcc = True + +def buildSim(cppFlags, dir, type, pgo=None): + ''' Build the simulator with a specific base buid dir and config type''' + + buildDir = joinpath(dir, type) + print "Building " + type + " zsim at " + buildDir + + env = Environment(ENV = os.environ) + env["CPPFLAGS"] = cppFlags + + allSrcs = [f for dir, subdirs, files in os.walk("src") for f in Glob(dir + "/*")] + versionFile = joinpath(buildDir, "version.h") + if os.path.exists(".git"): + env.Command(versionFile, allSrcs + [".git/index", "SConstruct"], + 'echo "#define ZSIM_BUILDDATE \\""`date`\\""\\\\n#define ZSIM_BUILDVERSION \\""`python misc/gitver.py`\\""" >>' + versionFile) + else: + env.Command(versionFile, allSrcs + ["SConstruct"], + 'echo "#define ZSIM_BUILDDATE \\""`date`\\""\\\\n#define ZSIM_BUILDVERSION \\""no git repo\\""" >>' + versionFile) + + # Parallel builds? + #env.SetOption('num_jobs', 32) + + # Use link-time optimization? It's still a bit buggy, so be careful + #env['CXX'] = 'g++ -flto -flto-report -fuse-linker-plugin' + #env['CC'] = 'gcc -flto' + #env["LINKFLAGS"] = " -O3 -finline " + if useIcc: + env['CC'] = 'icc' + env['CXX'] = 'icpc -ipo' + + # Required paths + if "PINPATH" in os.environ: + PINPATH = os.environ["PINPATH"] + else: + print "ERROR: You need to define the $PINPATH environment variable with Pin's path" + sys.exit(1) + + ROOT = Dir('.').abspath + + # NOTE: These flags are for the 28/02/2011 2.9 PIN kit (rev39599). Older versions will not build. 
+ # NOTE (dsm 10 Jan 2013): Tested with Pin 2.10 thru 2.12 as well + # NOTE: Original Pin flags included -fno-strict-aliasing, but zsim does not do type punning + env["CPPFLAGS"] += " -g -std=c++0x -Wall -Wno-unknown-pragmas -fomit-frame-pointer -fno-stack-protector -MMD -DBIGARRAY_MULTIPLIER=1 -DUSING_XED -DTARGET_IA32E -DHOST_IA32E -fPIC -DTARGET_LINUX" + env["CPPPATH"] = [joinpath(PINPATH , "extras/xed2-intel64/include"), + joinpath(PINPATH , "source/include"), joinpath(PINPATH , "source/include/gen"), + joinpath(PINPATH , "extras/components/include")] + + # Perform trace logging? + ##env["CPPFLAGS"] += " -D_LOG_TRACE_=1" + + # Uncomment to get logging messages to stderr + ##env["CPPFLAGS"] += " -DDEBUG=1" + + # Be a Warning Nazi? (recommended) + env["CPPFLAGS"] += " -Werror " + + # Enables lib and harness to use the same info/log code, + # but only lib uses pin locks for thread safety + env["PINCPPFLAGS"] = " -DMT_SAFE_LOG " + + # PIN-specific libraries + env["PINLINKFLAGS"] = " -Wl,--hash-style=sysv -Wl,-Bsymbolic -Wl,--version-script=" + joinpath(PINPATH , "source/include/pintool.ver") + + # To prime system libs, we include /usr/lib and /usr/lib/x86_64-linux-gnu + # first in lib path. In particular, this solves the issue that, in some + # systems, Pin's libelf takes precedence over the system's, but it does not + # include symbols that we need or it's a different variant (we need + # libelfg0-dev in Ubuntu systems) + env["PINLIBPATH"] = ["/usr/lib", "/usr/lib/x86_64-linux-gnu", joinpath(PINPATH , "extras/xed2-intel64/lib"), + joinpath(PINPATH, "intel64/lib"), joinpath(PINPATH, "intel64/lib-ext")] + + # Libdwarf is provided in static and shared variants, Ubuntu only provides + # static, and I don't want to add -R because + # there are some other old libraries provided there (e.g., libelf) and I + # want to use the system libs as much as possible. So link directly to the + # static version of libdwarf. + env["PINLIBS"] = ["pin", "xed", File(joinpath(PINPATH, "intel64/lib-ext/libdwarf.a")), "elf", "dl", "rt"] + + # Non-pintool libraries + env["LIBPATH"] = [] + env["LIBS"] = ["config++"] + + env["LINKFLAGS"] = "" + + if useIcc: + # icc libs + env["LINKFLAGS"] += " -Wl,-R/data/sanchez/tools/intel/composer_xe_2013.1.117/compiler/lib/intel64/" + + # Use non-standard library paths if defined + if "LIBCONFIGPATH" in os.environ: + LIBCONFIGPATH = os.environ["LIBCONFIGPATH"] + env["LINKFLAGS"] += " -Wl,-R" + joinpath(LIBCONFIGPATH, "lib") + env["LIBPATH"] += [joinpath(LIBCONFIGPATH, "lib")] + env["CPPPATH"] += [joinpath(LIBCONFIGPATH, "include")] + + + if "POLARSSLPATH" in os.environ: + POLARSSLPATH = os.environ["POLARSSLPATH"] + env["PINLIBPATH"] += [joinpath(POLARSSLPATH, "library")] + env["CPPPATH"] += [joinpath(POLARSSLPATH, "include")] + env["PINLIBS"] += ["polarssl"] + env["CPPFLAGS"] += " -D_WITH_POLARSSL_=1 " + + # Only include DRAMSim if available + if "DRAMSIMPATH" in os.environ: + DRAMSIMPATH = os.environ["DRAMSIMPATH"] + env["LINKFLAGS"] += " -Wl,-R" + DRAMSIMPATH + env["PINLIBPATH"] += [DRAMSIMPATH] + env["CPPPATH"] += [DRAMSIMPATH] + env["PINLIBS"] += ["dramsim"] + env["CPPFLAGS"] += " -D_WITH_DRAMSIM_=1 " + + env["CPPPATH"] += ["."] + + # HDF5 + env["PINLIBS"] += ["hdf5", "hdf5_hl"] + + # Harness needs these defined + env["CPPFLAGS"] += ' -DPIN_PATH="' + joinpath(PINPATH, "intel64/bin/pinbin") + '" ' + env["CPPFLAGS"] += ' -DZSIM_PATH="' + joinpath(ROOT, joinpath(buildDir, "libzsim.so")) + '" ' + + # Do PGO? 
+ if pgo == "generate": + genFlags = " -prof-gen " if useIcc else " -fprofile-generate " + env["PINCPPFLAGS"] += genFlags + env["PINLINKFLAGS"] += genFlags + elif pgo == "use": + if useIcc: useFlags = " -prof-use " + else: useFlags = " -fprofile-use -fprofile-correction " + # even single-threaded sims use internal threads, so we need correction + env["PINCPPFLAGS"] += useFlags + env["PINLINKFLAGS"] += useFlags + + env.SConscript("src/SConscript", variant_dir=buildDir, exports= {'env' : env.Clone()}) + +#### + +AddOption('--buildDir', dest='buildDir', type='string', default="build/", nargs=1, action='store', metavar='DIR', help='Base build directory') +AddOption('--d', dest='debugBuild', default=False, action='store_true', help='Do a debug build') +AddOption('--o', dest='optBuild', default=False, action='store_true', help='Do an opt build (optimized, with assertions and symbols)') +AddOption('--r', dest='releaseBuild', default=False, action='store_true', help='Do a release build (optimized, no assertions, no symbols)') +AddOption('--p', dest='pgoBuild', default=False, action='store_true', help='Enable PGO') +AddOption('--pgoPhase', dest='pgoPhase', default="none", action='store', help='PGO phase (just run with --p to do them all)') + + +baseBuildDir = GetOption('buildDir') +buildTypes = [] +if GetOption('debugBuild'): buildTypes.append("debug") +if GetOption('releaseBuild'): buildTypes.append("release") +if GetOption('optBuild') or len(buildTypes) == 0: buildTypes.append("opt") + +march = "core2" # ensure compatibility across condor nodes +#march = "native" # for profiling runs + +buildFlags = {"debug": "-g -O0", + "opt": "-march=%s -g -O3 -funroll-loops" % march, # unroll loops tends to help in zsim, but in general it can cause slowdown + "release": "-march=%s -O3 -DNASSERT -funroll-loops -fweb" % march} # fweb saves ~4% exec time, but makes debugging a world of pain, so careful + +pgoPhase = GetOption('pgoPhase') + +# The PGO flow calls scons recursively. Hacky, but pretty much the only option: +# scons can't build the same file twice, and although gcc enables you to change +# the fprofile path, it considers the whole relative path as the filename +# (e.g., build/opt/zsim.os), and all hell breaks loose when it tries to create +# files in another dir. And because it uses checksums for filenames, it breaks +# when you move the files. Check the repo for a version that tries this. +if GetOption('pgoBuild'): + for type in buildTypes: + print "Building PGO binary" + root = Dir('.').abspath + testsDir = joinpath(root, "tests") + trainCfgs = [f for f in os.listdir(testsDir) if f.startswith("pgo")] + print "Using training configs", trainCfgs + + baseDir = joinpath(baseBuildDir, "pgo-" + type) + genCmd = "scons -j16 --pgoPhase=generate-" + type + runCmds = [] + for cfg in trainCfgs: + runCmd = "mkdir -p pgo-tmp && cd pgo-tmp && ../" + baseDir + "/zsim ../tests/" + cfg + " && cd .." 
+ runCmds.append(runCmd) + useCmd = "scons -j16 --pgoPhase=use-" + type + Environment(ENV = os.environ).Command("dummyTgt-" + type, [], " && ".join([genCmd] + runCmds + [useCmd])) +elif pgoPhase.startswith("generate"): + type = pgoPhase.split("-")[1] + buildSim(buildFlags[type], baseBuildDir, "pgo-" + type, "generate") +elif pgoPhase.startswith("use"): + type = pgoPhase.split("-")[1] + buildSim(buildFlags[type], baseBuildDir, "pgo-" + type, "use") + baseDir = joinpath(baseBuildDir, "pgo-" + type) + Depends(Glob(joinpath(baseDir, "*.os")), "pgo-tmp/zsim.out") #force a rebuild +else: + for type in buildTypes: + buildSim(buildFlags[type], baseBuildDir, type) diff --git a/misc/cpplint.py b/misc/cpplint.py new file mode 100755 index 00000000..119cfeb3 --- /dev/null +++ b/misc/cpplint.py @@ -0,0 +1,4023 @@ +#!/usr/bin/python +# +# Copyright (c) 2009 Google Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Here are some issues that I've had people identify in my code during reviews, +# that I think are possible to flag automatically in a lint tool. If these were +# caught by lint, it would save time both for myself and that of my reviewers. +# Most likely, some of these are beyond the scope of the current lint framework, +# but I think it is valuable to retain these wish-list items even if they cannot +# be immediately implemented. 
+# +# Suggestions +# ----------- +# - Check for no 'explicit' for multi-arg ctor +# - Check for boolean assign RHS in parens +# - Check for ctor initializer-list colon position and spacing +# - Check that if there's a ctor, there should be a dtor +# - Check accessors that return non-pointer member variables are +# declared const +# - Check accessors that return non-const pointer member vars are +# *not* declared const +# - Check for using public includes for testing +# - Check for spaces between brackets in one-line inline method +# - Check for no assert() +# - Check for spaces surrounding operators +# - Check for 0 in pointer context (should be NULL) +# - Check for 0 in char context (should be '\0') +# - Check for camel-case method name conventions for methods +# that are not simple inline getters and setters +# - Do not indent namespace contents +# - Avoid inlining non-trivial constructors in header files +# - Check for old-school (void) cast for call-sites of functions +# ignored return value +# - Check gUnit usage of anonymous namespace +# - Check for class declaration order (typedefs, consts, enums, +# ctor(s?), dtor, friend declarations, methods, member vars) +# + +# dsm: Minor modifications to lint zsim code + +"""Does google-lint on c++ files. + +The goal of this script is to identify places in the code that *may* +be in non-compliance with google style. It does not attempt to fix +up these problems -- the point is to educate. It does also not +attempt to find all problems, or to ensure that everything it does +find is legitimately a problem. + +In particular, we can get very confused by /* and // inside strings! +We do a small hack, which is to ignore //'s with "'s after them on the +same line, but it is far from perfect (in either direction). +""" + +import codecs +import copy +import getopt +import math # for log +import os +import re +import sre_compile +import string +import sys +import unicodedata + + +_USAGE = """ +Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] + [--counting=total|toplevel|detailed] + [file] ... + + The style guidelines this tries to follow are those in + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + + Every problem is given a confidence score from 1-5, with 5 meaning we are + certain of the problem, and 1 meaning it could be a legitimate construct. + This will miss some errors, and is not a substitute for a code review. + + To suppress false-positive errors of a certain category, add a + 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*) + suppresses errors of all categories on that line. + + The files passed in will be linted; at least one file must be provided. + Linted extensions are .cc, .cpp, and .h. Other file types will be ignored. + + Flags: + + output=vs7 + By default, the output is formatted to ease emacs parsing. Visual Studio + compatible output (vs7) may also be used. Other formats are unsupported. + + verbose=# + Specify a number 0-5 to restrict errors to certain verbosity levels. + + filter=-x,+y,... + Specify a comma-separated list of category-filters to apply: only + error messages whose category names pass the filters will be printed. + (Category names are printed with the message and look like + "[whitespace/indent]".) Filters are evaluated left to right. + "-FOO" and "FOO" means "do not print categories that start with FOO". + "+FOO" means "do print categories that start with FOO". 
+ + Examples: --filter=-whitespace,+whitespace/braces + --filter=whitespace,runtime/printf,+runtime/printf_format + --filter=-,+build/include_what_you_use + + To see a list of all the categories used in cpplint, pass no arg: + --filter= + + counting=total|toplevel|detailed + The total number of errors found is always printed. If + 'toplevel' is provided, then the count of errors in each of + the top-level categories like 'build' and 'whitespace' will + also be printed. If 'detailed' is provided, then a count + is provided for each category like 'build/class'. + + root=subdir + The root directory used for deriving header guard CPP variable. + By default, the header guard CPP variable is calculated as the relative + path to the directory that contains .git, .hg, or .svn. When this flag + is specified, the relative path is calculated from the specified + directory. If the specified directory does not exist, this flag is + ignored. + + Examples: + Assuing that src/.git exists, the header guard CPP variables for + src/chrome/browser/ui/browser.h are: + + No flag => CHROME_BROWSER_UI_BROWSER_H_ + --root=chrome => BROWSER_UI_BROWSER_H_ + --root=chrome/browser => UI_BROWSER_H_ +""" + +# We categorize each error message we print. Here are the categories. +# We want an explicit list so we can list them all in cpplint --filter=. +# If you add a new error message with a new category, add it to the list +# here! cpplint_unittest.py should tell you if you forget to do this. +# \ used for clearer layout -- pylint: disable-msg=C6013 +_ERROR_CATEGORIES = [ + 'build/class', + 'build/deprecated', + 'build/endif_comment', + 'build/explicit_make_pair', + 'build/forward_decl', + 'build/header_guard', + 'build/include', + 'build/include_alpha', + 'build/include_order', + 'build/include_what_you_use', + 'build/namespaces', + 'build/printf_format', + 'build/storage_class', + 'legal/copyright', + 'readability/alt_tokens', + 'readability/braces', + 'readability/casting', + 'readability/check', + 'readability/constructors', + 'readability/fn_size', + 'readability/function', + 'readability/multiline_comment', + 'readability/multiline_string', + 'readability/namespace', + 'readability/nolint', + 'readability/streams', + 'readability/todo', + 'readability/utf8', + 'runtime/arrays', + 'runtime/casting', + 'runtime/explicit', + 'runtime/int', + 'runtime/init', + 'runtime/invalid_increment', + 'runtime/member_string_references', + 'runtime/memset', + 'runtime/operator', + 'runtime/printf', + 'runtime/printf_format', + 'runtime/references', + 'runtime/rtti', + 'runtime/sizeof', + 'runtime/string', + 'runtime/threadsafe_fn', + 'whitespace/blank_line', + 'whitespace/braces', + 'whitespace/comma', + 'whitespace/comments', + 'whitespace/empty_loop_body', + 'whitespace/end_of_line', + 'whitespace/ending_newline', + 'whitespace/forcolon', + 'whitespace/indent', + 'whitespace/labels', + 'whitespace/line_length', + 'whitespace/newline', + 'whitespace/operators', + 'whitespace/parens', + 'whitespace/semicolon', + 'whitespace/tab', + 'whitespace/todo' + ] + +# The default state of the category filter. This is overrided by the --filter= +# flag. By default all errors are on, so only add here categories that should be +# off by default (i.e., categories that must be enabled by the --filter= flags). +# All entries here should start with a '-' or '+', as in the --filter= flag. 
+_DEFAULT_FILTERS = ['-build/include_alpha'] + +# We used to check for high-bit characters, but after much discussion we +# decided those were OK, as long as they were in UTF-8 and didn't represent +# hard-coded international strings, which belong in a separate i18n file. + +# Headers that we consider STL headers. +_STL_HEADERS = frozenset([ + 'algobase.h', 'algorithm', 'alloc.h', 'bitset', 'deque', 'exception', + 'function.h', 'functional', 'hash_map', 'hash_map.h', 'hash_set', + 'hash_set.h', 'iterator', 'list', 'list.h', 'map', 'memory', 'new', + 'pair.h', 'pthread_alloc', 'queue', 'set', 'set.h', 'sstream', 'stack', + 'stl_alloc.h', 'stl_relops.h', 'type_traits.h', + 'utility', 'vector', 'vector.h', + ]) + + +# Non-STL C++ system headers. +_CPP_HEADERS = frozenset([ + 'algo.h', 'builtinbuf.h', 'bvector.h', 'cassert', 'cctype', + 'cerrno', 'cfloat', 'ciso646', 'climits', 'clocale', 'cmath', + 'complex', 'complex.h', 'csetjmp', 'csignal', 'cstdarg', 'cstddef', + 'cstdio', 'cstdlib', 'cstring', 'ctime', 'cwchar', 'cwctype', + 'defalloc.h', 'deque.h', 'editbuf.h', 'exception', 'fstream', + 'fstream.h', 'hashtable.h', 'heap.h', 'indstream.h', 'iomanip', + 'iomanip.h', 'ios', 'iosfwd', 'iostream', 'iostream.h', 'istream', + 'istream.h', 'iterator.h', 'limits', 'map.h', 'multimap.h', 'multiset.h', + 'numeric', 'ostream', 'ostream.h', 'parsestream.h', 'pfstream.h', + 'PlotFile.h', 'procbuf.h', 'pthread_alloc.h', 'rope', 'rope.h', + 'ropeimpl.h', 'SFile.h', 'slist', 'slist.h', 'stack.h', 'stdexcept', + 'stdiostream.h', 'streambuf.h', 'stream.h', 'strfile.h', 'string', + 'strstream', 'strstream.h', 'tempbuf.h', 'tree.h', 'typeinfo', 'valarray', + ]) + + +# Assertion macros. These are defined in base/logging.h and +# testing/base/gunit.h. Note that the _M versions need to come first +# for substring matching to work. +_CHECK_MACROS = [ + 'DCHECK', 'CHECK', + 'EXPECT_TRUE_M', 'EXPECT_TRUE', + 'ASSERT_TRUE_M', 'ASSERT_TRUE', + 'EXPECT_FALSE_M', 'EXPECT_FALSE', + 'ASSERT_FALSE_M', 'ASSERT_FALSE', + ] + +# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE +_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) + +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), + ('>=', 'GE'), ('>', 'GT'), + ('<=', 'LE'), ('<', 'LT')]: + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), + ('>=', 'LT'), ('>', 'LE'), + ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + +# Alternative tokens and their replacements. For full list, see section 2.5 +# Alternative tokens [lex.digraph] in the C++ standard. +# +# Digraphs (such as '%:') are not included here since it's a mess to +# match those on a word boundary. 
+_ALT_TOKEN_REPLACEMENT = { + 'and': '&&', + 'bitor': '|', + 'or': '||', + 'xor': '^', + 'compl': '~', + 'bitand': '&', + 'and_eq': '&=', + 'or_eq': '|=', + 'xor_eq': '^=', + 'not': '!', + 'not_eq': '!=' + } + +# Compile regular expression that matches all the above keywords. The "[ =()]" +# bit is meant to avoid matching these keywords outside of boolean expressions. +# +# False positives include C-style multi-line comments (http://go/nsiut ) +# and multi-line strings (http://go/beujw ), but those have always been +# troublesome for cpplint. +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( + r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') + + +# These constants define types of headers for use with +# _IncludeState.CheckNextIncludeOrder(). +_C_SYS_HEADER = 1 +_CPP_SYS_HEADER = 2 +_LIKELY_MY_HEADER = 3 +_POSSIBLE_MY_HEADER = 4 +_OTHER_HEADER = 5 + +# These constants define the current inline assembly state +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block + +# Match start of assembly blocks +_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' + r'(?:\s+(volatile|__volatile__))?' + r'\s*[{(]') + + +_regexp_compile_cache = {} + +# Finds occurrences of NOLINT or NOLINT(...). +_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') + +# {str, set(int)}: a map from error categories to sets of linenumbers +# on which those errors are expected and should be suppressed. +_error_suppressions = {} + +# The root directory used for deriving header guard CPP variable. +# This is set by --root flag. +_root = None + +def ParseNolintSuppressions(filename, raw_line, linenum, error): + """Updates the global list of error-suppressions. + + Parses any NOLINT comments on the current line, updating the global + error_suppressions store. Reports an error if the NOLINT comment + was malformed. + + Args: + filename: str, the name of the input file. + raw_line: str, the line of input text, with comments. + linenum: int, the number of the current line. + error: function, an error handler. + """ + # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). + matched = _RE_SUPPRESSION.search(raw_line) + if matched: + category = matched.group(1) + if category in (None, '(*)'): # => "suppress all" + _error_suppressions.setdefault(None, set()).add(linenum) + else: + if category.startswith('(') and category.endswith(')'): + category = category[1:-1] + if category in _ERROR_CATEGORIES: + _error_suppressions.setdefault(category, set()).add(linenum) + else: + error(filename, linenum, 'readability/nolint', 5, + 'Unknown NOLINT error category: %s' % category) + + +def ResetNolintSuppressions(): + "Resets the set of NOLINT suppressions to empty." + _error_suppressions.clear() + + +def IsErrorSuppressedByNolint(category, linenum): + """Returns true if the specified error category is suppressed on this line. + + Consults the global error_suppressions map populated by + ParseNolintSuppressions/ResetNolintSuppressions. + + Args: + category: str, the category of the error. + linenum: int, the current line number. + Returns: + bool, True iff the error should be suppressed due to a NOLINT comment. 
+ """ + return (linenum in _error_suppressions.get(category, set()) or + linenum in _error_suppressions.get(None, set())) + +def Match(pattern, s): + """Matches the string with the pattern, caching the compiled regexp.""" + # The regexp compilation caching is inlined in both Match and Search for + # performance reasons; factoring it out into a separate function turns out + # to be noticeably expensive. + if not pattern in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].match(s) + + +def Search(pattern, s): + """Searches the string for the pattern, caching the compiled regexp.""" + if not pattern in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].search(s) + + +class _IncludeState(dict): + """Tracks line numbers for includes, and the order in which includes appear. + + As a dict, an _IncludeState object serves as a mapping between include + filename and line number on which that file was included. + + Call CheckNextIncludeOrder() once for each header in the file, passing + in the type constants defined above. Calls in an illegal order will + raise an _IncludeError with an appropriate error message. + + """ + # self._section will move monotonically through this set. If it ever + # needs to move backwards, CheckNextIncludeOrder will raise an error. + _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + dict.__init__(self) + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' + + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. + + - replaces "-" with "_" so they both cmp the same. + - removes '-inl' since we don't require them to be after the main header. + - lowercase everything, just in case. + + Args: + header_path: Path to be canonicalized. + + Returns: + Canonicalized path. + """ + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + + def IsInAlphabeticalOrder(self, header_path): + """Check if a header is in alphabetical order with the previous header. + + Args: + header_path: Header to be checked. + + Returns: + Returns true if the header is in alphabetical order. + """ + canonical_header = self.CanonicalizeAlphabeticalOrder(header_path) + if self._last_header > canonical_header: + return False + self._last_header = canonical_header + return True + + def CheckNextIncludeOrder(self, header_type): + """Returns a non-empty error message if the next header is out of order. + + This function also updates the internal state to be ready to check + the next include. + + Args: + header_type: One of the _XXX_HEADER constants defined above. + + Returns: + The empty string if the header is in the right order, or an + error message describing what's wrong. 
+ + """ + error_message = ('Found %s after %s' % + (self._TYPE_NAMES[header_type], + self._SECTION_NAMES[self._section])) + + last_section = self._section + + if header_type == _C_SYS_HEADER: + if self._section <= self._C_SECTION: + self._section = self._C_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _CPP_SYS_HEADER: + if self._section <= self._CPP_SECTION: + self._section = self._CPP_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _LIKELY_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + self._section = self._OTHER_H_SECTION + elif header_type == _POSSIBLE_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + # This will always be the fallback because we're not sure + # enough that the header is associated with this file. + self._section = self._OTHER_H_SECTION + else: + assert header_type == _OTHER_HEADER + self._section = self._OTHER_H_SECTION + + if last_section != self._section: + self._last_header = '' + + return '' + + +class _CppLintState(object): + """Maintains module-wide state..""" + + def __init__(self): + self.verbose_level = 1 # global setting. + self.error_count = 0 # global count of reported errors + # filters to apply when emitting error messages + self.filters = _DEFAULT_FILTERS[:] + self.counting = 'total' # In what way are we counting errors? + self.errors_by_category = {} # string to int dict storing error counts + + # output format: + # "emacs" - format that emacs can parse (default) + # "vs7" - format that Microsoft Visual Studio 7 can parse + self.output_format = 'emacs' + + def SetOutputFormat(self, output_format): + """Sets the output format for errors.""" + self.output_format = output_format + + def SetVerboseLevel(self, level): + """Sets the module's verbosity, and returns the previous setting.""" + last_verbose_level = self.verbose_level + self.verbose_level = level + return last_verbose_level + + def SetCountingStyle(self, counting_style): + """Sets the module's counting options.""" + self.counting = counting_style + + def SetFilters(self, filters): + """Sets the error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "+whitespace/indent"). + Each filter should start with + or -; else we die. + + Raises: + ValueError: The comma-separated filters did not all start with '+' or '-'. + E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" + """ + # Default filters always have less priority than the flag ones. 
+ self.filters = _DEFAULT_FILTERS[:] + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError('Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stderr.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stderr.write('Total errors found: %d\n' % self.error_count) + +_cpplint_state = _CppLintState() + + +def _OutputFormat(): + """Gets the module's output format.""" + return _cpplint_state.output_format + + +def _SetOutputFormat(output_format): + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) + + +def _VerboseLevel(): + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level + + +def _SetVerboseLevel(level): + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) + + +def _SetCountingStyle(level): + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) + + +def _Filters(): + """Returns the module's list of output filters, as a list.""" + return _cpplint_state.filters + + +def _SetFilters(filters): + """Sets the module's error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.SetFilters(filters) + + +class _FunctionState(object): + """Tracks current function name and the number of lines in its body.""" + + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' + + def Begin(self, function_name): + """Start analyzing function body. + + Args: + function_name: The name of the function being tracked. + """ + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name + + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 + + def Check(self, error, filename, linenum): + """Report if too many lines in function body. + + Args: + error: The function to call with any errors found. + filename: The name of the current file. + linenum: The number of the line to check. 
+ """ + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int(math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' % ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False + + +class _IncludeError(Exception): + """Indicates a problem with the include order in a file.""" + pass + + +class FileInfo: + """Provides utility functions for filenames. + + FileInfo provides easy access to the components of a file's path + relative to the project root. + """ + + def __init__(self, filename): + self._filename = filename + + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') + + def RepositoryName(self): + """FullName after removing the local path to the repository. + + If we have a real absolute path name here we can try to do something smart: + detecting the root of the checkout and truncating /path/to/checkout from + the name so that we get header guards that don't include things like + "C:\Documents and Settings\..." or "/home/username/..." in them and thus + people on different computers who have checked the source out to different + locations won't see bogus errors. + """ + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. + root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... + return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. + + For 'chrome/browser/browser.cc', Split() would + return ('chrome/browser', 'browser', '.cc') + + Returns: + A tuple of (directory, basename, extension). 
+ """ + + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project,) + os.path.splitext(rest) + + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] + + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] + + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) + + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + + +def _ShouldPrintError(category, confidence, linenum): + """If confidence >= verbose, category passes filter and is not suppressed.""" + + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. + if IsErrorSuppressedByNolint(category, linenum): + return False + if confidence < _cpplint_state.verbose_level: + return False + + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False + + return True + + +def Error(filename, linenum, category, confidence, message): + """Logs the fact we've found a lint error. + + We log where the error was found, and also our confidence in the error, + that is, how certain we are this is a legitimate style regression, and + not a misidentification or a use that's sometimes justified. + + False positives can be suppressed by the use of + "cpplint(category)" comments on the offending line. These are + parsed into _error_suppressions. + + Args: + filename: The name of the file containing the error. + linenum: The number of the line containing the error. + category: A string used to describe the "category" this bug + falls under: "whitespace", say, or "runtime". Categories + may have a hierarchy separated by slashes: "whitespace/indent". + confidence: A number from 1-5 representing a confidence score for + the error, with 5 meaning that we are certain of the problem, + and 1 meaning that it could be a legitimate construct. + message: The error message. + """ + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + + +# Matches standard C++ escape esequences per 2.13.2.3 of the C++ standard. +_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( + r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') +# Matches strings. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') +# Matches characters. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") +# Matches multi-line C++ comments. +# This RE is a little bit more complicated than one might expect, because we +# have to take care of space removals tools so we can handle comments inside +# statements better. 
+# The current rule is: We only clear spaces from both sides when we're at the +# end of the line. Otherwise, we try to remove spaces from the right side, +# if this doesn't work we try on left side but only if there's a non-character +# on the right. +_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( + r"""(\s*/\*.*\*/\s*$| + /\*.*\*/\s+| + \s+/\*.*\*/(?=\W)| + /\*.*\*/)""", re.VERBOSE) + + +def IsCppString(line): + """Does line terminate so, that the next symbol is in string constant. + + This function does not consider single-line nor multi-line comments. + + Args: + line: is a partial line of code starting from the 0..n. + + Returns: + True, if next character appended to 'line' is inside a + string constant. + """ + + line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" + return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 + + +def FindNextMultiLineCommentStart(lines, lineix): + """Find the beginning marker for a multiline comment.""" + while lineix < len(lines): + if lines[lineix].strip().startswith('/*'): + # Only return this marker if the comment goes beyond this line + if lines[lineix].strip().find('*/', 2) < 0: + return lineix + lineix += 1 + return len(lines) + + +def FindNextMultiLineCommentEnd(lines, lineix): + """We are inside a comment, find the end marker.""" + while lineix < len(lines): + if lines[lineix].strip().endswith('*/'): + return lineix + lineix += 1 + return len(lines) + + +def RemoveMultiLineCommentsFromRange(lines, begin, end): + """Clears a range of lines for multi-line comments.""" + # Having // dummy comments makes the lines non-empty, so we will not get + # unnecessary blank line warnings later in the code. + for i in range(begin, end): + lines[i] = '// dummy' + + +def RemoveMultiLineComments(filename, lines, error): + """Removes multiline (c-style) comments from lines.""" + lineix = 0 + while lineix < len(lines): + lineix_begin = FindNextMultiLineCommentStart(lines, lineix) + if lineix_begin >= len(lines): + return + lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) + if lineix_end >= len(lines): + error(filename, lineix_begin + 1, 'readability/multiline_comment', 5, + 'Could not find end of multi-line comment') + return + RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) + lineix = lineix_end + 1 + + +def CleanseComments(line): + """Removes //-comments and single-line C-style /* */ comments. + + Args: + line: A line of C++ source. + + Returns: + The line with single-line comments removed. + """ + commentpos = line.find('//') + if commentpos != -1 and not IsCppString(line[:commentpos]): + line = line[:commentpos].rstrip() + # get rid of /* ... */ + return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) + + +class CleansedLines(object): + """Holds 3 copies of all lines with different preprocessing applied to them. + + 1) elided member contains lines without strings and comments, + 2) lines member contains lines without comments, and + 3) raw_lines member contains all the lines without processing. + All these three members are of , and of the same length. 
+ """ + + def __init__(self, lines): + self.elided = [] + self.lines = [] + self.raw_lines = lines + self.num_lines = len(lines) + for linenum in range(len(lines)): + self.lines.append(CleanseComments(lines[linenum])) + elided = self._CollapseStrings(lines[linenum]) + self.elided.append(CleanseComments(elided)) + + def NumLines(self): + """Returns the number of lines represented.""" + return self.num_lines + + @staticmethod + def _CollapseStrings(elided): + """Collapses strings and chars on a line to simple "" or '' blocks. + + We nix strings first so we're not fooled by text like '"http://"' + + Args: + elided: The line being processed. + + Returns: + The line with collapsed strings. + """ + if not _RE_PATTERN_INCLUDE.match(elided): + # Remove escaped characters first to make quote/single quote collapsing + # basic. Things that look like escaped characters shouldn't occur + # outside of strings and chars. + elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) + elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided) + elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided) + return elided + + +def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar): + """Find the position just after the matching endchar. + + Args: + line: a CleansedLines line. + startpos: start searching at this position. + depth: nesting level at startpos. + startchar: expression opening character. + endchar: expression closing character. + + Returns: + Index just after endchar. + """ + for i in xrange(startpos, len(line)): + if line[i] == startchar: + depth += 1 + elif line[i] == endchar: + depth -= 1 + if depth == 0: + return i + 1 + return -1 + + +def CloseExpression(clean_lines, linenum, pos): + """If input points to ( or { or [, finds the position that closes it. + + If lines[linenum][pos] points to a '(' or '{' or '[', finds the + linenum/pos that correspond to the closing of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *past* the closing brace, or + (line, len(lines), -1) if we never find a close. Note we ignore + strings and comments when matching; and the line we return is the + 'cleansed' line at linenum. + """ + + line = clean_lines.elided[linenum] + startchar = line[pos] + if startchar not in '({[': + return (line, clean_lines.NumLines(), -1) + if startchar == '(': endchar = ')' + if startchar == '[': endchar = ']' + if startchar == '{': endchar = '}' + + # Check first line + end_pos = FindEndOfExpressionInLine(line, pos, 0, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + tail = line[pos:] + num_open = tail.count(startchar) - tail.count(endchar) + while linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + delta = line.count(startchar) - line.count(endchar) + if num_open + delta <= 0: + return (line, linenum, + FindEndOfExpressionInLine(line, 0, num_open, startchar, endchar)) + num_open += delta + + # Did not find endchar before end of file, give up + return (line, clean_lines.NumLines(), -1) + +def CheckForCopyright(filename, lines, error): + """Logs an error if no Copyright message appears at the top of the file.""" + + # We'll say it should occur by line 10. Don't forget there's a + # dummy line at the front. 
+ for line in xrange(1, min(len(lines), 11)): + if re.search(r'Copyright', lines[line], re.I): break + else: # means no copyright line was found + error(filename, 0, 'legal/copyright', 5, + 'No copyright message found. ' + 'You should have a line: "Copyright [year] "') + + +def GetHeaderGuardCPPVariable(filename): + """Returns the CPP variable that should be used as a header guard. + + Args: + filename: The name of a C++ header file. + + Returns: + The CPP variable that should be used as a header guard in the + named file. + + """ + + # Restores original filename in case that cpplint is invoked from Emacs's + # flymake. + filename = re.sub(r'_flymake\.h$', '.h', filename) + filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename) + + fileinfo = FileInfo(filename) + file_path_from_root = fileinfo.RepositoryName() + if _root: + file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) + return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' + + +def CheckForHeaderGuard(filename, lines, error): + """Checks that the file contains a header guard. + + Logs an error if no #ifndef header guard is present. For other + headers, checks that the full pathname is used. + + Args: + filename: The name of the C++ header file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + cppvar = GetHeaderGuardCPPVariable(filename) + + ifndef = None + ifndef_linenum = 0 + define = None + endif = None + endif_linenum = 0 + for linenum, line in enumerate(lines): + linesplit = line.split() + if len(linesplit) >= 2: + # find the first occurrence of #ifndef and #define, save arg + if not ifndef and linesplit[0] == '#ifndef': + # set ifndef to the header guard presented on the #ifndef line. + ifndef = linesplit[1] + ifndef_linenum = linenum + if not define and linesplit[0] == '#define': + define = linesplit[1] + # find the last occurrence of #endif, save entire line + if line.startswith('#endif'): + endif = line + endif_linenum = linenum + + if not ifndef: + error(filename, 0, 'build/header_guard', 5, + 'No #ifndef header guard found, suggested CPP variable is: %s' % + cppvar) + return + + if not define: + error(filename, 0, 'build/header_guard', 5, + 'No #define header guard found, suggested CPP variable is: %s' % + cppvar) + return + + # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ + # for backward compatibility. + if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + if define != ifndef: + error(filename, 0, 'build/header_guard', 5, + '#ifndef and #define don\'t match, suggested CPP variable is: %s' % + cppvar) + return + + if endif != ('#endif // %s' % cppvar): + error_level = 0 + if endif != ('#endif // %s' % (cppvar + '_')): + error_level = 5 + + ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, + error) + error(filename, endif_linenum, 'build/header_guard', error_level, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckForUnicodeReplacementCharacters(filename, lines, error): + """Logs an error for each line containing Unicode replacement characters. + + These indicate that either the file contained invalid UTF-8 (likely) + or Unicode replacement characters (which it shouldn't). 
Note that + it's possible for this to throw off line numbering if the invalid + UTF-8 occurred adjacent to a newline. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error(filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).') + + +def CheckForNewlineAtEOF(filename, lines, error): + """Logs an error if there is no newline char at the end of the file. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') + + +def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): + """Logs an error if we see /* ... */ or "..." that extend past one line. + + /* ... */ comments are legit inside macros, for one line. + Otherwise, we prefer // comments, so it's ok to warn about the + other. Likewise, it's ok for strings to extend across multiple + lines, as long as a line continuation character (backslash) + terminates each line. Although not currently prohibited by the C++ + style guide, it's ugly and unnecessary. We don't do well with either + in this lint program, so we warn about both. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Remove all \\ (escaped backslashes) from the line. They are OK, and the + # second (escaped) slash may trigger later \" detection erroneously. + line = line.replace('\\\\', '') + + if line.count('/*') > line.count('*/'): + error(filename, linenum, 'readability/multiline_comment', 5, + 'Complex multi-line /*...*/-style comment found. ' + 'Lint may give bogus warnings. ' + 'Consider replacing these with //-style comments, ' + 'with #if 0...#endif, ' + 'or with more clearly structured multi-line comments.') + + if (line.count('"') - line.count('\\"')) % 2: + error(filename, linenum, 'readability/multiline_string', 5, + 'Multi-line string ("...") found. This lint script doesn\'t ' + 'do well with such strings, and may give bogus warnings. They\'re ' + 'ugly and unnecessary, and you should use concatenation instead".') + + +threading_list = ( + ('asctime(', 'asctime_r('), + ('ctime(', 'ctime_r('), + ('getgrgid(', 'getgrgid_r('), + ('getgrnam(', 'getgrnam_r('), + ('getlogin(', 'getlogin_r('), + ('getpwnam(', 'getpwnam_r('), + ('getpwuid(', 'getpwuid_r('), + ('gmtime(', 'gmtime_r('), + ('localtime(', 'localtime_r('), + ('rand(', 'rand_r('), + ('readdir(', 'readdir_r('), + ('strtok(', 'strtok_r('), + ('ttyname(', 'ttyname_r('), + ) + + +def CheckPosixThreading(filename, clean_lines, linenum, error): + """Checks for calls to thread-unsafe functions. + + Much code has been originally written without consideration of + multi-threading. 
Also, engineers are relying on their old experience; + they have learned posix before threading extensions were added. These + tests guide the engineers to use thread-safe functions (when using + posix directly). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + for single_thread_function, multithread_safe_function in threading_list: + ix = line.find(single_thread_function) + # Comparisons made explicit for clarity -- pylint: disable-msg=C6403 + if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and + line[ix - 1] not in ('_', '.', '>'))): + error(filename, linenum, 'runtime/threadsafe_fn', 2, + 'Consider using ' + multithread_safe_function + + '...) instead of ' + single_thread_function + + '...) for improved thread safety.') + + +# Matches invalid increment: *count++, which moves pointer instead of +# incrementing a value. +_RE_PATTERN_INVALID_INCREMENT = re.compile( + r'^\s*\*\w+(\+\+|--);') + + +def CheckInvalidIncrement(filename, clean_lines, linenum, error): + """Checks for invalid increment *count++. + + For example following function: + void increment_counter(int* count) { + *count++; + } + is invalid, because it effectively does count++, moving pointer, and should + be replaced with ++*count, (*count)++ or *count += 1. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + if _RE_PATTERN_INVALID_INCREMENT.match(line): + error(filename, linenum, 'runtime/invalid_increment', 5, + 'Changing pointer instead of value (or unused value of operator*).') + + +class _BlockInfo(object): + """Stores information about a generic block of code.""" + + def __init__(self, seen_open_brace): + self.seen_open_brace = seen_open_brace + self.open_parentheses = 0 + self.inline_asm = _NO_ASM + + def CheckBegin(self, filename, clean_lines, linenum, error): + """Run checks that applies to text up to the opening brace. + + This is mostly for checking the text after the class identifier + and the "{", usually where the base class is specified. For other + blocks, there isn't much to check, so we always pass. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + pass + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Run checks that applies to text after the closing brace. + + This is mostly used for checking end of namespace comments. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + pass + + +class _ClassInfo(_BlockInfo): + """Stores information about a class.""" + + def __init__(self, name, class_or_struct, clean_lines, linenum): + _BlockInfo.__init__(self, False) + self.name = name + self.starting_linenum = linenum + self.is_derived = False + if class_or_struct == 'struct': + self.access = 'public' + else: + self.access = 'private' + + # Try to find the end of the class. This will be confused by things like: + # class A { + # } *x = { ... 
+ # + # But it's still good enough for CheckSectionSpacing. + self.last_line = 0 + depth = 0 + for i in range(linenum, clean_lines.NumLines()): + line = clean_lines.elided[i] + depth += line.count('{') - line.count('}') + if not depth: + self.last_line = i + break + + def CheckBegin(self, filename, clean_lines, linenum, error): + # Look for a bare ':' + if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): + self.is_derived = True + + +class _NamespaceInfo(_BlockInfo): + """Stores information about a namespace.""" + + def __init__(self, name, linenum): + _BlockInfo.__init__(self, False) + self.name = name or '' + self.starting_linenum = linenum + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Check end of namespace comments.""" + line = clean_lines.raw_lines[linenum] + + # Check how many lines is enclosed in this namespace. Don't issue + # warning for missing namespace comments if there aren't enough + # lines. However, do apply checks if there is already an end of + # namespace comment and it's incorrect. + # + # TODO(unknown): We always want to check end of namespace comments + # if a namespace is large, but sometimes we also want to apply the + # check if a short namespace contained nontrivial things (something + # other than forward declarations). There is currently no logic on + # deciding what these nontrivial things are, so this check is + # triggered by namespace size only, which works most of the time. + if (linenum - self.starting_linenum < 10 + and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + return + + # Look for matching comment at end of namespace. + # + # Note that we accept C style "/* */" comments for terminating + # namespaces, so that code that terminate namespaces inside + # preprocessor macros can be cpplint clean. Example: http://go/nxpiz + # + # We also accept stuff like "// end of namespace ." with the + # period at the end. + # + # Besides these, we don't accept anything else, otherwise we might + # get false negatives when existing comment is a substring of the + # expected namespace. Example: http://go/ldkdc, http://cl/23548205 + if self.name: + # Named namespace + if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + + r'[\*/\.\\\s]*$'), + line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace %s"' % + self.name) + else: + # Anonymous namespace + if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace"') + + +class _PreprocessorInfo(object): + """Stores checkpoints of nesting stacks when #if/#else is seen.""" + + def __init__(self, stack_before_if): + # The entire nesting stack before #if + self.stack_before_if = stack_before_if + + # The entire nesting stack up to #else + self.stack_before_else = [] + + # Whether we have already seen #else or #elif + self.seen_else = False + + +class _NestingState(object): + """Holds states related to parsing braces.""" + + def __init__(self): + # Stack for tracking all braces. An object is pushed whenever we + # see a "{", and popped when we see a "}". Only 3 types of + # objects are possible: + # - _ClassInfo: a class or struct. + # - _NamespaceInfo: a namespace. + # - _BlockInfo: some other type of block. + self.stack = [] + + # Stack of _PreprocessorInfo objects. + self.pp_stack = [] + + def SeenOpenBrace(self): + """Check if we have seen the opening brace for the innermost block. 
+ + Returns: + True if we have seen the opening brace, False if the innermost + block is still expecting an opening brace. + """ + return (not self.stack) or self.stack[-1].seen_open_brace + + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. + + Returns: + True if top of the stack is a namespace block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + + def UpdatePreprocessor(self, line): + """Update preprocessor stack. + + We need to handle preprocessors due to classes like this: + #ifdef SWIG + struct ResultDetailsPageElementExtensionPoint { + #else + struct ResultDetailsPageElementExtensionPoint : public Extension { + #endif + (see http://go/qwddn for original example) + + We make the following assumptions (good enough for most files): + - Preprocessor condition evaluates to true from #if up to first + #else/#elif/#endif. + + - Preprocessor condition evaluates to false from #else/#elif up + to #endif. We still perform lint checks on these lines, but + these do not affect nesting stack. + + Args: + line: current line to check. + """ + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. + self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. + if self.pp_stack: + # If we saw an #else, we will need to restore the nesting + # stack to its former state before the #else, otherwise we + # will just continue from where we left off. + if self.pp_stack[-1].seen_else: + # Here we can just use a shallow copy since we are the last + # reference to it. + self.stack = self.pp_stack[-1].stack_before_else + # Drop the corresponding #if + self.pp_stack.pop() + else: + # TODO(unknown): unexpected #endif, issue warning? + pass + + def Update(self, filename, clean_lines, linenum, error): + """Update nesting state with current line. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Update pp_stack first + self.UpdatePreprocessor(line) + + # Count parentheses. This is to avoid adding struct arguments to + # the nesting stack. + if self.stack: + inner_block = self.stack[-1] + depth_change = line.count('(') - line.count(')') + inner_block.open_parentheses += depth_change + + # Also check if we are starting or ending an inline assembly block. + if inner_block.inline_asm in (_NO_ASM, _END_ASM): + if (depth_change != 0 and + inner_block.open_parentheses == 1 and + _MATCH_ASM.match(line)): + # Enter assembly block + inner_block.inline_asm = _INSIDE_ASM + else: + # Not entering assembly block. 
If previous line was _END_ASM, + # we will now shift to _NO_ASM state. + inner_block.inline_asm = _NO_ASM + elif (inner_block.inline_asm == _INSIDE_ASM and + inner_block.open_parentheses == 0): + # Exit assembly block + inner_block.inline_asm = _END_ASM + + # Consume namespace declaration at the beginning of the line. Do + # this in a loop so that we catch same line declarations like this: + # namespace proto2 { namespace bridge { class MessageSet; } } + while True: + # Match start of namespace. The "\b\s*" below catches namespace + # declarations even if it weren't followed by a whitespace, this + # is so that we don't confuse our namespace checker. The + # missing spaces will be flagged by CheckSpacing. + namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line) + if not namespace_decl_match: + break + + new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum) + self.stack.append(new_namespace) + + line = namespace_decl_match.group(2) + if line.find('{') != -1: + new_namespace.seen_open_brace = True + line = line[line.find('{') + 1:] + + # Look for a class declaration in whatever is left of the line + # after parsing namespaces. The regexp accounts for decorated classes + # such as in: + # class LOCKABLE API Object { + # }; + # + # Templates with class arguments may confuse the parser, for example: + # template , + # class Vector = vector > + # class HeapQueue { + # + # Because this parser has no nesting state about templates, by the + # time it saw "class Comparator", it may think that it's a new class. + # Nested templates have a similar problem: + # template < + # typename ExportedType, + # typename TupleType, + # template class ImplTemplate> + # + # To avoid these cases, we ignore classes that are followed by '=' or '>' + class_decl_match = Match( + r'\s*(template\s*<[\w\s<>,:]*>\s*)?' + '(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)' + '(([^=>]|<[^<>]*>)*)$', line) + if (class_decl_match and + (not self.stack or self.stack[-1].open_parentheses == 0)): + self.stack.append(_ClassInfo( + class_decl_match.group(4), class_decl_match.group(2), + clean_lines, linenum)) + line = class_decl_match.group(5) + + # If we have not yet seen the opening brace for the innermost block, + # run checks here. + if not self.SeenOpenBrace(): + self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) + + # Update access control if we are inside a class/struct + if self.stack and isinstance(self.stack[-1], _ClassInfo): + access_match = Match(r'\s*(public|private|protected)\s*:', line) + if access_match: + self.stack[-1].access = access_match.group(1) + + # Consume braces or semicolons from what's left of the line + while True: + # Match first brace, semicolon, or closed parenthesis. + matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) + if not matched: + break + + token = matched.group(1) + if token == '{': + # If namespace or class hasn't seen a opening brace yet, mark + # namespace/class head as complete. Push a new block onto the + # stack otherwise. + if not self.SeenOpenBrace(): + self.stack[-1].seen_open_brace = True + else: + self.stack.append(_BlockInfo(True)) + if _MATCH_ASM.match(line): + self.stack[-1].inline_asm = _BLOCK_ASM + elif token == ';' or token == ')': + # If we haven't seen an opening brace yet, but we already saw + # a semicolon, this is probably a forward declaration. Pop + # the stack for these. 
+ # + # Similarly, if we haven't seen an opening brace yet, but we + # already saw a closing parenthesis, then these are probably + # function arguments with extra "class" or "struct" keywords. + # Also pop these stack for these. + if not self.SeenOpenBrace(): + self.stack.pop() + else: # token == '}' + # Perform end of block checks and pop the stack. + if self.stack: + self.stack[-1].CheckEnd(filename, clean_lines, linenum, error) + self.stack.pop() + line = matched.group(2) + + def InnermostClass(self): + """Get class info on the top of the stack. + + Returns: + A _ClassInfo object if we are inside a class, or None otherwise. + """ + for i in range(len(self.stack), 0, -1): + classinfo = self.stack[i - 1] + if isinstance(classinfo, _ClassInfo): + return classinfo + return None + + def CheckClassFinished(self, filename, error): + """Checks that all classes have been completely parsed. + + Call this when all lines in a file have been processed. + Args: + filename: The name of the current file. + error: The function to call with any errors found. + """ + # Note: This test can result in false positives if #ifdef constructs + # get in the way of brace matching. See the testBuildClass test in + # cpplint_unittest.py for an example of this. + for obj in self.stack: + if isinstance(obj, _ClassInfo): + error(filename, obj.starting_linenum, 'build/class', 5, + 'Failed to find complete declaration of class %s' % + obj.name) + + +def CheckForNonStandardConstructs(filename, clean_lines, linenum, + nesting_state, error): + """Logs an error if we see certain non-ANSI constructs ignored by gcc-2. + + Complain about several constructs which gcc-2 accepts, but which are + not standard C++. Warning about these in lint is one way to ease the + transition to new compilers. + - put storage class first (e.g. "static const" instead of "const static"). + - "%lld" instead of %qd" in printf-type functions. + - "%1$d" is non-standard in printf-type functions. + - "\%" is an undefined character escape sequence. + - text after #endif is not allowed. + - invalid inner-style forward declaration. + - >? and ?= and )\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', + line): + error(filename, linenum, 'build/deprecated', 3, + '>? and ))?' + # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' + error(filename, linenum, 'runtime/member_string_references', 2, + 'const string& members are dangerous. It is much better to use ' + 'alternatives, such as pointers or simple constants.') + + # Everything else in this function operates on class declarations. + # Return early if the top of the nesting stack is not a class, or if + # the class head is not completed yet. + classinfo = nesting_state.InnermostClass() + if not classinfo or not classinfo.seen_open_brace: + return + + # The class may have been declared with namespace or classname qualifiers. + # The constructor and destructor will not have those qualifiers. + base_classname = classinfo.name.split('::')[-1] + + # Look for single-argument constructors that aren't marked explicit. + # Technically a valid construct, but against style. 
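+  # For illustration (hypothetical class name Foo): a declaration such as
+  #   Foo(int count);
+  # is reported by the check below, while the copy-constructor form
+  #   Foo(const Foo& other);
+  # is accepted, because its single argument is a (const) reference to the
+  # class itself.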
+ args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' + % re.escape(base_classname), + line) + if (args and + args.group(1) != 'void' and + not Match(r'(const\s+)?%s\s*(?:<\w+>\s*)?&' % re.escape(base_classname), + args.group(1).strip())): + error(filename, linenum, 'runtime/explicit', 5, + 'Single-argument constructors should be marked explicit.') + + +def CheckSpacingForFunctionCall(filename, line, linenum, error): + """Checks for the correctness of various spacing around function calls. + + Args: + filename: The name of the current file. + line: The text of the line to check. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Since function calls often occur inside if/for/while/switch + # expressions - which have their own, more liberal conventions - we + # first see if we should be looking inside such an expression for a + # function call, to which we can apply more strict standards. + fncall = line # if there's no control flow construct, look at whole line + for pattern in (r'\bif\s*\((.*)\)\s*{', + r'\bfor\s*\((.*)\)\s*{', + r'\bwhile\s*\((.*)\)\s*[{;]', + r'\bswitch\s*\((.*)\)\s*{'): + match = Search(pattern, line) + if match: + fncall = match.group(1) # look inside the parens for function calls + break + + # Except in if/for/while/switch, there should never be space + # immediately inside parens (eg "f( 3, 4 )"). We make an exception + # for nested parens ( (a+b) + c ). Likewise, there should never be + # a space before a ( when it's a function argument. I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search(r'\b(if|for|while|switch|return|delete)\b', fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. + not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef', fncall) and + not Search(r'\w\s+\((\w+::)?\*\w+\)\(', fncall)): + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') + + +def IsBlankLine(line): + """Returns true if the given line is blank. + + We consider a line to be blank if the line is empty or consists of + only white spaces. 
+ + Args: + line: A line of a string. + + Returns: + True, if the given line is blank. + """ + return not line or line.isspace() + + +def CheckForFunctionLengths(filename, clean_lines, linenum, + function_state, error): + """Reports for long function bodies. + + For an overview why this is done, see: + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + + Uses a simplistic algorithm assuming other style guidelines + (especially spacing) are followed. + Only checks unindented functions, so class members are unchecked. + Trivial bodies are unchecked, so constructors with huge initializer lists + may be missed. + Blank/comment lines are not counted so as to avoid encouraging the removal + of vertical space and comments just to get through a lint check. + NOLINT *on the last line of a function* disables this check. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + function_state: Current function name and lines in body so far. + error: The function to call with any errors found. + """ + lines = clean_lines.lines + line = lines[linenum] + raw = clean_lines.raw_lines + raw_line = raw[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. + function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. + error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. + + +_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') + + +def CheckComment(comment, filename, linenum, error): + """Checks for common mistakes in TODO comments. + + Args: + comment: The text of the comment from the line in question. + filename: The name of the current file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. 
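+    # For illustration (hypothetical username): "// TODO(bob): fix this" is
+    # clean, while "//   TODO: fix this" triggers both the extra-space warning
+    # and the missing-username warning below.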
+ leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable-msg=C6403 + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + +def CheckAccess(filename, clean_lines, linenum, nesting_state, error): + """Checks for improper use of DISALLOW* macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_EVIL_CONSTRUCTORS|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. We could issue a warning here, but it + # probably resulted in a compiler error already. + pass + + +def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix): + """Find the corresponding > to close a template. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: Current line number. + init_suffix: Remainder of the current line after the initial <. + + Returns: + True if a matching bracket exists. + """ + line = init_suffix + nesting_stack = ['<'] + while True: + # Find the next operator that can tell us whether < is used as an + # opening bracket or as a less-than operator. We only want to + # warn on the latter case. + # + # We could also check all other operators and terminate the search + # early, e.g. if we got something like this "a(),;\[\]]*([<>(),;\[\]])(.*)$', line) + if match: + # Found an operator, update nesting stack + operator = match.group(1) + line = match.group(2) + + if nesting_stack[-1] == '<': + # Expecting closing angle bracket + if operator in ('<', '(', '['): + nesting_stack.append(operator) + elif operator == '>': + nesting_stack.pop() + if not nesting_stack: + # Found matching angle bracket + return True + elif operator == ',': + # Got a comma after a bracket, this is most likely a template + # argument. We have not seen a closing angle bracket yet, but + # it's probably a few lines later if we look for it, so just + # return early here. + return True + else: + # Got some other operator. + return False + + else: + # Expecting closing parenthesis or closing bracket + if operator in ('<', '(', '['): + nesting_stack.append(operator) + elif operator in (')', ']'): + # We don't bother checking for matching () or []. If we got + # something like (] or [), it would have been a syntax error. 
+ nesting_stack.pop() + + else: + # Scan the next line + linenum += 1 + if linenum >= len(clean_lines.elided): + break + line = clean_lines.elided[linenum] + + # Exhausted all remaining lines and still no matching angle bracket. + # Most likely the input was incomplete, otherwise we should have + # seen a semicolon and returned early. + return True + + +def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix): + """Find the corresponding < that started a template. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: Current line number. + init_prefix: Part of the current line before the initial >. + + Returns: + True if a matching bracket exists. + """ + line = init_prefix + nesting_stack = ['>'] + while True: + # Find the previous operator + match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line) + if match: + # Found an operator, update nesting stack + operator = match.group(2) + line = match.group(1) + + if nesting_stack[-1] == '>': + # Expecting opening angle bracket + if operator in ('>', ')', ']'): + nesting_stack.append(operator) + elif operator == '<': + nesting_stack.pop() + if not nesting_stack: + # Found matching angle bracket + return True + elif operator == ',': + # Got a comma before a bracket, this is most likely a + # template argument. The opening angle bracket is probably + # there if we look for it, so just return early here. + return True + else: + # Got some other operator. + return False + + else: + # Expecting opening parenthesis or opening bracket + if operator in ('>', ')', ']'): + nesting_stack.append(operator) + elif operator in ('(', '['): + nesting_stack.pop() + + else: + # Scan the previous line + linenum -= 1 + if linenum < 0: + break + line = clean_lines.elided[linenum] + + # Exhausted all earlier lines and still no matching angle bracket. + return False + + +def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): + """Checks for the correctness of various spacing issues in the code. + + Things we check for: spaces around operators, spaces after + if/for/while/switch, no spaces around parens in function calls, two + spaces between code and comment, don't start a block with a blank + line, don't end a function with a blank line, don't add a blank line + after public/protected/private, don't have too many blank lines in a row. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + raw = clean_lines.raw_lines + line = raw[linenum] + + # Before nixing comments, check if the line is blank for no good + # reason. This includes the first line after a block is opened, and + # blank lines at the end of a function (ie, right before a line like '}' + # + # Skip all the blank line checks if we are immediately inside a + # namespace body. In other words, don't issue blank line warnings + # for this block: + # namespace { + # + # } + # + # A warning about missing end of namespace comments will be issued instead. + if IsBlankLine(line) and not nesting_state.InNamespaceBody(): + elided = clean_lines.elided + prev_line = elided[linenum - 1] + prevbrace = prev_line.rfind('{') + # TODO(unknown): Don't complain if line before blank line, and line after, + # both start with alnums and are indented the same amount. 
+ # This ignores whitespace at the start of a namespace block + # because those are not usually indented. + if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: + # OK, we have a blank line at the start of a code block. Before we + # complain, we check if it is an exception to the rule: The previous + # non-empty line has the parameters of a function header that are indented + # 4 spaces (because they did not fit in a 80 column line when placed on + # the same line as the function name). We also check for the case where + # the previous line is indented 6 spaces, which may happen when the + # initializers of a constructor do not fit into a 80 column line. + exception = False + if Match(r' {6}\w', prev_line): # Initializer list? + # We are looking for the opening column of initializer list, which + # should be indented 4 spaces to cause 6 space indentation afterwards. + search_position = linenum-2 + while (search_position >= 0 + and Match(r' {6}\w', elided[search_position])): + search_position -= 1 + exception = (search_position >= 0 + and elided[search_position][:5] == ' :') + else: + # Search for the function arguments or an initializer list. We use a + # simple heuristic here: If the line is indented 4 spaces; and we have a + # closing paren, without the opening paren, followed by an opening brace + # or colon (for initializer lists) we assume that it is the last line of + # a function header. If we have a colon indented 4 spaces, it is an + # initializer list. + exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) + or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Blank line at the start of a code block. Is this needed?') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line + and Match(r'\s*}', next_line) + and next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Blank line at the end of a code block. Is this needed?') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, we complain if there's a comment too near the text + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable-msg=C6403 + if (line.count('"', 0, commentpos) - + line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not Match(r'^\s*{ //', line) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + # There should always be a space between the // and the comment + commentend = commentpos + 2 + if commentend < len(line) and not line[commentend] == ' ': + # but some lines are exceptions -- e.g. 
if they're big + # comment delimiters like: + # //---------------------------------------------------------- + # or are an empty C++ style Doxygen comment, like: + # /// + # or they begin with multiple slashes followed by a space: + # //////// Header comment + match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or + Search(r'^/$', line[commentend:]) or + Search(r'^/+ ', line[commentend:])) + if not match: + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') + CheckComment(line[commentpos:], filename, linenum, error) + + line = clean_lines.elided[linenum] # get rid of comments and strings + + # Don't try to do spacing checks for operator methods + line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) + if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line): + error(filename, linenum, 'whitespace/operators', 4, + 'Missing spaces around =') + + # It's ok not to have spaces around binary operators like + - * /, but if + # there's too little whitespace, we get concerned. It's hard to tell, + # though, so we punt on this one for now. TODO. + + # You should always have whitespace around binary operators. + # + # Check <= and >= first to avoid false positives with < and >, then + # check non-include lines for spacing around < and >. + match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line) + if match: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around %s' % match.group(1)) + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + match = Search(r'(\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line) + if match and not (match.group(1).isdigit() and match.group(2).isdigit()): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') + elif not Match(r'#.*include', line): + # Avoid false positives on -> + reduced_line = line.replace('->', '') + + # Look for < that is not surrounded by spaces. This is only + # triggered if both sides are missing spaces, even though + # technically should should flag if at least one side is missing a + # space. This is done to avoid some false positives with shifts. + match = Search(r'[^\s<]<([^\s=<].*)', reduced_line) + if (match and + not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') + + # Look for > that is not surrounded by spaces. Similar to the + # above, we only trigger if both sides are missing spaces to avoid + # false positives with shifts. + match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line) + if (match and + not FindPreviousMatchingAngleBracket(clean_lines, linenum, + match.group(1))): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around >> for almost anything. This is because + # C++11 allows ">>" to close nested templates, which accounts for + # most cases when ">>" is not followed by a space. 
+ # + # We still warn on ">>" followed by alpha character, because that is + # likely due to ">>" being used for right shifts, e.g.: + # value >> alpha + # + # When ">>" is used to close templates, the alphanumeric letter that + # follows would be part of an identifier, and there should still be + # a space separating the template type and the identifier. + # type> alpha + match = Search(r'>>[a-zA-Z_]', line) + if match: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >>') + + # There shouldn't be space around unary operators + match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) + if match: + error(filename, linenum, 'whitespace/operators', 4, + 'Extra space for operator %s' % match.group(1)) + + # A pet peeve of mine: no spaces after an if, while, switch, or for + match = Search(r' (if\(|for\(|while\(|switch\()', line) + if match: + error(filename, linenum, 'whitespace/parens', 5, + 'Missing space before ( in %s' % match.group(1)) + + # For if/for/while/switch, the left and right parens should be + # consistent about how many spaces are inside the parens, and + # there should either be zero or one spaces inside the parens. + # We don't want: "if ( foo)" or "if ( foo )". + # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. + match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', + line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if not len(match.group(2)) in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) + + # You should always have a space after a comma (either as fn arg or operator) + if Search(r',[^\s]', line): + error(filename, linenum, 'whitespace/comma', 3, + 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') + + # Next we will look for issues with function calls. + CheckSpacingForFunctionCall(filename, line, linenum, error) + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + if Search(r'[^ ({]{', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. + if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'new char * []'. + if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # You shouldn't have a space before a semicolon at the end of the line. + # There's a special case for "for" since the style guide allows space before + # the semicolon there. 
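+  # For illustration: "default: ;" and a line containing only ";" are reported
+  # as empty statements below, and "x = y ;" draws the extra-space warning,
+  # while a trailing " ;" on a line containing "for" is excused.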
+ if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and + not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search('for *\(.*[^:]:[^: ]', line) or + Search('for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): + """Checks for additional blank line issues related to sections. + + Currently the only thing checked here is blank line before protected/private. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + class_info: A _ClassInfo objects. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. + if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % matched.group(1)) + + +def GetPreviousNonBlankLine(clean_lines, linenum): + """Return the most recent non-blank line and its line number. + + Args: + clean_lines: A CleansedLines instance containing the file contents. + linenum: The number of the line to check. + + Returns: + A tuple with two elements. 
The first element is the contents of the last + non-blank line before the current line, or the empty string if this is the + first non-blank line. The second is the line number of that line, or -1 + if this is the first non-blank line. + """ + + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) + + +def CheckBraces(filename, clean_lines, linenum, error): + """Looks for misplaced braces (e.g. at the end of line). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone + # is using braces in a block to explicitly create a new scope, + # which is commonly used to control the lifetime of + # stack-allocated variables. We don't detect this perfectly: we + # just don't complain if the last non-whitespace character on the + # previous non-blank line is ';', ':', '{', or '}', or if the previous + # line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[;:}{]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. + if Match(r'\s*else\s*', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! + if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + if endline[endpos:].find('{') == -1: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + else: # common case: else not followed by a multi-line if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Braces shouldn't be followed by a ; unless they're defining a struct + # or initializing an array. + # We can't tell in general, but we can for some common cases. 
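+  # For illustration (placeholder function name): "if (x) { DoSomething(); };"
+  # is flagged below, whereas "struct Point { int x, y; };" and
+  # "int a[] = { 1, 2 };" are excused by the struct/class/enum/"= {" screen.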
+ prevlinenum = linenum + while True: + (prevline, prevlinenum) = GetPreviousNonBlankLine(clean_lines, prevlinenum) + if Match(r'\s+{.*}\s*;', line) and not prevline.count(';'): + line = prevline + line + else: + break + if (Search(r'{.*}\s*;', line) and + line.count('{') == line.count('}') and + not Search(r'struct|class|enum|\s*=\s*{', line)): + error(filename, linenum, 'readability/braces', 4, + "You don't need a ; after a }") + + +def CheckEmptyLoopBody(filename, clean_lines, linenum, error): + """Loop for empty loop body with only a single semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Search for loop keywords at the beginning of the line. Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + line = clean_lines.elided[linenum] + if Match(r'\s*(for|while)\s*\(', line): + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression( + clean_lines, linenum, line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') + + +def ReplaceableCheck(operator, macro, line): + """Determine whether a basic CHECK can be replaced with a more specific one. + + For example suggest using CHECK_EQ instead of CHECK(a == b) and + similarly for CHECK_GE, CHECK_GT, CHECK_LE, CHECK_LT, CHECK_NE. + + Args: + operator: The C++ operator used in the CHECK. + macro: The CHECK or EXPECT macro being called. + line: The current source line. + + Returns: + True if the CHECK can be replaced with a more specific one. + """ + + # This matches decimal and hex integers, strings, and chars (in that order). + match_constant = r'([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')' + + # Expression to match two sides of the operator with something that + # looks like a literal, since CHECK(x == iterator) won't compile. + # This means we can't catch all the cases where a more specific + # CHECK is possible, but it's less annoying than dealing with + # extraneous warnings. + match_this = (r'\s*' + macro + r'\((\s*' + + match_constant + r'\s*' + operator + r'[^<>].*|' + r'.*[^<>]' + operator + r'\s*' + match_constant + + r'\s*\))') + + # Don't complain about CHECK(x == NULL) or similar because + # CHECK_EQ(x, NULL) won't compile (requires a cast). + # Also, don't complain about more complex boolean expressions + # involving && or || such as CHECK(a == b || c == d). + return Match(match_this, line) and not Search(r'NULL|&&|\|\|', line) + + +def CheckCheck(filename, clean_lines, linenum, error): + """Checks the use of CHECK and EXPECT macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + + # Decide the set of replacement macros that should be suggested + raw_lines = clean_lines.raw_lines + current_macro = '' + for macro in _CHECK_MACROS: + if raw_lines[linenum].find(macro) >= 0: + current_macro = macro + break + if not current_macro: + # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + return + + line = clean_lines.elided[linenum] # get rid of comments and strings + + # Encourage replacing plain CHECKs with CHECK_EQ/CHECK_NE/etc. + for operator in ['==', '!=', '>=', '>', '<=', '<']: + if ReplaceableCheck(operator, current_macro, line): + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % ( + _CHECK_REPLACEMENT[current_macro][operator], + current_macro, operator)) + break + + +def CheckAltTokens(filename, clean_lines, linenum, error): + """Check alternative keywords being used in boolean expressions. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return + + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return + + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + + +def GetLineWidth(line): + """Determines the width of the line in column positions. + + Args: + line: A string, which may be a Unicode string. + + Returns: + The width of the line in column positions, accounting for Unicode + combining characters and wide characters. + """ + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) + + +def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, + error): + """Checks rules from the 'C++ style rules' section of cppguide.html. + + Most of these rules are hard to test (naming, comment style), but we + do what we can. In particular we check for 2-space indents, line lengths, + tab usage, spaces inside code, etc. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + raw_lines = clean_lines.raw_lines + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. 
+ # NOTE: here are the conditions rob pike used for his tests. Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for labels + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + # Labels should always be indented at least one space. + elif not initial_spaces and line[:2] != '//' and Search(r'[^:]:\s*$', + line): + error(filename, linenum, 'whitespace/labels', 4, + 'Labels should always be indented at least one space. ' + 'If this is a member-initializer list in a constructor or ' + 'the base class list in a class definition, the colon should ' + 'be on the following line.') + + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. + # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. + if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + if line_width > 100: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than 100 characters') + elif line_width > 80: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= 80 characters long') + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). 
+ cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckEmptyLoopBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + + +_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') +_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') +# Matches the first component of a filename delimited by -s and _s. That is: +# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' +_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') + + +def _DropCommonSuffixes(filename): + """Drops common suffixes like _test.cc or -inl.h from filename. + + For example: + >>> _DropCommonSuffixes('foo/foo-inl.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/bar/foo.cc') + 'foo/bar/foo' + >>> _DropCommonSuffixes('foo/foo_internal.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') + 'foo/foo_unusualinternal' + + Args: + filename: The input filename. + + Returns: + The filename with the common suffix removed. + """ + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', + 'inl.h', 'impl.h', 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] + + +def _IsTestFilename(filename): + """Determines if the given filename has a suffix that identifies it as a test. + + Args: + filename: The input filename. + + Returns: + True if 'filename' looks like a test, False otherwise. + """ + if (filename.endswith('_test.cc') or + filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False + + +def _ClassifyInclude(fileinfo, include, is_system): + """Figures out what kind of header 'include' is. + + Args: + fileinfo: The current file cpplint is running over. A FileInfo instance. + include: The path to a #included file. + is_system: True if the #include used <> rather than "". + + Returns: + One of the _XXX_HEADER constants. + + For example: + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) + _C_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) + _CPP_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) + _LIKELY_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), + ... 
'bar/foo_other_ext.h', False) + _POSSIBLE_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) + _OTHER_HEADER + """ + # This is a list of all standard c++ header files, except + # those already checked for above. + is_stl_h = include in _STL_HEADERS + is_cpp_h = is_stl_h or include in _CPP_HEADERS + + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. + target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER + + + +def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): + """Check rules that are applicable to #include lines. + + Strings on #include lines are NOT removed from elided line, to make + certain tasks easier. However, to prevent false positives, checks + applicable to #include lines in CheckLanguage must be put here. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + include_state: An _IncludeState instance in which the headers are inserted. + error: The function to call with any errors found. + """ + fileinfo = FileInfo(filename) + + line = clean_lines.lines[linenum] + + # dsm: Disable, I don't care + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + #if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + # error(filename, linenum, 'build/include', 4, + # 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + if include in include_state: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, include_state[include])) + else: + include_state[include] = linenum + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. 
Should be: %s.h, c system, c++ system, other.' % + (error_message, fileinfo.BaseName())) + if not include_state.IsInAlphabeticalOrder(include): + error(filename, linenum, 'build/include_alpha', 4, + 'Include "%s" not in alphabetical order' % include) + + # Look for any of the stream classes that are part of standard C++. + match = _RE_PATTERN_INCLUDE.match(line) + if match: + include = match.group(2) + if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include): + # Many unit tests use cout, so we exempt them. + if not _IsTestFilename(filename): + error(filename, linenum, 'readability/streams', 3, + 'Streams are highly discouraged.') + + +def _GetTextInside(text, start_pattern): + """Retrieves all the text between matching open and close parentheses. + + Given a string of lines and a regular expression string, retrieve all the text + following the expression and between opening punctuation symbols like + (, [, or {, and the matching close-punctuation symbol. This properly nested + occurrences of the punctuations, so for the text like + printf(a(), b(c())); + a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'. + start_pattern must match string having an open punctuation symbol at the end. + + Args: + text: The lines to extract text. Its comments and strings must be elided. + It can be single line and can span multiple lines. + start_pattern: The regexp string indicating where to start extracting + the text. + Returns: + The extracted text. + None if either the opening string or ending punctuation could not be found. + """ + # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably + # rewritten to use _GetTextInside (and use inferior regexp matching today). + + # Give opening punctuations to get the matching close-punctuations. + matching_punctuation = {'(': ')', '{': '}', '[': ']'} + closing_punctuation = set(matching_punctuation.itervalues()) + + # Find the position to start extracting text. + match = re.search(start_pattern, text, re.M) + if not match: # start_pattern not found in text. + return None + start_position = match.end(0) + + assert start_position > 0, ( + 'start_pattern must ends with an opening punctuation.') + assert text[start_position - 1] in matching_punctuation, ( + 'start_pattern must ends with an opening punctuation.') + # Stack of closing punctuations we expect to have in text after position. + punctuation_stack = [matching_punctuation[text[start_position - 1]]] + position = start_position + while punctuation_stack and position < len(text): + if text[position] == punctuation_stack[-1]: + punctuation_stack.pop() + elif text[position] in closing_punctuation: + # A closing punctuation without matching opening punctuations. + return None + elif text[position] in matching_punctuation: + punctuation_stack.append(matching_punctuation[text[position]]) + position += 1 + if punctuation_stack: + # Opening punctuations left without matching close-punctuations. + return None + # punctuations match. + return text[start_position:position - 1] + + +def CheckLanguage(filename, clean_lines, linenum, file_extension, include_state, + error): + """Checks rules from the 'C++ language rules' section of cppguide.html. + + Some of these rules are hard to test (function overloading, using + uint32 inappropriately), but we do the best we can. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. 
+ file_extension: The extension (without the dot) of the filename. + include_state: An _IncludeState instance in which the headers are inserted. + error: The function to call with any errors found. + """ + # If the line is empty or consists of entirely a comment, no need to + # check it. + line = clean_lines.elided[linenum] + if not line: + return + + match = _RE_PATTERN_INCLUDE.search(line) + if match: + CheckIncludeLine(filename, clean_lines, linenum, include_state, error) + return + + # Create an extended_line, which is the concatenation of the current and + # next lines, for more effective checking of code that may span more than one + # line. + if linenum + 1 < clean_lines.NumLines(): + extended_line = line + clean_lines.elided[linenum + 1] + else: + extended_line = line + + # Make Windows paths like Unix. + fullname = os.path.abspath(filename).replace('\\', '/') + + # TODO(unknown): figure out if they're using default arguments in fn proto. + + # Check for non-const references in functions. This is tricky because & + # is also used to take the address of something. We allow <> for templates, + # (ignoring whatever is between the braces) and : for classes. + # These are complicated re's. They try to capture the following: + # paren (for fn-prototype start), typename, &, varname. For the const + # version, we're willing for const to be before typename or after + # Don't check the implementation on same line. + fnline = line.split('{', 1)[0] + if (len(re.findall(r'\([^()]*\b(?:[\w:]|<[^()]*>)+(\s?&|&\s?)\w+', fnline)) > + len(re.findall(r'\([^()]*\bconst\s+(?:typename\s+)?(?:struct\s+)?' + r'(?:[\w:]|<[^()]*>)+(\s?&|&\s?)\w+', fnline)) + + len(re.findall(r'\([^()]*\b(?:[\w:]|<[^()]*>)+\s+const(\s?&|&\s?)[\w]+', + fnline))): + + # We allow non-const references in a few standard places, like functions + # called "swap()" or iostream operators like "<<" or ">>". We also filter + # out for loops, which lint otherwise mistakenly thinks are functions. + if not Search( + r'(for|swap|Swap|operator[<>][<>])\s*\(\s*' + r'(?:(?:typename\s*)?[\w:]|<.*>)+\s*&', + fnline): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer.') + + # Check to see if they're using an conversion function cast. + # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. + match = Search( + r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there + r'(int|float|double|bool|char|int32|uint32|int64|uint64)\([^)]', line) + if match: + # gMock methods are defined using some variant of MOCK_METHODx(name, type) + # where type may be float(), int(string), etc. Without context they are + # virtually indistinguishable from int(x) casts. Likewise, gMock's + # MockCallback takes a template parameter of the form return_type(arg_type), + # which looks much like the cast we're trying to detect. 
+ if (match.group(1) is None and # If new operator, then this isn't a cast + not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + Match(r'^\s*MockCallback<.*>', line))): + # Try a bit harder to catch gmock lines: the only place where + # something looks like an old-style cast is where we declare the + # return type of the mocked method, and the only time when we + # are missing context is if MOCK_METHOD was split across + # multiple lines (for example http://go/hrfhr ), so we only need + # to check the previous line for MOCK_METHOD. + if (linenum == 0 or + not Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(\S+,\s*$', + clean_lines.elided[linenum - 1])): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % + match.group(2)) + + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + + # This doesn't catch all cases. Consider (const char * const)"hello". + # + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. + if Search( + r'(&\([^)]+\)[\w(])|(&(static|dynamic|reinterpret)_cast\b)', line): + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match( + r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + # Make sure it's not a function. + # Function template specialization looks like: "string foo(...". + # Class template definitions look like: "string Foo::Method(...". + if match and not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', + match.group(3)): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % + (match.group(1), match.group(2))) + + # Check that we're not using RTTI outside of testing code. + if Search(r'\bdynamic_cast<', line) and not _IsTestFilename(filename): + error(filename, linenum, 'runtime/rtti', 5, + 'Do not use dynamic_cast<>. If you need to cast within a class ' + "hierarchy, use static_cast<> to upcast. Google doesn't support " + 'RTTI.') + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. 
The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\b', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\b', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + if Search(r'\bsscanf\b', line): + error(filename, linenum, 'runtime/printf', 1, + 'sscanf can be ok, but is slow and can overflow buffers.') + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(sugawarayu): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. + # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', + line, re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' + % (function_name, match.group(1))) + + # Check for potential memset bugs like memset(buf, sizeof(buf), 0). + match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) + if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): + error(filename, linenum, 'runtime/memset', 4, + 'Did you mean "memset(%s, 0, %s)"?' + % (match.group(1), match.group(2))) + + if Search(r'\busing namespace\b', line): + error(filename, linenum, 'build/namespaces', 5, + 'Do not use namespace using-directives. ' + 'Use using-declarations instead.') + + # Detect variable-length arrays. 
+ match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) + if (match and match.group(2) != 'return' and match.group(2) != 'delete' and + match.group(3).find(']') == -1): + # Split the size using space and arithmetic operators as delimiters. + # If any of the resulting tokens are not compile time constants then + # report the error. + tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) + is_const = True + skip_next = False + for tok in tokens: + if skip_next: + skip_next = False + continue + + if Search(r'sizeof\(.+\)', tok): continue + if Search(r'arraysize\(\w+\)', tok): continue + + tok = tok.lstrip('(') + tok = tok.rstrip(')') + if not tok: continue + if Match(r'\d+', tok): continue + if Match(r'0[xX][0-9a-fA-F]+', tok): continue + if Match(r'k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue + # A catch all for tricky sizeof cases, including 'sizeof expression', + # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' + # requires skipping the next token because we split on ' ' and '*'. + if tok.startswith('sizeof'): + skip_next = True + continue + is_const = False + break + if not is_const: + error(filename, linenum, 'runtime/arrays', 1, + 'Do not use variable-length arrays. Use an appropriately named ' + "('k' followed by CamelCase) compile-time constant for the size.") + + # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or + # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing + # in the class declaration. + match = Match( + (r'\s*' + r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' + r'\(.*\);$'), + line) + if match and linenum + 1 < clean_lines.NumLines(): + next_line = clean_lines.elided[linenum + 1] + # We allow some, but not all, declarations of variables to be present + # in the statement that defines the class. The [\w\*,\s]* fragment of + # the regular expression below allows users to declare instances of + # the class or pointers to instances, but not less common types such + # as function pointers or arrays. It's a tradeoff between allowing + # reasonable code and avoiding trying to parse more C++ using regexps. + if not Search(r'^\s*}[\w\*,\s]*;', next_line): + error(filename, linenum, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + + # Check for use of unnamed namespaces in header files. Registration + # macros are typically OK, so we allow use of "namespace {" on lines + # that end with backslashes. + if (file_extension == 'h' + and Search(r'\bnamespace\s*{', line) + and line[-1] != '\\'): + error(filename, linenum, 'build/namespaces', 4, + 'Do not use unnamed namespaces in header files. See ' + 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + ' for more information.') + + +def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, + error): + """Checks for a C-style cast by looking for the pattern. + + This also handles sizeof(type) warnings, due to similarity of content. + + Args: + filename: The name of the current file. + linenum: The number of the line to check. + line: The line of code to check. + raw_line: The raw line of code to check, with comments. + cast_type: The string for the C++ cast to recommend. This is either + reinterpret_cast, static_cast, or const_cast, depending. + pattern: The regular expression used to find C-style casts. + error: The function to call with any errors found. 
+
+  Returns:
+    True if an error was emitted.
+    False otherwise.
+  """
+  match = Search(pattern, line)
+  if not match:
+    return False
+
+  # e.g., sizeof(int)
+  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
+  if sizeof_match:
+    error(filename, linenum, 'runtime/sizeof', 1,
+          'Using sizeof(type). Use sizeof(varname) instead if possible')
+    return True
+
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
+  remainder = line[match.end(0):]
+
+  # The close paren is for function pointers as arguments to a function.
+  # eg, void foo(void (*bar)(int));
+  # The semicolon check is a more basic function check; also possibly a
+  # function pointer typedef.
+  # eg, void foo(int); or void foo(int) const;
+  # The equals check is for function pointer assignment.
+  # eg, void *(*foo)(int) = ...
+  # The > is for MockCallback<...> ...
+  #
+  # Right now, this will only catch cases where there's a single argument, and
+  # it's unnamed. It should probably be expanded to check for multiple
+  # arguments with some unnamed.
+  function_match = Match(r'\s*(\)|=|(const)?\s*(;|\{|throw\(\)|>))', remainder)
+  if function_match:
+    if (not function_match.group(3) or
+        function_match.group(3) == ';' or
+        ('MockCallback<' not in raw_line and
+         '/*' not in raw_line)):
+      error(filename, linenum, 'readability/function', 3,
+            'All parameters should be named in a function')
+    return True
+
+  # At this point, all that should be left is actual casts.
+  error(filename, linenum, 'readability/casting', 4,
+        'Using C-style cast. Use %s<%s>(...) instead' %
+        (cast_type, match.group(1)))
+
+  return True
+
+
+_HEADERS_CONTAINING_TEMPLATES = (
+    ('<deque>', ('deque',)),
+    ('<functional>', ('unary_function', 'binary_function',
+                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
+                      'negate',
+                      'equal_to', 'not_equal_to', 'greater', 'less',
+                      'greater_equal', 'less_equal',
+                      'logical_and', 'logical_or', 'logical_not',
+                      'unary_negate', 'not1', 'binary_negate', 'not2',
+                      'bind1st', 'bind2nd',
+                      'pointer_to_unary_function',
+                      'pointer_to_binary_function',
+                      'ptr_fun',
+                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+                      'mem_fun_ref_t',
+                      'const_mem_fun_t', 'const_mem_fun1_t',
+                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+                      'mem_fun_ref',
+                      )),
+    ('<limits>', ('numeric_limits',)),
+    ('<list>', ('list',)),
+    ('<map>', ('map', 'multimap',)),
+    ('<memory>', ('allocator',)),
+    ('<queue>', ('queue', 'priority_queue',)),
+    ('<set>', ('set', 'multiset',)),
+    ('<stack>', ('stack',)),
+    ('<string>', ('char_traits', 'basic_string',)),
+    ('<utility>', ('pair',)),
+    ('<vector>', ('vector',)),
+
+    # gcc extensions.
+    # Note: std::hash is their hash, ::hash is our hash
+    ('<hash_map>', ('hash_map', 'hash_multimap',)),
+    ('<hash_set>', ('hash_set', 'hash_multiset',)),
+    ('<slist>', ('slist',)),
+    )
+
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+
+_re_pattern_algorithm_header = []
+for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
+                  'transform'):
+  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+  # type::max().
+  _re_pattern_algorithm_header.append(
+      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+       _template,
+       '<algorithm>'))
+
+_re_pattern_templates = []
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+  for _template in _templates:
+    _re_pattern_templates.append(
+        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+         _template + '<>',
+         _header))
+
+
+def FilesBelongToSameModule(filename_cc, filename_h):
+  """Check if these two filenames belong to the same module.
+ + The concept of a 'module' here is a as follows: + foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the + same 'module' if they are in the same directory. + some/path/public/xyzzy and some/path/internal/xyzzy are also considered + to belong to the same module here. + + If the filename_cc contains a longer path than the filename_h, for example, + '/absolute/path/to/base/sysinfo.cc', and this file would include + 'base/sysinfo.h', this function also produces the prefix needed to open the + header. This is used by the caller of this function to more robustly open the + header file. We don't have access to the real include paths in this context, + so we need this guesswork here. + + Known bugs: tools/base/bar.cc and base/bar.h belong to the same module + according to this implementation. Because of this, this function gives + some false positives. This should be sufficiently rare in practice. + + Args: + filename_cc: is the path for the .cc file + filename_h: is the path for the header path + + Returns: + Tuple with a bool and a string: + bool: True if filename_cc and filename_h belong to the same module. + string: the additional prefix needed to open the header file. + """ + + if not filename_cc.endswith('.cc'): + return (False, '') + filename_cc = filename_cc[:-len('.cc')] + if filename_cc.endswith('_unittest'): + filename_cc = filename_cc[:-len('_unittest')] + elif filename_cc.endswith('_test'): + filename_cc = filename_cc[:-len('_test')] + filename_cc = filename_cc.replace('/public/', '/') + filename_cc = filename_cc.replace('/internal/', '/') + + if not filename_h.endswith('.h'): + return (False, '') + filename_h = filename_h[:-len('.h')] + if filename_h.endswith('-inl'): + filename_h = filename_h[:-len('-inl')] + filename_h = filename_h.replace('/public/', '/') + filename_h = filename_h.replace('/internal/', '/') + + files_belong_to_same_module = filename_cc.endswith(filename_h) + common_path = '' + if files_belong_to_same_module: + common_path = filename_cc[:-len(filename_h)] + return files_belong_to_same_module, common_path + + +def UpdateIncludeState(filename, include_state, io=codecs): + """Fill up the include_state with new includes found from the file. + + Args: + filename: the name of the header to read. + include_state: an _IncludeState instance in which the headers are inserted. + io: The io factory to use to read the file. Provided for testability. + + Returns: + True if a header was succesfully added. False otherwise. + """ + headerfile = None + try: + headerfile = io.open(filename, 'r', 'utf8', 'replace') + except IOError: + return False + linenum = 0 + for line in headerfile: + linenum += 1 + clean_line = CleanseComments(line) + match = _RE_PATTERN_INCLUDE.search(clean_line) + if match: + include = match.group(2) + # The value formatting is cute, but not really used right now. + # What matters here is that the key is in include_state. + include_state.setdefault(include, '%s:%d' % (filename, linenum)) + return True + + +def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, + io=codecs): + """Reports for missing stl includes. + + This function will output warnings to make sure you are including the headers + necessary for the stl containers and functions that you use. We only give one + reason to include a header. For example, if you use both equal_to<> and + less<> in a .h file, only one (the latter in the file) of these will be + reported as a reason to include the . + + Args: + filename: The name of the current file. 
+ clean_lines: A CleansedLines instance containing the file. + include_state: An _IncludeState instance. + error: The function to call with any errors found. + io: The IO factory to use to read the header file. Provided for unittest + injection. + """ + required = {} # A map of header name to linenumber and the template entity. + # Example of required: { '': (1219, 'less<>') } + + for linenum in xrange(clean_lines.NumLines()): + line = clean_lines.elided[linenum] + if not line or line[0] == '#': + continue + + # String is special -- it is a non-templatized type in STL. + matched = _RE_PATTERN_STRING.search(line) + if matched: + # Don't warn about strings in non-STL namespaces: + # (We check only the first match per line; good enough.) + prefix = line[:matched.start()] + if prefix.endswith('std::') or not prefix.endswith('::'): + required[''] = (linenum, 'string') + + for pattern, template, header in _re_pattern_algorithm_header: + if pattern.search(line): + required[header] = (linenum, template) + + # The following function is just a speed up, no semantics are changed. + if not '<' in line: # Reduces the cpu time usage by skipping lines. + continue + + for pattern, template, header in _re_pattern_templates: + if pattern.search(line): + required[header] = (linenum, template) + + # The policy is that if you #include something in foo.h you don't need to + # include it again in foo.cc. Here, we will look at possible includes. + # Let's copy the include_state so it is only messed up within this function. + include_state = include_state.copy() + + # Did we find the header for this file (if any) and succesfully load it? + header_found = False + + # Use the absolute path so that matching works properly. + abs_filename = FileInfo(filename).FullName() + + # For Emacs's flymake. + # If cpplint is invoked from Emacs's flymake, a temporary file is generated + # by flymake and that file name might end with '_flymake.cc'. In that case, + # restore original file name here so that the corresponding header file can be + # found. + # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h' + # instead of 'foo_flymake.h' + abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) + + # include_state is modified during iteration, so we iterate over a copy of + # the keys. + header_keys = include_state.keys() + for header in header_keys: + (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) + fullpath = common_path + header + if same_module and UpdateIncludeState(fullpath, include_state, io): + header_found = True + + # If we can't find the header file for a .cc, assume it's because we don't + # know where to look. In that case we'll give up as we're not sure they + # didn't include it in the .h file. + # TODO(unknown): Do a better job of finding .h files so we are confident that + # not having the .h file means there isn't one. + if filename.endswith('.cc') and not header_found: + return + + # All the lines have been processed, report the errors found. 
+ for required_header_unstripped in required: + template = required[required_header_unstripped][1] + if required_header_unstripped.strip('<>"') not in include_state: + error(filename, required[required_header_unstripped][0], + 'build/include_what_you_use', 4, + 'Add #include ' + required_header_unstripped + ' for ' + template) + + +_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<') + + +def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): + """Check that make_pair's template arguments are deduced. + + G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are + specified explicitly, and such use isn't intended in any case. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + raw = clean_lines.raw_lines + line = raw[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error(filename, linenum, 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') + + +def ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions=[]): + """Processes a single line in the file. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + clean_lines: An array of strings, each representing a line of the file, + with comments stripped. + line: Number of line being processed. + include_state: An _IncludeState instance in which the headers are inserted. + function_state: A _FunctionState instance which counts function lines, etc. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: + return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + error) + CheckForNonStandardConstructs(filename, clean_lines, line, + nesting_state, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + +def ProcessFileData(filename, file_extension, lines, error, + extra_check_functions=[]): + """Performs lint checks and reports any errors to the given error function. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. 
+ lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) + + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = _NestingState() + + ResetNolintSuppressions() + + CheckForCopyright(filename, lines, error) + + if file_extension == 'h': + CheckForHeaderGuard(filename, lines, error) + + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions) + nesting_state.CheckClassFinished(filename, error) + + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForUnicodeReplacementCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) + +def ProcessFile(filename, vlevel, extra_check_functions=[]): + """Does google-lint on a single file. + + Args: + filename: The name of the file to parse. + + vlevel: The level of errors to report. Every error of confidence + >= verbose_level will be reported. 0 is a good default. + + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + + _SetVerboseLevel(vlevel) + + try: + # Support the UNIX convention of using "-" for stdin. Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. If it is not expected to be present (i.e. os.linesep != + # '\r\n' as in Windows), a warning is issued below if this file + # is processed. + + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') + + carriage_return_found = False + # Remove trailing '\r'. + for linenum in range(len(lines)): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + carriage_return_found = True + + except IOError: + sys.stderr.write( + "Skipping input '%s': Can't open for reading\n" % filename) + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. 
+ if (filename != '-' and file_extension != 'cc' and file_extension != 'h' + and file_extension != 'cpp'): + sys.stderr.write('Ignoring %s; not a .cc or .h file\n' % filename) + else: + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) + if carriage_return_found and os.linesep != '\r\n': + # Use 0 for linenum since outputting only one error for potentially + # several lines. + Error(filename, 0, 'whitespace/newline', 1, + 'One or more unexpected \\r (^M) found;' + 'better to use only a \\n') + + sys.stderr.write('Done processing %s\n' % filename) + + +def PrintUsage(message): + """Prints a brief usage string and exits, optionally with an error message. + + Args: + message: The optional error message. + """ + sys.stderr.write(_USAGE) + if message: + sys.exit('\nFATAL ERROR: ' + message) + else: + sys.exit(1) + + +def PrintCategories(): + """Prints a list of all the error-categories used by error messages. + + These are the categories used to filter messages via --filter. + """ + sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) + sys.exit(0) + + +def ParseArguments(args): + """Parses the command line arguments. + + This may set the output format and verbosity level as side-effects. + + Args: + args: The command line arguments: + + Returns: + The list of filenames to lint. + """ + try: + (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', + 'counting=', + 'filter=', + 'root=']) + except getopt.GetoptError: + PrintUsage('Invalid arguments.') + + verbosity = _VerboseLevel() + output_format = _OutputFormat() + filters = '' + counting_style = '' + + for (opt, val) in opts: + if opt == '--help': + PrintUsage(None) + elif opt == '--output': + if not val in ('emacs', 'vs7'): + PrintUsage('The only allowed output formats are emacs and vs7.') + output_format = val + elif opt == '--verbose': + verbosity = int(val) + elif opt == '--filter': + filters = val + if not filters: + PrintCategories() + elif opt == '--counting': + if val not in ('total', 'toplevel', 'detailed'): + PrintUsage('Valid counting options are total, toplevel, and detailed') + counting_style = val + elif opt == '--root': + global _root + _root = val + + if not filenames: + PrintUsage('No files were specified.') + + _SetOutputFormat(output_format) + _SetVerboseLevel(verbosity) + _SetFilters(filters) + _SetCountingStyle(counting_style) + + return filenames + + +def main(): + filenames = ParseArguments(sys.argv[1:]) + + # Change stderr to write with replacement characters so we don't die + # if we try to print something containing non-ASCII characters. + sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace') + + _cpplint_state.ResetErrorCounts() + for filename in filenames: + ProcessFile(filename, _cpplint_state.verbose_level) + _cpplint_state.PrintErrorCounts() + + sys.exit(_cpplint_state.error_count > 0) + + +if __name__ == '__main__': + main() diff --git a/misc/ffControl.py b/misc/ffControl.py new file mode 100755 index 00000000..d4d9dd72 --- /dev/null +++ b/misc/ffControl.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +# Copyright (C) 2013-2014 by Massachusetts Institute of Technology +# +# This file is part of zsim. +# +# zsim is free software; you can redistribute it and/or modify it under the +# terms of the GNU General Public License as published by the Free Software +# Foundation, version 2. 
+# +# If you use this software in your research, we request that you reference +# the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of +# Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the +# source of the simulator in any publications that use this software, and that +# you send us a citation of your work. +# +# zsim is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +import os, sys, subprocess +from optparse import OptionParser + +parser = OptionParser() +parser.add_option("--procIdx", type="int", default=0, dest="procIdx", help="Process index to signal") +parser.add_option("--lineMatch", default=" ROI", dest="lineMatch", help="Matching line to stdin will trigger signal") +parser.add_option("--maxMatches", type="int", default=0, dest="maxMatches", help="Exit after this many matches (0 to disable)") +parser.add_option("--fftogglePath", default="./build/opt", dest="fftogglePath", help="") +(opts, args) = parser.parse_args() + +targetShmid = -1 +matches = 0 +while matches < opts.maxMatches or opts.maxMatches <= 0: + try: + line = sys.stdin.readline() + except: + print "stdin done, exiting" + break + + if line.startswith("[H] Global segment shmid = "): + targetShmid = int(line.split("=")[1].lstrip().rstrip()) + print "Target shmid is", targetShmid + + if line.find(opts.lineMatch) >= 0: + if targetShmid >= 0: + print "Match, calling fftoggle" + matches += 1 + subprocess.call([os.path.join(opts.fftogglePath, "fftoggle"), str(targetShmid), str(opts.procIdx)]) + else: + print "Match but shmid is not valid, not sending signal (are you sure you specified procIdx correctly? 
it's not the PID)" +print "Done, %d matches" % matches + diff --git a/misc/gitver.py b/misc/gitver.py new file mode 100644 index 00000000..6068b7e5 --- /dev/null +++ b/misc/gitver.py @@ -0,0 +1,14 @@ +# Return a pretty-printed short git version (like hg/svnversion) +import os +def cmd(c): return os.popen(c).read().strip() +branch = cmd("git rev-parse --abbrev-ref HEAD") +revnum = cmd("git log | grep ^commit | wc -l") +rshort = cmd("git rev-parse --short HEAD") +dfstat = cmd("git diff HEAD --shortstat") +dfhash = cmd("git diff HEAD | md5sum")[:8] +shstat = dfstat.replace(" files changed", "fc").replace(" file changed", "fc") \ + .replace(" insertions(+)", "+").replace(" insertion(+)", "+") \ + .replace(" deletions(-)", "-").replace(" deletion(-)", "-") \ + .replace(",", "") +diff = "clean" if len(dfstat) == 0 else shstat + " " + dfhash +print ":".join([branch, revnum, rshort, diff]) diff --git a/misc/hooks/Makefile b/misc/hooks/Makefile new file mode 100644 index 00000000..766ebc77 --- /dev/null +++ b/misc/hooks/Makefile @@ -0,0 +1,40 @@ +#JDK_PATH=../jdk1.5.0_22 +JDK_PATH=/usr/lib/jvm/java-7-oracle + +# Common deps +DEPS=Makefile zsim_hooks.h + +default: test_c test_cpp test_fortran test_java + +libfortran_hooks.a: $(DEPS) + gcc -O3 -g -fPIC -o fortran_hooks.o -c fortran_hooks.c + ar rcs libfortran_hooks.a fortran_hooks.o + ranlib libfortran_hooks.a + +zsim.class: $(DEPS) zsim_jni.cpp zsim.java + $(JDK_PATH)/bin/javah -o zsim_jni.h zsim # generates header from zsim.java + g++ -O3 -g -std=c++0x -shared -fPIC -o libzsim_jni.so zsim_jni.cpp -I$(JDK_PATH)/include -I$(JDK_PATH)/include/linux + $(JDK_PATH)/bin/javac zsim.java + #$(JDK_PATH)/bin/jar cf zsim.jar zsim.class + +test_c: $(DEPS) test.c + gcc -O3 -g -o test_c test.c + +test_cpp: $(DEPS) test.cpp + g++ -O3 -g -o test_cpp test.cpp + +test_fortran: $(DEPS) test.f libfortran_hooks.a + gfortran -o test_fortran test.f -L. -lfortran_hooks + +test_java: $(DEPS) test.java zsim.class + $(JDK_PATH)/bin/javac test.java + +#run_tests: test_c test_cpp test_fortran test_java +run_tests: + ./test_c + ./test_cpp + ./test_fortran + java -Djava.library.path=. test + +clean: + rm -f *.o *.so *.a *.jar *.class test_* zsim_jni.h diff --git a/misc/hooks/README b/misc/hooks/README new file mode 100644 index 00000000..c2478abc --- /dev/null +++ b/misc/hooks/README @@ -0,0 +1,5 @@ +zsim hooks for different programming languages. Use these to control simulation +(e.g., fast-forwarding). C and C++ programs just need to include zsim-hooks.h. +Fortran programs need to be linked with libfortran-hooks.a. Java programs can +use the provided zsim JNI class. Run make run_tests and explore the Makefile +for more info. 
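As a concrete illustration of the usage pattern the README describes, here is a minimal C sketch (not part of the patch itself): it assumes zsim_hooks.h from this commit is on the include path, lets everything before the region of interest run in fast-forward, and emits periodic heartbeats inside the ROI. The file name (roi_example.c) and the loop body are made up for illustration only; the bundled test programs below exercise the same calls.

    /* roi_example.c -- illustrative sketch; see misc/hooks/test.c below for the bundled test. */
    #include <stdio.h>
    #include "zsim_hooks.h"

    int main(void) {
        double acc = 0.0;
        int i;
        /* Code before zsim_roi_begin() is typically fast-forwarded by the simulator. */
        zsim_roi_begin();
        for (i = 1; i <= 1000; i++) {
            acc += 1.0 / i;                       /* stand-in for the kernel being measured */
            if (i % 100 == 0) zsim_heartbeat();   /* periodic progress marker */
        }
        zsim_roi_end();                           /* back to fast-forward until the program exits */
        printf("acc = %f\n", acc);
        return 0;
    }

It can be compiled the same way the Makefile above builds test_c (e.g., gcc -O3 -g -o roi_example roi_example.c). Note that the "[HOOKS] ROI begin/end" lines printed by the hooks are the kind of stdout lines ffControl.py above can match (via --lineMatch) to call fftoggle at the right moments.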
diff --git a/misc/hooks/fortran_hooks.c b/misc/hooks/fortran_hooks.c new file mode 100644 index 00000000..c0c7b763 --- /dev/null +++ b/misc/hooks/fortran_hooks.c @@ -0,0 +1,14 @@ +#include "zsim_hooks.h" + +void zsim_roi_begin_() { + zsim_roi_begin(); +} + +void zsim_roi_end_() { + zsim_roi_end(); +} + +void zsim_heartbeat_() { + zsim_heartbeat(); +} + diff --git a/misc/hooks/test.c b/misc/hooks/test.c new file mode 100644 index 00000000..f0a97337 --- /dev/null +++ b/misc/hooks/test.c @@ -0,0 +1,11 @@ +#include +#include "zsim_hooks.h" + +int main() { + printf("C test\n"); + zsim_roi_begin(); + zsim_heartbeat(); + zsim_roi_end(); + printf("C test done\n"); + return 0; +} diff --git a/misc/hooks/test.cpp b/misc/hooks/test.cpp new file mode 100644 index 00000000..15ba7ab3 --- /dev/null +++ b/misc/hooks/test.cpp @@ -0,0 +1,13 @@ +#include +#include "zsim_hooks.h" + +using namespace std; + +int main() { + cout << "C++ test" << endl; + zsim_roi_begin(); + zsim_heartbeat(); + zsim_roi_end(); + cout << "C++ test done" << endl; + return 0; +} diff --git a/misc/hooks/test.f b/misc/hooks/test.f new file mode 100644 index 00000000..377ef5c2 --- /dev/null +++ b/misc/hooks/test.f @@ -0,0 +1,6 @@ + print *, "Fortran test" + call zsim_roi_begin() + call zsim_heartbeat() + call zsim_roi_end() + print *, "Fortran test done" + end diff --git a/misc/hooks/test.java b/misc/hooks/test.java new file mode 100644 index 00000000..cadd0e62 --- /dev/null +++ b/misc/hooks/test.java @@ -0,0 +1,11 @@ + +public class test { + public static void main(String[] args) { + System.out.println("Java test"); + zsim.roi_begin(); + for (int i = 0; i < 42; i++) zsim.heartbeat(); + zsim.roi_end(); + System.out.println("Java test done"); + } +} + diff --git a/misc/hooks/zsim.java b/misc/hooks/zsim.java new file mode 100644 index 00000000..781382a3 --- /dev/null +++ b/misc/hooks/zsim.java @@ -0,0 +1,11 @@ +// package zsim; + +public class zsim { + public static native void roi_begin(); + public static native void roi_end(); + public static native void heartbeat(); + static { + System.loadLibrary("zsim_jni"); + } +} + diff --git a/misc/hooks/zsim_hooks.h b/misc/hooks/zsim_hooks.h new file mode 100644 index 00000000..3bf91789 --- /dev/null +++ b/misc/hooks/zsim_hooks.h @@ -0,0 +1,49 @@ +#ifndef __ZSIM_HOOKS_H__ +#define __ZSIM_HOOKS_H__ + +#include +#include + +//Avoid optimizing compilers moving code around this barrier +#define COMPILER_BARRIER() { __asm__ __volatile__("" ::: "memory");} + +//These need to be in sync with the simulator +#define ZSIM_MAGIC_OP_ROI_BEGIN (1025) +#define ZSIM_MAGIC_OP_ROI_END (1026) +#define ZSIM_MAGIC_OP_REGISTER_THREAD (1027) +#define ZSIM_MAGIC_OP_HEARTBEAT (1028) +#define ZSIM_MAGIC_OP_WORK_BEGIN (1029) //ubik +#define ZSIM_MAGIC_OP_WORK_END (1030) //ubik + +#ifdef __x86_64__ +#define HOOKS_STR "HOOKS" +static inline void zsim_magic_op(uint64_t op) { + COMPILER_BARRIER(); + __asm__ __volatile__("xchg %%rcx, %%rcx;" : : "c"(op)); + COMPILER_BARRIER(); +} +#else +#define HOOKS_STR "NOP-HOOKS" +static inline void zsim_magic_op(uint64_t op) { + //NOP +} +#endif + +static inline void zsim_roi_begin() { + printf("[" HOOKS_STR "] ROI begin\n"); + zsim_magic_op(ZSIM_MAGIC_OP_ROI_BEGIN); +} + +static inline void zsim_roi_end() { + zsim_magic_op(ZSIM_MAGIC_OP_ROI_END); + printf("[" HOOKS_STR "] ROI end\n"); +} + +static inline void zsim_heartbeat() { + zsim_magic_op(ZSIM_MAGIC_OP_HEARTBEAT); +} + +static inline void zsim_work_begin() { zsim_magic_op(ZSIM_MAGIC_OP_WORK_BEGIN); } +static inline void 
zsim_work_end() { zsim_magic_op(ZSIM_MAGIC_OP_WORK_END); } + +#endif /*__ZSIM_HOOKS_H__*/ diff --git a/misc/hooks/zsim_jni.cpp b/misc/hooks/zsim_jni.cpp new file mode 100644 index 00000000..89b7f4f2 --- /dev/null +++ b/misc/hooks/zsim_jni.cpp @@ -0,0 +1,7 @@ +#include +#include "zsim_hooks.h" +#include "zsim_jni.h" // generated by javah + +JNIEXPORT void JNICALL Java_zsim_roi_1begin(JNIEnv *env, jclass cls) { zsim_roi_begin(); } +JNIEXPORT void JNICALL Java_zsim_roi_1end(JNIEnv *env, jclass cls) { zsim_roi_end(); } +JNIEXPORT void JNICALL Java_zsim_heartbeat(JNIEnv *env, jclass cls) { zsim_heartbeat(); } diff --git a/misc/lint_includes.py b/misc/lint_includes.py new file mode 100755 index 00000000..1e0f031f --- /dev/null +++ b/misc/lint_includes.py @@ -0,0 +1,98 @@ +#!/usr/bin/python + +# Copyright (C) 2013-2014 by Massachusetts Institute of Technology +# +# This file is part of zsim. +# +# zsim is free software; you can redistribute it and/or modify it under the +# terms of the GNU General Public License as published by the Free Software +# Foundation, version 2. +# +# If you use this software in your research, we request that you reference +# the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of +# Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the +# source of the simulator in any publications that use this software, and that +# you send us a citation of your work. +# +# zsim is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + + +import os, sys + +#dryRun = True +dryRun = False + +srcs = sys.argv[1:] + +def sortIncludes(lines, fname): + def prefix(l): + if l.find("<") >= 0: + return "2" + # if you want to differentiate... + #if l.find(".h") >= 0: return "2" # C system headers + #else: return "3" # C++ system headers + else: + if l.find('"' + fname + '.') >= 0: return "1" # Our own header + return "4" # Program headers + + ll = [prefix(l) + l.strip() for l in lines if len(l.strip()) > 0] + sl = [l[1:] for l in sorted(ll)] + if lines[-1].strip() == "": sl += [""] + #print sl + return sl + +for src in srcs: + f = open(src, 'r+') # we open for read/write here to fail early on read-only files + txt = f.read() + f.close() + + bName = os.path.basename(src).split(".")[0] + print bName + + lines = [l for l in txt.split("\n")] + + includeBlocks = [] + blockStart = -1 + for i in range(len(lines)): + l = lines[i].strip() + isInclude = l.startswith("#include") and l.find("NOLINT") == -1 + isEmpty = l == "" + if blockStart == -1: + if isInclude: blockStart = i # start block + else: + if not (isInclude or isEmpty): # close block + includeBlocks.append((blockStart, i)) + blockStart = -1 + + print src, len(includeBlocks), "blocks" + + newIncludes = [(s , e, sortIncludes(lines[s:e], bName)) for (s, e) in includeBlocks] + for (s , e, ii) in newIncludes: + # Print? 
+ if ii == lines[s:e]: + print "Block in lines %d-%d matches" % (s, e-1) + continue + for i in range(s, e): + print "%3d: %s%s | %s" % (i, lines[i], " "*(40 - len(lines[i][:39])), ii[i-s] if i-s < len(ii) else "") + print "" + + prevIdx = 0 + newLines = [] + for (s , e, ii) in newIncludes: + newLines += lines[prevIdx:s] + ii + prevIdx = e + newLines += lines[prevIdx:] + + if not dryRun and len(includeBlocks): + outTxt = "\n".join(newLines) + f = open(src, 'w') + f.write(outTxt) + f.close() + +print "Done!" diff --git a/misc/patchRoot/cpuinfo.template b/misc/patchRoot/cpuinfo.template new file mode 100644 index 00000000..a0e1333a --- /dev/null +++ b/misc/patchRoot/cpuinfo.template @@ -0,0 +1,25 @@ +processor : $CPU +vendor_id : GenuineIntel +cpu family : 6 +model : 15 +model name : Intel(R) Xeon(R) CPU E5335 @ 2.00GHz +stepping : 7 +cpu MHz : 1995.120 +cache size : 4096 KB +physical id : $CPU +siblings : $NCPUS +core id : $CPU +cpu cores : $NCPUS +apicid : $CPU +initial apicid : $CPU +fpu : yes +fpu_exception : yes +cpuid level : 10 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good nopl aperfmperf pni dtes64 monitor ds_cpl vmx tm2 ssse3 cx16 xtpr pdcm dca lahf_lm dtherm tpr_shadow +bogomips : 3990.24 +clflush size : 64 +cache_alignment : 64 +address sizes : 36 bits physical, 48 bits virtual +power management: + diff --git a/misc/patchRoot/genPatchRoot.py b/misc/patchRoot/genPatchRoot.py new file mode 100755 index 00000000..e85ab65d --- /dev/null +++ b/misc/patchRoot/genPatchRoot.py @@ -0,0 +1,155 @@ +#!/usr/bin/python + +# Copyright (C) 2013-2014 by Massachusetts Institute of Technology +# +# This file is part of zsim. +# +# zsim is free software; you can redistribute it and/or modify it under the +# terms of the GNU General Public License as published by the Free Software +# Foundation, version 2. +# +# If you use this software in your research, we request that you reference +# the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of +# Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the +# source of the simulator in any publications that use this software, and that +# you send us a citation of your work. +# +# zsim is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . 
+ +import os, string, sys + +class XTemplate(string.Template): + delimiter = "$" + escaped = "$$" + +def cmd(c): + f = os.popen(c) + r = f.read() + f.close() + return r.rstrip() # rm last newline + +def getMask(start, end): + cur = 0 + l = [] + for i in range(256): + j = i % 32 + if i >= start and i <= end: cur |= 1 << j + if (i + 1) % 32 == 0: + l.append(cur) + cur = 0 + l.reverse() + return ",".join("%08x" % n for n in l) + +from optparse import OptionParser + +# Read in options & args +parser = OptionParser(usage="%prog [options] [resultsDirSuffix]") +parser.add_option("-n", type="int", dest="ncpus", default=1, help="Number of simulated cores") +parser.add_option("-d", type="string", dest="dir", default="./patchRoot", help="Destination directory") +parser.add_option("-f", dest="force", action="store_true", default=False, help="Force, bypass existence checks") + + +for option in parser.option_list: + if option.default != ("NO", "DEFAULT"): + option.help += (" " if option.help else "") + "[default: %default]" +(options, args) = parser.parse_args() + +ncpus = options.ncpus +root = options.dir +progDir = os.path.dirname(os.path.abspath(__file__)) + "/" + +print "Will produce a tree for %d CPUs/cores in %s" % (ncpus, root) + +if ncpus < 1: + print "ERROR: Need >= 1 cpus!" + sys.exit(1) + +if os.path.exists(root) and not options.force: + print "ERROR: Dir already exists, aborting" + sys.exit(1) + +if len(args): + print "ERROR: No positional arguments taken, aborting" + sys.exit(1) + +cmd("mkdir -p " + root) +if not os.path.exists(root): + print "ERROR: Could not create %s, aborting" % root + sys.exit(1) + +## /proc + +# cpuinfo +cpuinfoTemplate = XTemplate(open(progDir + "cpuinfo.template", "r").read()) +cmd("mkdir -p %s/proc" % root) +f = open(root + "/proc/cpuinfo", "w") +for cpu in range(ncpus): + print >>f, cpuinfoTemplate.substitute({"CPU" : str(cpu), "NCPUS" : ncpus}), +f.close() + +# stat +cpuAct = [int(x) for x in "665084 119979939 9019834 399242499 472611 20 159543 0 0 0".split(" ")] +totalAct = [x*ncpus for x in cpuAct] +cpuStat = "cpu " + " ".join([str(x) for x in totalAct]) +for cpu in range(ncpus): + cpuStat += ("\ncpu%d " % cpu) + " ".join([str(x) for x in cpuAct]) +statTemplate = XTemplate(open(progDir + "stat.template", "r").read()) +f = open(root + "/proc/stat", "w") +print >>f, statTemplate.substitute({"CPUSTAT" : cpuStat}), +f.close() + +## /sys + +# cpus +cpuDir = root + "/sys/devices/system/cpu/" +cmd("mkdir -p " + cpuDir) +cpuList = "0-" + str(ncpus-1) if ncpus > 1 else "0" +for f in ["online", "possible", "present"]: + cmd("echo %s > %s" % (cpuList, cpuDir + f)) +cmd("echo > " + cpuDir + "offline") +cmd("echo 0 > " + cpuDir + "sched_mc_power_savings") +maxCpus = max(ncpus, 255) +cmd("echo %d > %s" % (maxCpus, cpuDir + "kernel_max")) +coreSiblingsMask = getMask(0, ncpus) +for cpu in range(ncpus): + d = cpuDir + "cpu" + str(cpu) + "/" + td = d + "topology/" + cmd("mkdir -p " + td) + if maxCpus > 255: + print "WARN: These many cpus have not been tested, x2APIC systems may be different..." 
+ cmd("echo %d > %s" % (cpu, td + "core_id")) + cmd("echo %s > %s" % (cpuList, td + "core_siblings_list")) + cmd("echo %d > %s" % (cpu, td + "thread_siblings_list")) + cmd("echo 0 > " + td + "physical_package_id") + cmd("echo %s > %s" % (coreSiblingsMask, td + "core_siblings")) + cmd("echo %s > %s" % (getMask(cpu, cpu), td + "thread_siblings")) + cmd("echo 1 > " + td + "online") + +# nodes +nodeDir = root + "/sys/devices/system/node/" +cmd("mkdir -p " + nodeDir) +for f in ["has_normal_memory", "online", "possible"]: cmd("echo 0 > " + nodeDir + f) +cmd("echo > " + nodeDir + "has_cpu") + +n0Dir = nodeDir + "node0/" +cmd("mkdir -p " + n0Dir) +for cpu in range(ncpus): + cmd("ln -s " + cpuDir + "cpu" + str(cpu) + " " + n0Dir) +cmd("cp -r %s/nodeFiles/* %s" % (progDir, n0Dir)) +cmd("echo %s > %s" % (coreSiblingsMask, n0Dir + "cpumap")) +cmd("echo %s > %s" % (cpuList, n0Dir + "cpulist")) + +# misc +cmd("mkdir -p " + root + "/sys/bus/pci/devices") + +# make read-only +if not options.force: + cmd("chmod a-w -R " + root) + + diff --git a/misc/patchRoot/nodeFiles/distance b/misc/patchRoot/nodeFiles/distance new file mode 100644 index 00000000..f599e28b --- /dev/null +++ b/misc/patchRoot/nodeFiles/distance @@ -0,0 +1 @@ +10 diff --git a/misc/patchRoot/nodeFiles/meminfo b/misc/patchRoot/nodeFiles/meminfo new file mode 100644 index 00000000..f0dc8a7f --- /dev/null +++ b/misc/patchRoot/nodeFiles/meminfo @@ -0,0 +1,29 @@ +Node 0 MemTotal: 33525260 kB +Node 0 MemFree: 7103752 kB +Node 0 MemUsed: 26421508 kB +Node 0 Active: 21111308 kB +Node 0 Inactive: 2823820 kB +Node 0 Active(anon): 18869284 kB +Node 0 Inactive(anon): 601672 kB +Node 0 Active(file): 2242024 kB +Node 0 Inactive(file): 2222148 kB +Node 0 Unevictable: 0 kB +Node 0 Mlocked: 0 kB +Node 0 Dirty: 108 kB +Node 0 Writeback: 0 kB +Node 0 FilePages: 4884048 kB +Node 0 Mapped: 433856 kB +Node 0 AnonPages: 19069344 kB +Node 0 Shmem: 398052 kB +Node 0 KernelStack: 4120 kB +Node 0 PageTables: 74400 kB +Node 0 NFS_Unstable: 8 kB +Node 0 Bounce: 0 kB +Node 0 WritebackTmp: 0 kB +Node 0 Slab: 1386804 kB +Node 0 SReclaimable: 1271048 kB +Node 0 SUnreclaim: 115756 kB +Node 0 AnonHugePages: 0 kB +Node 0 HugePages_Total: 0 +Node 0 HugePages_Free: 0 +Node 0 HugePages_Surp: 0 diff --git a/misc/patchRoot/nodeFiles/numastat b/misc/patchRoot/nodeFiles/numastat new file mode 100644 index 00000000..a4608a42 --- /dev/null +++ b/misc/patchRoot/nodeFiles/numastat @@ -0,0 +1,6 @@ +numa_hit 17464111314 +numa_miss 0 +numa_foreign 0 +interleave_hit 15992 +local_node 17464079088 +other_node 0 diff --git a/misc/patchRoot/nodeFiles/scan_unevictable_pages b/misc/patchRoot/nodeFiles/scan_unevictable_pages new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/misc/patchRoot/nodeFiles/scan_unevictable_pages @@ -0,0 +1 @@ +0 diff --git a/misc/patchRoot/nodeFiles/vmstat b/misc/patchRoot/nodeFiles/vmstat new file mode 100644 index 00000000..45d53982 --- /dev/null +++ b/misc/patchRoot/nodeFiles/vmstat @@ -0,0 +1,33 @@ +nr_free_pages 1775938 +nr_inactive_anon 150418 +nr_active_anon 4717321 +nr_inactive_file 555537 +nr_active_file 560506 +nr_unevictable 0 +nr_mlock 0 +nr_anon_pages 4767336 +nr_mapped 108464 +nr_file_pages 1221012 +nr_dirty 27 +nr_writeback 0 +nr_slab_reclaimable 317762 +nr_slab_unreclaimable 28939 +nr_page_table_pages 18600 +nr_kernel_stack 515 +nr_unstable 2 +nr_bounce 0 +nr_vmscan_write 1418314 +nr_vmscan_immediate_reclaim 1404 +nr_writeback_temp 0 +nr_isolated_anon 0 +nr_isolated_file 0 +nr_shmem 99513 +nr_dirtied 218449153 +nr_written 216406104 +numa_hit 
17464111314 +numa_miss 78421229 +numa_foreign 541191002 +numa_interleave 15992 +numa_local 17464079088 +numa_other 78453455 +nr_anon_transparent_hugepages 0 diff --git a/misc/patchRoot/stat.template b/misc/patchRoot/stat.template new file mode 100644 index 00000000..5097d123 --- /dev/null +++ b/misc/patchRoot/stat.template @@ -0,0 +1,8 @@ +$CPUSTAT +intr 126179179275 17876 3 0 0 0 0 0 0 1 0 0 0 4 0 0 0 61163008 0 0 0 0 0 0 290 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 103771380 215532394 165622708 48561815 78921339 136273892 331572171 84723547 12593167 0 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ctxt 1319718714527 +btime 1365356075 +processes 3878073 +procs_running 2 +procs_blocked 0 +softirq 73342466563 0 1689790758 12887572 2486080210 18121886 0 12593228 2445648220 3610928298 2936874247 diff --git a/src/SConscript b/src/SConscript new file mode 100644 index 00000000..8fe3dd24 --- /dev/null +++ b/src/SConscript @@ -0,0 +1,32 @@ +# -*- mode:python -*- + +import os +Import("env") + +commonSrcs = ["config.cpp", "galloc.cpp", "log.cpp", "pin_cmd.cpp"] +harnessSrcs = ["zsim_harness.cpp", "debug_harness.cpp"] + +# By default, we compile all cpp files in libzsim.so. List the cpp files that +# should be excluded below (one per line and in order, to ease merges) +excludeSrcs = [ +"fftoggle.cpp", +] +excludeSrcs += harnessSrcs + +# Build libzsim.so +globSrcNodes = Glob("*.cpp") + Glob("virt/*.cpp") +libSrcs = [str(x) for x in globSrcNodes if str(x) not in excludeSrcs] +libEnv = env.Clone() +libEnv["CPPFLAGS"] += libEnv["PINCPPFLAGS"] +libEnv["LINKFLAGS"] += libEnv["PINLINKFLAGS"] +libEnv["LIBPATH"] += libEnv["PINLIBPATH"] +libEnv["LIBS"] += libEnv["PINLIBS"] +libEnv.SharedLibrary("zsim.so", libSrcs) + +# Build harness (static to make it easier to run across environments) +env["LINKFLAGS"] += " --static " +env["LIBS"] += ["pthread"] +env.Program("zsim", harnessSrcs + commonSrcs) + +# Build additional utilities below +env.Program("fftoggle", ["fftoggle.cpp"] + commonSrcs) diff --git a/src/barrier.h b/src/barrier.h new file mode 100644 index 00000000..32c4da5e --- /dev/null +++ b/src/barrier.h @@ -0,0 +1,282 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +/* Implements a barrier with join-leave semantics and parallelism control. + * JOIN-LEAVE SEMANTICS: Threads can join or leave the barrier at any point in time. + * Threads in the barrier call sync and synchronize with all other threads + * participating in the barrier. Threads can leave a barrier at any point in time + * (e.g. when other threads have started the sync). + * + * PARALLELISM CONTROL: The barrier limits the number of threads that run at the same time. 
+ * + * Author: Daniel Sanchez + * Date: Apr 2011 + */ + +#ifndef BARRIER_H_ +#define BARRIER_H_ + +#include +#include +#include +#include +#include +#include +#include "constants.h" +#include "galloc.h" +#include "locks.h" +#include "log.h" +#include "mtrand.h" + +// Configure futex timeouts (die rather than deadlock) +#define TIMEOUT_LENGTH 20 //seconds +#define MAX_TIMEOUTS 10 + +//#define DEBUG_BARRIER(args...) info(args) +#define DEBUG_BARRIER(args...) + +class Callee { + public: + virtual void callback() = 0; +}; + + +class Barrier : public GlobAlloc { + private: + uint32_t parallelThreads; + + enum State {OFFLINE, WAITING, RUNNING, LEFT}; + + struct ThreadSyncInfo { + volatile State state; + volatile uint32_t futexWord; + uint32_t lastIdx; + uint32_t pad; + }; + + ThreadSyncInfo threadList[MAX_THREADS]; + + uint32_t* runList; + uint32_t runListSize; + uint32_t curThreadIdx; + + uint32_t runningThreads; //threads in RUNNING state + uint32_t leftThreads; //threads in LEFT state + //Threads in OFFLINE state are not on the runlist, so runListSize - runningThreads - leftThreads == waitingThreads + + uint32_t phaseCount; //INTERNAL, for LEFT->OFFLINE bookkeeping overhead reduction purposes + + uint32_t pad[16]; + + /* NOTE(dsm): I was initially misled that having a single lock protecting the barrier was a performance hog, and coded a lock-free version. + * Profiling doesn't show that, however. What happened was that shorter phases caused a worse interaction with PIN locks in the memory + * hierarchy (which use yield, not futex?). The lock-free version was actually a bit slower, as we're already serializing on curThreadIdx and + * the lock-free version required to volatilize pretty much every variable. If serialization on sync() ever becomes an issue, ask me for the + * lock-free code. 
+ */ + //lock_t barrierLock; //not used anymore, using the scheduler lock instead since this is called from the scheduler + + MTRand rnd; + Callee* sched; //FIXME: I don't like this organization, but don't have time to refactor the barrier code, this is used for a callback when the phase is done + + public: + Barrier(uint32_t _parallelThreads, Callee* _sched) : parallelThreads(_parallelThreads), rnd(0xBA77137), sched(_sched) { + for (uint32_t t = 0; t < MAX_THREADS; t++) { + threadList[t].state = OFFLINE; + threadList[t].futexWord = 0; + } + + runList = gm_calloc(MAX_THREADS); + runListSize = 0; + curThreadIdx = 0; + + runningThreads = 0; + leftThreads = 0; + phaseCount = 0; + //barrierLock = 0; + } + + ~Barrier() {} + + //Called with schedLock held; returns with schedLock unheld + void join(uint32_t tid, lock_t* schedLock) { + DEBUG_BARRIER("[%d] Joining, runningThreads %d, prevState %d", tid, runningThreads, threadList[tid].state); + assert(threadList[tid].state == LEFT || threadList[tid].state == OFFLINE); + if (threadList[tid].state == OFFLINE) { + runList[runListSize++] = tid; + } else { + leftThreads--; + //If we have already run in this phase, reschedule ourselves in it + uint32_t lastIdx = threadList[tid].lastIdx; + if (curThreadIdx > lastIdx) { //curThreadIdx points to the FIRST thread that tryWakeNext checks + DEBUG_BARRIER("[%d] Doing same-phase join reschedule", tid); + curThreadIdx--; + //Swap our runlist tid with the last thread's + assert(tid == runList[lastIdx]); + uint32_t otherTid = runList[curThreadIdx]; + + runList[lastIdx] = otherTid; + runList[curThreadIdx] = tid; + threadList[otherTid].lastIdx = lastIdx; + threadList[tid].lastIdx = curThreadIdx; + //now we'll be scheduled next :) + } + } + + + threadList[tid].state = WAITING; + threadList[tid].futexWord = 1; + tryWakeNext(tid); //NOTE: You can't cause a phase to end here. 
+ futex_unlock(schedLock); + + if (threadList[tid].state == WAITING) { + DEBUG_BARRIER("[%d] Waiting on join", tid); + while (true) { + int futex_res = syscall(SYS_futex, &threadList[tid].futexWord, FUTEX_WAIT, 1 /*a racing thread waking us up will change value to 0, and we won't block*/, NULL, NULL, 0); + if (futex_res == 0 || threadList[tid].futexWord != 1) break; + } + //The thread that wakes us up changes this + assert(threadList[tid].state == RUNNING); + } + } + + //Must be called with schedLock held + void leave(uint32_t tid) { + DEBUG_BARRIER("[%d] Leaving, runningThreads %d", tid, runningThreads); + if (threadList[tid].state == RUNNING) { + threadList[tid].state = LEFT; + leftThreads++; + runningThreads--; + tryWakeNext(tid); //can trigger phase end + } else { + assert_msg(threadList[tid].state == WAITING, "leave, tid %d, incorrect state %d", tid, threadList[tid].state); + threadList[tid].state = LEFT; + leftThreads++; + } + } + + //Called with schedLock held, returns with schedLock unheld + void sync(uint32_t tid, lock_t* schedLock) { + DEBUG_BARRIER("[%d] Sync", tid); + assert_msg(threadList[tid].state == RUNNING, "[%d] sync: state was supposed to be %d, it is %d", tid, RUNNING, threadList[tid].state); + threadList[tid].futexWord = 1; + threadList[tid].state = WAITING; + runningThreads--; + tryWakeNext(tid); //can trigger phase end + futex_unlock(schedLock); + + if (threadList[tid].state == WAITING) { + while (true) { + int futex_res = syscall(SYS_futex, &threadList[tid].futexWord, FUTEX_WAIT, 1 /*a racing thread waking us up will change value to 0, and we won't block*/, NULL, NULL, 0); + if (futex_res == 0 || threadList[tid].futexWord != 1) break; + } + //The thread that wakes us up changes this + assert(threadList[tid].state == RUNNING); + } + } + + private: + inline void checkEndPhase(uint32_t tid) { + if (curThreadIdx == runListSize && runningThreads == 0) { + if (leftThreads == runListSize) { + DEBUG_BARRIER("[%d] All threads left barrier, not ending current phase", tid); + return; //watch the early return + } + DEBUG_BARRIER("[%d] Phase ended", tid); + // End of phase actions + sched->callback(); + curThreadIdx = 0; //rewind list + + if (((phaseCount++) & (32-1)) == 0) { //one out of 32 times, do + /* Pass over the whole array, OFFLINE the threads that LEFT. If they are on a syscall, they will rejoin; + * If they left for good, we avoid long-term traversal overheads on apps with a varying number of threads. + */ + assert(runListSize > 0); + uint32_t idx = 0; + uint32_t newSize = runListSize; + while (idx < newSize) { + uint32_t wtid = runList[idx]; + if (threadList[wtid].state == LEFT) { + threadList[wtid].state = OFFLINE; + uint32_t stid = runList[newSize-1]; + runList[idx] = stid; + threadList[stid].lastIdx = idx; + + newSize--; //last elem is now garbage + } else { + idx++; //this one is OK, keep going + } + } + assert(runListSize - newSize == leftThreads); + leftThreads = 0; + DEBUG_BARRIER("[%d] Cleanup pass, initial runListSize %d, now %d", tid, runListSize, newSize); + runListSize = newSize; + } + + //NOTE: If this is a performance hog, the algorithm can be rewritten to be top-down and threads can be woken up as soon as they are reordered. So far, I've seen this has negligible overheads though. 
+ if (parallelThreads < runListSize) { + //Randomly shuffle thread list to avoid systemic biases and reduce contention on cache hierarchy (Fisher-Yates shuffle) + for (uint32_t i = runListSize-1; i > 0; i--) { + uint32_t j = rnd.randInt(i); //j is in {0,...,i} + uint32_t itid = runList[i]; + uint32_t jtid = runList[j]; + + runList[i] = jtid; + runList[j] = itid; + + threadList[itid].lastIdx = j; + threadList[jtid].lastIdx = i; + } + } + } + } + + inline void checkRunList(uint32_t tid) { + while (runningThreads < parallelThreads && curThreadIdx < runListSize) { + //Wake next thread + uint32_t idx = curThreadIdx++; + uint32_t wtid = runList[idx]; + if (threadList[wtid].state == WAITING) { + DEBUG_BARRIER("[%d] Waking %d runningThreads %d", tid, wtid, runningThreads); + threadList[wtid].state = RUNNING; //must be set before writing to futexWord to avoid wakeup race + threadList[wtid].lastIdx = idx; + bool succ = __sync_bool_compare_and_swap(&threadList[wtid].futexWord, 1, 0); + if (!succ) panic("Wakeup race in barrier?"); + syscall(SYS_futex, &threadList[wtid].futexWord, FUTEX_WAKE, 1, NULL, NULL, 0); + runningThreads++; + } else { + DEBUG_BARRIER("[%d] Skipping %d state %d", tid, wtid, threadList[wtid].state); + } + } + } + + void tryWakeNext(uint32_t tid) { + checkRunList(tid); //wake up threads on this phase, may reach EOP + checkEndPhase(tid); //see if we've reached EOP, execute if if so + checkRunList(tid); //if we started a new phase, wake up threads + } +}; + +#endif // BARRIER_H_ diff --git a/src/bithacks.h b/src/bithacks.h new file mode 100644 index 00000000..369cdc48 --- /dev/null +++ b/src/bithacks.h @@ -0,0 +1,81 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef BITHACKS_H_ +#define BITHACKS_H_ + +#include + +/* Assortment of efficient implementations for required, "bithack" operations, see the bithacks + * website, http://graphics.stanford.edu/~seander/bithacks.html + */ + +/* Max and min: These work with side-effects, are type-safe, and gcc recognizes this pattern and uses + * conditional moves (i.e., predication --> no unpredictable branches and great preformance) + */ +#ifdef MAX +#undef MAX +#endif +#define MAX(x, y) ({ __typeof__(x) xx = (x); __typeof__(y) yy = (y); (xx > yy)? xx : yy;}) + +#ifdef MIN +#undef MIN +#endif +#define MIN(x, y) ({ __typeof__(x) xx = (x); __typeof__(y) yy = (y); (xx < yy)? 
xx : yy;}) + +// Integer log2 --- called ilog2 because cmath defines log2 for floats/doubles, +// and promotes int calls to use FP +template static inline uint32_t ilog2(T val); +// Only specializations of unsigned types (no calling these with ints) +// __builtin_clz is undefined for 0 (internally, this uses bsr in x86-64) +template<> uint32_t ilog2(uint32_t val) { + return val? 31 - __builtin_clz(val) : 0; +} +template<> uint32_t ilog2(uint64_t val) { + return val? 63 - __builtin_clzl(val) : 0; +} + +template +static inline bool isPow2(T val) { + return val && !(val & (val - 1)); +} + +/* Some variadic template magic for max/min with N args. + * + * Type-wise, you can compare multiple types (e.g., maxN(1, -7, 3.3)), but the + * output type is the first arg's type (e.g., returns 3) + */ +template static inline T maxN(T a) { return a; } +template static inline T maxN(T a, U b, V... c) { + return maxN(((a > b)? a : b), c...); +} + +template static inline T minN(T a) { return a; } +template static inline T minN(T a, U b, V... c) { + return minN(((a < b)? a : b), c...); +} + + +#endif // BITHACKS_H_ diff --git a/src/breakdown_stats.h b/src/breakdown_stats.h new file mode 100644 index 00000000..6273f558 --- /dev/null +++ b/src/breakdown_stats.h @@ -0,0 +1,69 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "bithacks.h" +#include "stats.h" +#include "zsim.h" + +/* Implements per-cycle breakdowns. Always starts at state 0. + * count() accounts for cycles in current state; count() is used + * because we extend VectorCounter (TODO: Move to VectorStat). + */ +class CycleBreakdownStat : public VectorCounter { + private: + uint32_t curState; + uint64_t lastCycle; + + public: + CycleBreakdownStat() : VectorCounter() {} + + virtual void init(const char* name, const char* desc, uint32_t size) { + VectorCounter::init(name, desc, size); + curState = 0; + lastCycle = 0; + } + + // I need to define this even though it is completely unnecessary, but only if I override init. gcc bug or C++ oddity? 
+ virtual void init(const char* name, const char* desc, uint32_t size, const char** names) { + VectorCounter::init(name, desc, size, names); // will call our init(name, desc, size) + } + + void transition(uint32_t newState, uint64_t cycle) { + assert(curState < size()); + assert(newState < size()); + assert(lastCycle <= cycle); + inc(curState, cycle - lastCycle); + curState = newState; + lastCycle = cycle; + } + + // Accounts for time in current state, even if the last transition happened long ago + inline virtual uint64_t count(uint32_t idx) const { + uint64_t partial = VectorCounter::count(idx); + uint64_t curCycle = MAX(lastCycle, zinfo->globPhaseCycles); + return partial + ((idx == curState)? (curCycle - lastCycle) : 0); + } +}; + diff --git a/src/cache.cpp b/src/cache.cpp new file mode 100644 index 00000000..c4109f6c --- /dev/null +++ b/src/cache.cpp @@ -0,0 +1,100 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "cache.h" +#include "hash.h" + +Cache::Cache(uint32_t _numLines, CC* _cc, CacheArray* _array, ReplPolicy* _rp, uint32_t _accLat, uint32_t _invLat, const g_string& _name) + : cc(_cc), array(_array), rp(_rp), numLines(_numLines), accLat(_accLat), invLat(_invLat), name(_name) {} + +const char* Cache::getName() { + return name.c_str(); +} + +void Cache::setParents(uint32_t childId, const g_vector& parents, Network* network) { + cc->setParents(childId, parents, network); +} + +void Cache::setChildren(const g_vector& children, Network* network) { + cc->setChildren(children, network); +} + +void Cache::initStats(AggregateStat* parentStat) { + AggregateStat* cacheStat = new AggregateStat(); + cacheStat->init(name.c_str(), "Cache stats"); + initCacheStats(cacheStat); + parentStat->append(cacheStat); +} + +void Cache::initCacheStats(AggregateStat* cacheStat) { + cc->initStats(cacheStat); + array->initStats(cacheStat); + rp->initStats(cacheStat); +} + +uint64_t Cache::access(MemReq& req) { + uint64_t respCycle = req.cycle; + bool skipAccess = cc->startAccess(req); //may need to skip access due to races (NOTE: may change req.type!) 
+ if (likely(!skipAccess)) { + bool updateReplacement = (req.type == GETS) || (req.type == GETX); + int32_t lineId = array->lookup(req.lineAddr, &req, updateReplacement); + respCycle += accLat; + + if (lineId == -1 && cc->shouldAllocate(req)) { + //Make space for new line + Address wbLineAddr; + lineId = array->preinsert(req.lineAddr, &req, &wbLineAddr); //find the lineId to replace + trace(Cache, "[%s] Evicting 0x%lx", name.c_str(), wbLineAddr); + + //Evictions are not in the critical path in any sane implementation -- we do not include their delays + //NOTE: We might be "evicting" an invalid line for all we know. Coherence controllers will know what to do + cc->processEviction(req, wbLineAddr, lineId, respCycle); //1. if needed, send invalidates/downgrades to lower level + + array->postinsert(req.lineAddr, &req, lineId); //do the actual insertion. NOTE: Now we must split insert into a 2-phase thing because cc unlocks us. + } + + respCycle = cc->processAccess(req, lineId, respCycle); + } + + cc->endAccess(req); + + assert_msg(respCycle >= req.cycle, "[%s] resp < req? 0x%lx type %s childState %s, respCycle %ld reqCycle %ld", + name.c_str(), req.lineAddr, AccessTypeName(req.type), MESIStateName(*req.state), respCycle, req.cycle); + return respCycle; +} + +uint64_t Cache::invalidate(Address lineAddr, InvType type, bool* reqWriteback, uint64_t reqCycle, uint32_t srcId) { + cc->startInv(); //note we don't grab tcc; tcc serializes multiple up accesses, down accesses don't see it + + int32_t lineId = array->lookup(lineAddr, NULL, false); + assert_msg(lineId != -1, "[%s] Invalidate on non-existing address 0x%lx type %s lineId %d, reqWriteback %d", name.c_str(), lineAddr, InvTypeName(type), lineId, *reqWriteback); + uint64_t respCycle = reqCycle + invLat; + trace(Cache, "[%s] Invalidate start 0x%lx type %s lineId %d, reqWriteback %d", name.c_str(), lineAddr, InvTypeName(type), lineId, *reqWriteback); + respCycle = cc->processInv(lineAddr, lineId, type, reqWriteback, respCycle, srcId); //send invalidates or downgrades to children, and adjust our own state + trace(Cache, "[%s] Invalidate end 0x%lx type %s lineId %d, reqWriteback %d, latency %ld", name.c_str(), lineAddr, InvTypeName(type), lineId, *reqWriteback, respCycle - reqCycle); + + return respCycle; +} + diff --git a/src/cache.h b/src/cache.h new file mode 100644 index 00000000..ecb86d6e --- /dev/null +++ b/src/cache.h @@ -0,0 +1,75 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef CACHE_H_ +#define CACHE_H_ + +#include "cache_arrays.h" +#include "coherence_ctrls.h" +#include "g_std/g_string.h" +#include "g_std/g_vector.h" +#include "memory_hierarchy.h" +#include "repl_policies.h" +#include "stats.h" + +class Network; + +/* General coherent modular cache. The replacement policy and cache array are + * pretty much mix and match. The coherence controller interfaces are general + * too, but to avoid virtual function call overheads we work with MESI + * controllers, since for now we only have MESI controllers + */ +class Cache : public BaseCache { + protected: + CC* cc; + CacheArray* array; + ReplPolicy* rp; + + uint32_t numLines; + + //Latencies + uint32_t accLat; //latency of a normal access (could split in get/put, probably not needed) + uint32_t invLat; //latency of an invalidation + + g_string name; + + public: + Cache(uint32_t _numLines, CC* _cc, CacheArray* _array, ReplPolicy* _rp, uint32_t _accLat, uint32_t _invLat, const g_string& _name); + + const char* getName(); + void setParents(uint32_t _childId, const g_vector& parents, Network* network); + void setChildren(const g_vector& children, Network* network); + void initStats(AggregateStat* parentStat); + + virtual uint64_t access(MemReq& req); + + //NOTE: reqWriteback is pulled up to true, but not pulled down to false. + virtual uint64_t invalidate(Address lineAddr, InvType type, bool* reqWriteback, uint64_t reqCycle, uint32_t srcId); + + protected: + void initCacheStats(AggregateStat* cacheStat); +}; + +#endif // CACHE_H_ diff --git a/src/cache_arrays.cpp b/src/cache_arrays.cpp new file mode 100644 index 00000000..e2e6e220 --- /dev/null +++ b/src/cache_arrays.cpp @@ -0,0 +1,209 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "cache_arrays.h" +#include "hash.h" +#include "repl_policies.h" + +/* Set-associative array implementation */ + +SetAssocArray::SetAssocArray(uint32_t _numLines, uint32_t _assoc, ReplPolicy* _rp, HashFamily* _hf) : rp(_rp), hf(_hf), numLines(_numLines), assoc(_assoc) { + array = gm_calloc
(numLines); + numSets = numLines/assoc; + setMask = numSets - 1; + assert(isPow2(numSets)); +} + +int32_t SetAssocArray::lookup(const Address lineAddr, const MemReq* req, bool updateReplacement) { + uint32_t set = hf->hash(0, lineAddr) & setMask; + uint32_t first = set*assoc; + for (uint32_t id = first; id < first + assoc; id++) { + if (array[id] == lineAddr) { + if (updateReplacement) rp->update(id, req); + return id; + } + } + return -1; +} + +uint32_t SetAssocArray::preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr) { //TODO: Give out valid bit of wb cand? + uint32_t set = hf->hash(0, lineAddr) & setMask; + uint32_t first = set*assoc; + + uint32_t candidate = rp->rankCands(req, SetAssocCands(first, first+assoc)); + + *wbLineAddr = array[candidate]; + return candidate; +} + +void SetAssocArray::postinsert(const Address lineAddr, const MemReq* req, uint32_t candidate) { + rp->replaced(candidate); + array[candidate] = lineAddr; + rp->update(candidate, req); +} + + +/* ZCache implementation */ + +ZArray::ZArray(uint32_t _numLines, uint32_t _ways, uint32_t _candidates, ReplPolicy* _rp, HashFamily* _hf) //(int _size, int _lineSize, int _assoc, int _zassoc, ReplacementPolicy* _rp, int _hashType) + : rp(_rp), hf(_hf), numLines(_numLines), ways(_ways), cands(_candidates) +{ + assert_msg(ways > 1, "zcaches need >=2 ways to work"); + assert_msg(cands >= ways, "candidates < ways does not make sense in a zcache"); + assert_msg(numLines % ways == 0, "number of lines is not a multiple of ways"); + + //Populate secondary parameters + numSets = numLines/ways; + assert_msg(isPow2(numSets), "must have a power of 2 # sets, but you specified %d", numSets); + setMask = numSets - 1; + + lookupArray = gm_calloc(numLines); + array = gm_calloc
(numLines); + for (uint32_t i = 0; i < numLines; i++) { + lookupArray[i] = i; // start with a linear mapping; with swaps, it'll get progressively scrambled + } + swapArray = gm_calloc(cands/ways + 2); // conservative upper bound (tight within 2 ways) +} + +void ZArray::initStats(AggregateStat* parentStat) { + AggregateStat* objStats = new AggregateStat(); + objStats->init("array", "ZArray stats"); + statSwaps.init("swaps", "Block swaps in replacement process"); + objStats->append(&statSwaps); + parentStat->append(objStats); +} + +int32_t ZArray::lookup(const Address lineAddr, const MemReq* req, bool updateReplacement) { + /* Be defensive: If the line is 0, panic instead of asserting. Now this can + * only happen on a segfault in the main program, but when we move to full + * system, phy page 0 might be used, and this will hit us in a very subtle + * way if we don't check. + */ + if (unlikely(!lineAddr)) panic("ZArray::lookup called with lineAddr==0 -- your app just segfaulted"); + + for (uint32_t w = 0; w < ways; w++) { + uint32_t lineId = lookupArray[w*numSets + (hf->hash(w, lineAddr) & setMask)]; + if (array[lineId] == lineAddr) { + if (updateReplacement) { + rp->update(lineId, req); + } + return lineId; + } + } + return -1; +} + +uint32_t ZArray::preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr) { + ZWalkInfo candidates[cands + ways]; //extra ways entries to avoid checking on every expansion + + bool all_valid = true; + uint32_t fringeStart = 0; + uint32_t numCandidates = ways; //seeds + + //info("Replacement for incoming 0x%lx", lineAddr); + + //Seeds + for (uint32_t w = 0; w < ways; w++) { + uint32_t pos = w*numSets + (hf->hash(w, lineAddr) & setMask); + uint32_t lineId = lookupArray[pos]; + candidates[w].set(pos, lineId, -1); + all_valid &= (array[lineId] != 0); + //info("Seed Candidate %d addr 0x%lx pos %d lineId %d", w, array[lineId], pos, lineId); + } + + //Expand fringe in BFS fashion + while (numCandidates < cands && all_valid) { + uint32_t fringeId = candidates[fringeStart].lineId; + Address fringeAddr = array[fringeId]; + assert(fringeAddr); + for (uint32_t w = 0; w < ways; w++) { + uint32_t hval = hf->hash(w, fringeAddr) & setMask; + uint32_t pos = w*numSets + hval; + uint32_t lineId = lookupArray[pos]; + if (lineId != fringeId) { + //info("Candidate %d way %d addr 0x%lx pos %d lineId %d parent %d", numCandidates, w, array[lineId], pos, lineId, fringeStart); + candidates[numCandidates++].set(pos, lineId, (int32_t)fringeStart); + all_valid &= (array[lineId] != 0); + } + } + fringeStart++; + } + + //Get best candidate (NOTE: This could be folded in the code above, but it's messy since we can expand more than zassoc elements) + assert(!all_valid || numCandidates >= cands); + numCandidates = (numCandidates > cands)? cands : numCandidates; + + //info("Using %d candidates, all_valid=%d", numCandidates, all_valid); + + uint32_t bestCandidate = rp->rankCands(req, ZCands(&candidates[0], &candidates[numCandidates])); + assert(bestCandidate < numLines); + + //Fill in swap array + + //Get the *minimum* index of cands that matches lineId. 
We need the minimum in case there are loops (rare, but possible) + uint32_t minIdx = -1; + for (uint32_t ii = 0; ii < numCandidates; ii++) { + if (bestCandidate == candidates[ii].lineId) { + minIdx = ii; + break; + } + } + assert(minIdx >= 0); + //info("Best candidate is %d lineId %d", minIdx, bestCandidate); + + lastCandIdx = minIdx; //used by timing simulation code to schedule array accesses + + int32_t idx = minIdx; + uint32_t swapIdx = 0; + while (idx >= 0) { + swapArray[swapIdx++] = candidates[idx].pos; + idx = candidates[idx].parentIdx; + } + swapArrayLen = swapIdx; + assert(swapArrayLen > 0); + + //Write address of line we're replacing + *wbLineAddr = array[bestCandidate]; + + return bestCandidate; +} + +void ZArray::postinsert(const Address lineAddr, const MemReq* req, uint32_t candidate) { + //We do the swaps in lookupArray, the array stays the same + assert(lookupArray[swapArray[0]] == candidate); + for (uint32_t i = 0; i < swapArrayLen-1; i++) { + //info("Moving position %d (lineId %d) <- %d (lineId %d)", swapArray[i], lookupArray[swapArray[i]], swapArray[i+1], lookupArray[swapArray[i+1]]); + lookupArray[swapArray[i]] = lookupArray[swapArray[i+1]]; + } + lookupArray[swapArray[swapArrayLen-1]] = candidate; //note that in preinsert() we walk the array backwards when populating swapArray, so the last elem is where the new line goes + //info("Inserting lineId %d in position %d", candidate, swapArray[swapArrayLen-1]); + + rp->replaced(candidate); + array[candidate] = lineAddr; + rp->update(candidate, req); + + statSwaps.inc(swapArrayLen-1); +} + diff --git a/src/cache_arrays.h b/src/cache_arrays.h new file mode 100644 index 00000000..db036642 --- /dev/null +++ b/src/cache_arrays.h @@ -0,0 +1,157 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef CACHE_ARRAYS_H_ +#define CACHE_ARRAYS_H_ + +#include "memory_hierarchy.h" +#include "stats.h" + +/* General interface of a cache array. The array is a fixed-size associative container that + * translates addresses to line IDs. A line ID represents the position of the tag. The other + * cache components store tag data in non-associative arrays indexed by line ID. + */ +class CacheArray : public GlobAlloc { + public: + /* Returns tag's ID if present, -1 otherwise. 
If updateReplacement is set, call the replacement policy's update() on the line accessed*/ + virtual int32_t lookup(const Address lineAddr, const MemReq* req, bool updateReplacement) = 0; + + /* Runs replacement scheme, returns tag ID of new pos and address of line to write back*/ + virtual uint32_t preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr) = 0; + + /* Actually do the replacement, writing the new address in lineId. + * NOTE: This method is guaranteed to be called after preinsert, although + * there may be some intervening calls to lookup. The implementation is + * allowed to keep internal state in preinsert() and use it in postinsert() + */ + virtual void postinsert(const Address lineAddr, const MemReq* req, uint32_t lineId) = 0; + + virtual void initStats(AggregateStat* parent) {} +}; + +class ReplPolicy; +class HashFamily; + +/* Set-associative cache array */ +class SetAssocArray : public CacheArray { + protected: + Address* array; + ReplPolicy* rp; + HashFamily* hf; + uint32_t numLines; + uint32_t numSets; + uint32_t assoc; + uint32_t setMask; + + public: + SetAssocArray(uint32_t _numLines, uint32_t _assoc, ReplPolicy* _rp, HashFamily* _hf); + + int32_t lookup(const Address lineAddr, const MemReq* req, bool updateReplacement); + uint32_t preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr); + void postinsert(const Address lineAddr, const MemReq* req, uint32_t candidate); +}; + +/* The cache array that started this simulator :) */ +class ZArray : public CacheArray { + private: + Address* array; //maps line id to address + uint32_t* lookupArray; //maps physical position to lineId + ReplPolicy* rp; + HashFamily* hf; + uint32_t numLines; + uint32_t numSets; + uint32_t ways; + uint32_t cands; + uint32_t setMask; + + //preinsert() stores the swaps that must be done here, postinsert() does the swaps + uint32_t* swapArray; //contains physical positions + uint32_t swapArrayLen; //set in preinsert() + + uint32_t lastCandIdx; + + Counter statSwaps; + + public: + ZArray(uint32_t _numLines, uint32_t _ways, uint32_t _candidates, ReplPolicy* _rp, HashFamily* _hf); + + int32_t lookup(const Address lineAddr, const MemReq* req, bool updateReplacement); + uint32_t preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr); + void postinsert(const Address lineAddr, const MemReq* req, uint32_t candidate); + + //zcache-specific, since timing code needs to know the number of swaps, and these depend on idx + //Should be called after preinsert(). 
Allows intervening lookups + uint32_t getLastCandIdx() const {return lastCandIdx;} + + void initStats(AggregateStat* parentStat); +}; + +// Simple wrapper classes and iterators for candidates in each case; simplifies replacement policy interface without sacrificing performance +// NOTE: All must implement the same interface and be POD (we pass them by value) +struct SetAssocCands { + struct iterator { + uint32_t x; + explicit inline iterator(uint32_t _x) : x(_x) {} + inline void inc() {x++;} //overloading prefix/postfix too messy + inline uint32_t operator*() const { return x; } + inline bool operator==(const iterator& it) const { return it.x == x; } + inline bool operator!=(const iterator& it) const { return it.x != x; } + }; + + uint32_t b, e; + inline SetAssocCands(uint32_t _b, uint32_t _e) : b(_b), e(_e) {} + inline iterator begin() const {return iterator(b);} + inline iterator end() const {return iterator(e);} + inline uint32_t numCands() const { return e-b; } +}; + + +struct ZWalkInfo { + uint32_t pos; + uint32_t lineId; + int32_t parentIdx; + + inline void set(uint32_t p, uint32_t i, int32_t x) {pos = p; lineId = i; parentIdx = x;} +}; + +struct ZCands { + struct iterator { + ZWalkInfo* x; + explicit inline iterator(ZWalkInfo* _x) : x(_x) {} + inline void inc() {x++;} //overloading prefix/postfix too messy + inline uint32_t operator*() const { return x->lineId; } + inline bool operator==(const iterator& it) const { return it.x == x; } + inline bool operator!=(const iterator& it) const { return it.x != x; } + }; + + ZWalkInfo* b; + ZWalkInfo* e; + inline ZCands(ZWalkInfo* _b, ZWalkInfo* _e) : b(_b), e(_e) {} + inline iterator begin() const {return iterator(b);} + inline iterator end() const {return iterator(e);} + inline uint32_t numCands() const { return e-b; } +}; + +#endif // CACHE_ARRAYS_H_ diff --git a/src/coherence_ctrls.cpp b/src/coherence_ctrls.cpp new file mode 100644 index 00000000..4894771c --- /dev/null +++ b/src/coherence_ctrls.cpp @@ -0,0 +1,342 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "coherence_ctrls.h" +#include "cache.h" +#include "network.h" + +/* Do a simple XOR block hash on address to determine its bank. 
Hacky for now, + * should probably have a class that deals with this with a real hash function + * (TODO) + */ +uint32_t MESIBottomCC::getParentId(Address lineAddr) { + //Hash things a bit + uint32_t res = 0; + uint64_t tmp = lineAddr; + for (uint32_t i = 0; i < 4; i++) { + res ^= (uint32_t) ( ((uint64_t)0xffff) & tmp); + tmp = tmp >> 16; + } + return (res % parents.size()); +} + + +void MESIBottomCC::init(const g_vector& _parents, Network* network, const char* name) { + parents.resize(_parents.size()); + parentRTTs.resize(_parents.size()); + for (uint32_t p = 0; p < parents.size(); p++) { + parents[p] = _parents[p]; + parentRTTs[p] = (network)? network->getRTT(name, parents[p]->getName()) : 0; + } +} + + +uint64_t MESIBottomCC::processEviction(Address wbLineAddr, uint32_t lineId, bool lowerLevelWriteback, uint64_t cycle, uint32_t srcId) { + MESIState* state = &array[lineId]; + if (lowerLevelWriteback) { + //If this happens, when tcc issued the invalidations, it got a writeback. This means we have to do a PUTX, i.e. we have to transition to M if we are in E + assert(*state == M || *state == E); //Must have exclusive permission! + *state = M; //Silent E->M transition (at eviction); now we'll do a PUTX + } + uint64_t respCycle = cycle; + switch (*state) { + case I: + break; //Nothing to do + case S: + case E: + { + MemReq req = {wbLineAddr, PUTS, selfId, state, cycle, &ccLock, *state, srcId, 0 /*no flags*/}; + respCycle = parents[getParentId(wbLineAddr)]->access(req); + } + break; + case M: + { + MemReq req = {wbLineAddr, PUTX, selfId, state, cycle, &ccLock, *state, srcId, 0 /*no flags*/}; + respCycle = parents[getParentId(wbLineAddr)]->access(req); + } + break; + + default: panic("!?"); + } + assert_msg(*state == I, "Wrong final state %s on eviction", MESIStateName(*state)); + return respCycle; +} + +uint64_t MESIBottomCC::processAccess(Address lineAddr, uint32_t lineId, AccessType type, uint64_t cycle, uint32_t srcId, uint32_t flags) { + uint64_t respCycle = cycle; + MESIState* state = &array[lineId]; + switch (type) { + // A PUTS/PUTX does nothing w.r.t. higher coherence levels --- it dies here + case PUTS: //Clean writeback, nothing to do (except profiling) + assert(*state != I); + profPUTS.inc(); + break; + case PUTX: //Dirty writeback + assert(*state == M || *state == E); + if (*state == E) { + //Silent transition, record that block was written to + *state = M; + } + profPUTX.inc(); + break; + case GETS: + if (*state == I) { + uint32_t parentId = getParentId(lineAddr); + MemReq req = {lineAddr, GETS, selfId, state, cycle, &ccLock, *state, srcId, flags}; + uint32_t nextLevelLat = parents[parentId]->access(req) - cycle; + uint32_t netLat = parentRTTs[parentId]; + profGETNextLevelLat.inc(nextLevelLat); + profGETNetLat.inc(netLat); + respCycle += nextLevelLat + netLat; + profGETSMiss.inc(); + assert(*state == S || *state == E); + } else { + profGETSHit.inc(); + } + break; + case GETX: + if (*state == I || *state == S) { + //Profile before access, state changes + if (*state == I) profGETXMissIM.inc(); + else profGETXMissSM.inc(); + uint32_t parentId = getParentId(lineAddr); + MemReq req = {lineAddr, GETX, selfId, state, cycle, &ccLock, *state, srcId, flags}; + uint32_t nextLevelLat = parents[parentId]->access(req) - cycle; + uint32_t netLat = parentRTTs[parentId]; + profGETNextLevelLat.inc(nextLevelLat); + profGETNetLat.inc(netLat); + respCycle += nextLevelLat + netLat; + } else { + if (*state == E) { + // Silent transition + // NOTE: When do we silent-transition E->M on an ML hierarchy... 
on a GETX, or on a PUTX? + /* Actually, on both: on a GETX b/c line's going to be modified anyway, and must do it if it is the L1 (it's OK not + * to transition if L2+, we'll TX on the PUTX or invalidate, but doing it this way minimizes the differences between + * L1 and L2+ controllers); and on a PUTX, because receiving a PUTX while we're in E indicates the child did a silent + * transition and now that it is evictiong, it's our turn to maintain M info. + */ + *state = M; + } + profGETXHit.inc(); + } + assert_msg(*state == M, "Wrong final state on GETX, lineId %d numLines %d, finalState %s", lineId, numLines, MESIStateName(*state)); + break; + + default: panic("!?"); + } + assert_msg(respCycle >= cycle, "XXX %ld %ld", respCycle, cycle); + return respCycle; +} + +void MESIBottomCC::processWritebackOnAccess(Address lineAddr, uint32_t lineId, AccessType type) { + MESIState* state = &array[lineId]; + assert(*state == M || *state == E); + if (*state == E) { + //Silent transition to M if in E + *state = M; + } +} + +void MESIBottomCC::processInval(Address lineAddr, uint32_t lineId, InvType type, bool* reqWriteback) { + MESIState* state = &array[lineId]; + assert(*state != I); + switch (type) { + case INVX: //lose exclusivity + //Hmmm, do we have to propagate loss of exclusivity down the tree? (nah, topcc will do this automatically -- it knows the final state, always!) + assert_msg(*state == E || *state == M, "Invalid state %s", MESIStateName(*state)); + if (*state == M) *reqWriteback = true; + *state = S; + profINVX.inc(); + break; + case INV: //invalidate + assert(*state != I); + if (*state == M) *reqWriteback = true; + *state = I; + profINV.inc(); + break; + case FWD: //forward + assert_msg(*state == S, "Invalid state %s on FWD", MESIStateName(*state)); + profFWD.inc(); + break; + default: panic("!?"); + } + //NOTE: BottomCC never calls up on an invalidate, so it adds no extra latency +} + + +uint64_t MESIBottomCC::processNonInclusiveWriteback(Address lineAddr, AccessType type, uint64_t cycle, MESIState* state, uint32_t srcId, uint32_t flags) { + if (!nonInclusiveHack) panic("Non-inclusive %s on line 0x%lx, this cache should be inclusive", AccessTypeName(type), lineAddr); + + //info("Non-inclusive wback, forwarding"); + MemReq req = {lineAddr, type, selfId, state, cycle, &ccLock, *state, srcId, flags | MemReq::NONINCLWB}; + uint64_t respCycle = parents[getParentId(lineAddr)]->access(req); + return respCycle; +} + + +/* MESITopCC implementation */ + +void MESITopCC::init(const g_vector& _children, Network* network, const char* name) { + if (_children.size() > MAX_CACHE_CHILDREN) { + panic("[%s] Children size (%d) > MAX_CACHE_CHILDREN (%d)", name, (uint32_t)_children.size(), MAX_CACHE_CHILDREN); + } + children.resize(_children.size()); + childrenRTTs.resize(_children.size()); + for (uint32_t c = 0; c < children.size(); c++) { + children[c] = _children[c]; + childrenRTTs[c] = (network)? network->getRTT(name, children[c]->getName()) : 0; + } +} + +uint64_t MESITopCC::sendInvalidates(Address lineAddr, uint32_t lineId, InvType type, bool* reqWriteback, uint64_t cycle, uint32_t srcId) { + //Send down downgrades/invalidates + Entry* e = &array[lineId]; + + //Don't propagate downgrades if sharers are not exclusive. 
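+        // (INVX only needs to strip write permission: if no child holds the line in E/M, every
+        //  sharer is already read-only, so there is nothing to downgrade and we return immediately.)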
+ if (type == INVX && !e->isExclusive()) { + return cycle; + } + + uint64_t maxCycle = cycle; //keep maximum cycle only, we assume all invals are sent in parallel + if (!e->isEmpty()) { + uint32_t numChildren = children.size(); + uint32_t sentInvs = 0; + for (uint32_t c = 0; c < numChildren; c++) { + if (e->sharers[c]) { + uint64_t respCycle = children[c]->invalidate(lineAddr, type, reqWriteback, cycle, srcId); + respCycle += childrenRTTs[c]; + maxCycle = MAX(respCycle, maxCycle); + if (type == INV) e->sharers[c] = false; + sentInvs++; + } + } + assert(sentInvs == e->numSharers); + if (type == INV) { + e->numSharers = 0; + } else { + //TODO: This is kludgy -- once the sharers format is more sophisticated, handle downgrades with a different codepath + assert(e->exclusive); + assert(e->numSharers == 1); + e->exclusive = false; + } + } + return maxCycle; +} + + +uint64_t MESITopCC::processEviction(Address wbLineAddr, uint32_t lineId, bool* reqWriteback, uint64_t cycle, uint32_t srcId) { + if (nonInclusiveHack) { + // Don't invalidate anything, just clear our entry + array[lineId].clear(); + return cycle; + } else { + //Send down invalidates + return sendInvalidates(wbLineAddr, lineId, INV, reqWriteback, cycle, srcId); + } +} + +uint64_t MESITopCC::processAccess(Address lineAddr, uint32_t lineId, AccessType type, uint32_t childId, bool haveExclusive, + MESIState* childState, bool* inducedWriteback, uint64_t cycle, uint32_t srcId, uint32_t flags) { + Entry* e = &array[lineId]; + uint64_t respCycle = cycle; + switch (type) { + case PUTX: + assert(e->isExclusive()); + if (flags & MemReq::PUTX_KEEPEXCL) { + assert(e->sharers[childId]); + assert(*childState == M); + *childState = E; //they don't hold dirty data anymore + break; //don't remove from sharer set. It'll keep exclusive perms. + } + //note NO break in general + case PUTS: + assert(e->sharers[childId]); + e->sharers[childId] = false; + e->numSharers--; + *childState = I; + break; + case GETS: + if (e->isEmpty() && haveExclusive && !(flags & MemReq::NOEXCL)) { + //Give in E state + e->exclusive = true; + e->sharers[childId] = true; + e->numSharers = 1; + *childState = E; + } else { + //Give in S state + assert(e->sharers[childId] == false); + + if (e->isExclusive()) { + //Downgrade the exclusive sharer + respCycle = sendInvalidates(lineAddr, lineId, INVX, inducedWriteback, cycle, srcId); + } + + assert_msg(!e->isExclusive(), "Can't have exclusivity here. 
isExcl=%d excl=%d numSharers=%d", e->isExclusive(), e->exclusive, e->numSharers); + + e->sharers[childId] = true; + e->numSharers++; + e->exclusive = false; //dsm: Must set, we're explicitly non-exclusive + *childState = S; + } + break; + case GETX: + assert(haveExclusive); //the current cache better have exclusive access to this line + + // If child is in sharers list (this is an upgrade miss), take it out + if (e->sharers[childId]) { + assert_msg(!e->isExclusive(), "Spurious GETX, childId=%d numSharers=%d isExcl=%d excl=%d", childId, e->numSharers, e->isExclusive(), e->exclusive); + e->sharers[childId] = false; + e->numSharers--; + } + + // Invalidate all other copies + respCycle = sendInvalidates(lineAddr, lineId, INV, inducedWriteback, cycle, srcId); + + // Set current sharer, mark exclusive + e->sharers[childId] = true; + e->numSharers++; + e->exclusive = true; + + assert(e->numSharers == 1); + + *childState = M; //give in M directly + break; + + default: panic("!?"); + } + + return respCycle; +} + +uint64_t MESITopCC::processInval(Address lineAddr, uint32_t lineId, InvType type, bool* reqWriteback, uint64_t cycle, uint32_t srcId) { + if (type == FWD) {//if it's a FWD, we should be inclusive for now, so we must have the line, just invLat works + assert(!nonInclusiveHack); //dsm: ask me if you see this failing and don't know why + return cycle; + } else { + //Just invalidate or downgrade down to children as needed + return sendInvalidates(lineAddr, lineId, type, reqWriteback, cycle, srcId); + } +} + diff --git a/src/coherence_ctrls.h b/src/coherence_ctrls.h new file mode 100644 index 00000000..2a875f40 --- /dev/null +++ b/src/coherence_ctrls.h @@ -0,0 +1,498 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef COHERENCE_CTRLS_H_ +#define COHERENCE_CTRLS_H_ + +#include +#include "constants.h" +#include "g_std/g_string.h" +#include "g_std/g_vector.h" +#include "locks.h" +#include "memory_hierarchy.h" +#include "pad.h" +#include "stats.h" + +//TODO: Now that we have a pure CC interface, the MESI controllers should go on different files. 
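+// Rough call sequence from the owning cache, as inferred from this interface (a sketch, not a
+// normative contract): startAccess() -> [on a miss: shouldAllocate(), then processEviction() for
+// the chosen victim] -> processAccess() -> endAccess(). Invalidations arriving from above use
+// startInv() -> processInv().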
+ +/* Generic, integrated controller interface */ +class CC : public GlobAlloc { + public: + //Initialization + virtual void setParents(uint32_t childId, const g_vector& parents, Network* network) = 0; + virtual void setChildren(const g_vector& children, Network* network) = 0; + virtual void initStats(AggregateStat* cacheStat) = 0; + + //Access methods; see Cache for call sequence + virtual bool startAccess(MemReq& req) = 0; //initial locking, address races; returns true if access should be skipped; may change req! + virtual bool shouldAllocate(const MemReq& req) = 0; //called when we don't find req's lineAddr in the array + virtual uint64_t processEviction(const MemReq& triggerReq, Address wbLineAddr, int32_t lineId, uint64_t startCycle) = 0; //called iff shouldAllocate returns true + virtual uint64_t processAccess(const MemReq& req, int32_t lineId, uint64_t startCycle, uint64_t* getDoneCycle = NULL) = 0; + virtual void endAccess(const MemReq& req) = 0; + + //Inv methods + virtual void startInv() = 0; + virtual uint64_t processInv(Address lineAddr, int32_t lineId, InvType type, bool* reqWriteback, uint64_t startCycle, uint32_t srcId) = 0; + + //Repl policy interface + virtual uint32_t numSharers(uint32_t lineId) = 0; + virtual bool isValid(uint32_t lineId) = 0; +}; + + +/* A MESI coherence controller is decoupled in two: + * - The BOTTOM controller, which deals with keeping coherence state with respect to the upper level and issues + * requests (accesses) to upper levels. + * - The TOP controller, which keeps state of lines w.r.t. lower levels of the hierarchy (e.g. sharer lists), + * and issues requests (invalidates) to lower levels. + * The naming scheme is PROTOCOL-CENTRIC, i.e. if you draw a multi-level hierarchy, between each pair of levels + * there is a top CC at the top and a bottom CC at the bottom. Unfortunately, if you look at the caches, the + * bottom CC is at the top is at the bottom. So the cache class may seem a bit weird at times, but the controller + * classes make more sense. + */ + +class Cache; +class Network; + +/* NOTE: To avoid virtual function overheads, there is no BottomCC interface, since we only have a MESI controller for now */ + +class MESIBottomCC : public GlobAlloc { + private: + MESIState* array; + g_vector parents; + g_vector parentRTTs; + uint32_t numLines; + uint32_t selfId; + + //Profiling counters + Counter profGETSHit, profGETSMiss, profGETXHit, profGETXMissIM /*from invalid*/, profGETXMissSM /*from S, i.e. 
upgrade misses*/; + Counter profPUTS, profPUTX /*received from downstream*/; + Counter profINV, profINVX, profFWD /*received from upstream*/; + //Counter profWBIncl, profWBCoh /* writebacks due to inclusion or coherence, received from downstream, does not include PUTS */; + // TODO: Measuring writebacks is messy, do if needed + Counter profGETNextLevelLat, profGETNetLat; + + bool nonInclusiveHack; + + PAD(); + lock_t ccLock; + PAD(); + + public: + MESIBottomCC(uint32_t _numLines, uint32_t _selfId, bool _nonInclusiveHack) : numLines(_numLines), selfId(_selfId), nonInclusiveHack(_nonInclusiveHack) { + array = gm_calloc(numLines); + for (uint32_t i = 0; i < numLines; i++) { + array[i] = I; + } + futex_init(&ccLock); + } + + void init(const g_vector& _parents, Network* network, const char* name); + + inline bool isExclusive(uint32_t lineId) { + MESIState state = array[lineId]; + return (state == E) || (state == M); + } + + void initStats(AggregateStat* parentStat) { + profGETSHit.init("hGETS", "GETS hits"); + profGETXHit.init("hGETX", "GETX hits"); + profGETSMiss.init("mGETS", "GETS misses"); + profGETXMissIM.init("mGETXIM", "GETX I->M misses"); + profGETXMissSM.init("mGETXSM", "GETX S->M misses (upgrade misses)"); + profPUTS.init("PUTS", "Clean evictions (from lower level)"); + profPUTX.init("PUTX", "Dirty evictions (from lower level)"); + profINV.init("INV", "Invalidates (from upper level)"); + profINVX.init("INVX", "Downgrades (from upper level)"); + profFWD.init("FWD", "Forwards (from upper level)"); + profGETNextLevelLat.init("latGETnl", "GET request latency on next level"); + profGETNetLat.init("latGETnet", "GET request latency on network to next level"); + + parentStat->append(&profGETSHit); + parentStat->append(&profGETXHit); + parentStat->append(&profGETSMiss); + parentStat->append(&profGETXMissIM); + parentStat->append(&profGETXMissSM); + parentStat->append(&profPUTS); + parentStat->append(&profPUTX); + parentStat->append(&profINV); + parentStat->append(&profINVX); + parentStat->append(&profFWD); + parentStat->append(&profGETNextLevelLat); + parentStat->append(&profGETNetLat); + } + + uint64_t processEviction(Address wbLineAddr, uint32_t lineId, bool lowerLevelWriteback, uint64_t cycle, uint32_t srcId); + + uint64_t processAccess(Address lineAddr, uint32_t lineId, AccessType type, uint64_t cycle, uint32_t srcId, uint32_t flags); + + void processWritebackOnAccess(Address lineAddr, uint32_t lineId, AccessType type); + + void processInval(Address lineAddr, uint32_t lineId, InvType type, bool* reqWriteback); + + uint64_t processNonInclusiveWriteback(Address lineAddr, AccessType type, uint64_t cycle, MESIState* state, uint32_t srcId, uint32_t flags); + + inline void lock() { + futex_lock(&ccLock); + } + + inline void unlock() { + futex_unlock(&ccLock); + } + + /* Replacement policy query interface */ + inline bool isValid(uint32_t lineId) { + return array[lineId] != I; + } + + //Could extend with isExclusive, isDirty, etc, but not needed for now. 
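+        // (getParentId() below maps a line address to one of the parent banks with a simple
+        //  XOR-fold hash; see its definition in coherence_ctrls.cpp.)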
+ + private: + uint32_t getParentId(Address lineAddr); +}; + + +//Implements the "top" part: Keeps directory information, handles downgrades and invalidates +class MESITopCC : public GlobAlloc { + private: + struct Entry { + uint32_t numSharers; + std::bitset sharers; + bool exclusive; + + void clear() { + exclusive = false; + numSharers = 0; + sharers.reset(); + } + + bool isEmpty() { + return numSharers == 0; + } + + bool isExclusive() { + return (numSharers == 1) && (exclusive); + } + }; + + Entry* array; + g_vector children; + g_vector childrenRTTs; + uint32_t numLines; + + bool nonInclusiveHack; + + PAD(); + lock_t ccLock; + PAD(); + + public: + MESITopCC(uint32_t _numLines, bool _nonInclusiveHack) : numLines(_numLines), nonInclusiveHack(_nonInclusiveHack) { + array = gm_calloc(numLines); + for (uint32_t i = 0; i < numLines; i++) { + array[i].clear(); + } + + futex_init(&ccLock); + } + + void init(const g_vector& _children, Network* network, const char* name); + + uint64_t processEviction(Address wbLineAddr, uint32_t lineId, bool* reqWriteback, uint64_t cycle, uint32_t srcId); + + uint64_t processAccess(Address lineAddr, uint32_t lineId, AccessType type, uint32_t childId, bool haveExclusive, + MESIState* childState, bool* inducedWriteback, uint64_t cycle, uint32_t srcId, uint32_t flags); + + uint64_t processInval(Address lineAddr, uint32_t lineId, InvType type, bool* reqWriteback, uint64_t cycle, uint32_t srcId); + + inline void lock() { + futex_lock(&ccLock); + } + + inline void unlock() { + futex_unlock(&ccLock); + } + + /* Replacement policy query interface */ + inline uint32_t numSharers(uint32_t lineId) { + return array[lineId].numSharers; + } + + private: + uint64_t sendInvalidates(Address lineAddr, uint32_t lineId, InvType type, bool* reqWriteback, uint64_t cycle, uint32_t srcId); +}; + +static inline bool CheckForMESIRace(AccessType& type, MESIState* state, MESIState initialState) { + //NOTE: THIS IS THE ONLY CODE THAT SHOULD DEAL WITH RACES. tcc, bcc et al should be written as if they were race-free. + bool skipAccess = false; + if (*state != initialState) { + //info("[%s] Race on line 0x%lx, %s by childId %d, was state %s, now %s", name.c_str(), lineAddr, accessTypeNames[type], childId, mesiStateNames[initialState], mesiStateNames[*state]); + //An intervening invalidate happened! Two types of races: + if (type == PUTS || type == PUTX) { //either it is a PUT... + //We want to get rid of this line + if (*state == I) { + //If it was already invalidated (INV), just skip access altogether, we're already done + skipAccess = true; + } else { + //We were downgraded (INVX), still need to do the PUT + assert(*state == S); + //If we wanted to do a PUTX, just change it to a PUTS b/c now the line is not exclusive anymore + if (type == PUTX) type = PUTS; + } + } else if (type == GETX) { //...or it is a GETX + //In this case, the line MUST have been in S and have been INValidated + assert(initialState == S); + assert(*state == I); + //Do nothing. This is still a valid GETX, only it is not an upgrade miss anymore + } else { //no GETSs can race with INVs, if we are doing a GETS it's because the line was invalid to begin with! 
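+            // (Reaching this branch means a GETS observed a state change, which should be
+            //  impossible: a child only issues GETS from I, and an invalid line cannot be
+            //  invalidated again, hence the panic.)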
+ panic("Invalid true race happened (?)"); + } + } + return skipAccess; +} + +// Non-terminal CC; accepts GETS/X and PUTS/X accesses +class MESICC : public CC { + private: + MESITopCC* tcc; + MESIBottomCC* bcc; + uint32_t numLines; + bool nonInclusiveHack; + g_string name; + + public: + //Initialization + MESICC(uint32_t _numLines, bool _nonInclusiveHack, g_string& _name) : tcc(NULL), bcc(NULL), + numLines(_numLines), nonInclusiveHack(_nonInclusiveHack), name(_name) {} + + void setParents(uint32_t childId, const g_vector& parents, Network* network) { + bcc = new MESIBottomCC(numLines, childId, nonInclusiveHack); + bcc->init(parents, network, name.c_str()); + } + + void setChildren(const g_vector& children, Network* network) { + tcc = new MESITopCC(numLines, nonInclusiveHack); + tcc->init(children, network, name.c_str()); + } + + void initStats(AggregateStat* cacheStat) { + //no tcc stats + bcc->initStats(cacheStat); + } + + //Access methods + bool startAccess(MemReq& req) { + assert((req.type == GETS) || (req.type == GETX) || (req.type == PUTS) || (req.type == PUTX)); + + /* Child should be locked when called. We do hand-over-hand locking when going + * down (which is why we require the lock), but not when going up, opening the + * child to invalidation races here to avoid deadlocks. + */ + if (req.childLock) { + futex_unlock(req.childLock); + } + + tcc->lock(); //must lock tcc FIRST + bcc->lock(); + + /* The situation is now stable, true race-wise. No one can touch the child state, because we hold + * both parent's locks. So, we first handle races, which may cause us to skip the access. + */ + bool skipAccess = CheckForMESIRace(req.type /*may change*/, req.state, req.initialState); + return skipAccess; + } + + bool shouldAllocate(const MemReq& req) { + if ((req.type == GETS) || (req.type == GETX)) { + return true; + } else { + assert((req.type == PUTS) || (req.type == PUTX)); + if (!nonInclusiveHack) { + panic("[%s] We lost inclusion on this line! 0x%lx, type %s, childId %d, childState %s", name.c_str(), + req.lineAddr, AccessTypeName(req.type), req.childId, MESIStateName(*req.state)); + } + return false; + } + } + + uint64_t processEviction(const MemReq& triggerReq, Address wbLineAddr, int32_t lineId, uint64_t startCycle) { + bool lowerLevelWriteback = false; + uint64_t evCycle = tcc->processEviction(wbLineAddr, lineId, &lowerLevelWriteback, startCycle, triggerReq.srcId); //1. if needed, send invalidates/downgrades to lower level + evCycle = bcc->processEviction(wbLineAddr, lineId, lowerLevelWriteback, evCycle, triggerReq.srcId); //2. if needed, write back line to upper level + return evCycle; + } + + uint64_t processAccess(const MemReq& req, int32_t lineId, uint64_t startCycle, uint64_t* getDoneCycle = NULL) { + uint64_t respCycle = startCycle; + //Handle non-inclusive writebacks by bypassing + //NOTE: Most of the time, these are due to evictions, so the line is not there. But the second condition can trigger in NUCA-initiated + //invalidations. The alternative with this would be to capture these blocks, since we have space anyway. This is so rare is doesn't matter, + //but if we do proper NI/EX mid-level caches backed by directories, this may start becoming more common (and it is perfectly acceptable to + //upgrade without any interaction with the parent... the child had the permissions!) 
+ if (lineId == -1 || (((req.type == PUTS) || (req.type == PUTX)) && !bcc->isValid(lineId))) { //can only be a non-inclusive wback + assert(nonInclusiveHack); + assert((req.type == PUTS) || (req.type == PUTX)); + respCycle = bcc->processNonInclusiveWriteback(req.lineAddr, req.type, startCycle, req.state, req.srcId, req.flags); + } else { + //Prefetches are side requests and get handled a bit differently + bool isPrefetch = req.flags & MemReq::PREFETCH; + assert(!isPrefetch || req.type == GETS); + uint32_t flags = req.flags & ~MemReq::PREFETCH; //always clear PREFETCH, this flag cannot propagate up + + //if needed, fetch line or upgrade miss from upper level + respCycle = bcc->processAccess(req.lineAddr, lineId, req.type, startCycle, req.srcId, flags); + if (getDoneCycle) *getDoneCycle = respCycle; + if (!isPrefetch) { //prefetches only touch bcc; the demand request from the core will pull the line to lower level + //At this point, the line is in a good state w.r.t. upper levels + bool lowerLevelWriteback = false; + //change directory info, invalidate other children if needed, tell requester about its state + respCycle = tcc->processAccess(req.lineAddr, lineId, req.type, req.childId, bcc->isExclusive(lineId), req.state, + &lowerLevelWriteback, respCycle, req.srcId, flags); + if (lowerLevelWriteback) { + //Essentially, if tcc induced a writeback, bcc may need to do an E->M transition to reflect that the cache now has dirty data + bcc->processWritebackOnAccess(req.lineAddr, lineId, req.type); + } + } + } + return respCycle; + } + + void endAccess(const MemReq& req) { + //Relock child before we unlock ourselves (hand-over-hand) + if (req.childLock) { + futex_lock(req.childLock); + } + + bcc->unlock(); + tcc->unlock(); + } + + //Inv methods + void startInv() { + bcc->lock(); //note we don't grab tcc; tcc serializes multiple up accesses, down accesses don't see it + } + + uint64_t processInv(Address lineAddr, int32_t lineId, InvType type, bool* reqWriteback, uint64_t startCycle, uint32_t srcId) { + uint64_t respCycle = tcc->processInval(lineAddr, lineId, type, reqWriteback, startCycle, srcId); //send invalidates or downgrades to children + bcc->processInval(lineAddr, lineId, type, reqWriteback); //adjust our own state + + bcc->unlock(); + return respCycle; + } + + //Repl policy interface + uint32_t numSharers(uint32_t lineId) {return tcc->numSharers(lineId);} + bool isValid(uint32_t lineId) {return bcc->isValid(lineId);} +}; + +// Terminal CC, i.e., without children --- accepts GETS/X, but not PUTS/X +class MESITerminalCC : public CC { + private: + MESIBottomCC* bcc; + uint32_t numLines; + g_string name; + + public: + //Initialization + MESITerminalCC(uint32_t _numLines, const g_string& _name) : bcc(NULL), numLines(_numLines), name(_name) {} + + void setParents(uint32_t childId, const g_vector& parents, Network* network) { + bcc = new MESIBottomCC(numLines, childId, false /*inclusive*/); + bcc->init(parents, network, name.c_str()); + } + + void setChildren(const g_vector& children, Network* network) { + panic("[%s] MESITerminalCC::setChildren cannot be called -- terminal caches cannot have children!", name.c_str()); + } + + void initStats(AggregateStat* cacheStat) { + bcc->initStats(cacheStat); + } + + //Access methods + bool startAccess(MemReq& req) { + assert((req.type == GETS) || (req.type == GETX)); //no puts! + + /* Child should be locked when called. 
We do hand-over-hand locking when going + * down (which is why we require the lock), but not when going up, opening the + * child to invalidation races here to avoid deadlocks. + */ + if (req.childLock) { + futex_unlock(req.childLock); + } + + bcc->lock(); + + /* The situation is now stable, true race-wise. No one can touch the child state, because we hold + * both parent's locks. So, we first handle races, which may cause us to skip the access. + */ + bool skipAccess = CheckForMESIRace(req.type /*may change*/, req.state, req.initialState); + return skipAccess; + } + + bool shouldAllocate(const MemReq& req) { + return true; + } + + uint64_t processEviction(const MemReq& triggerReq, Address wbLineAddr, int32_t lineId, uint64_t startCycle) { + bool lowerLevelWriteback = false; + uint64_t endCycle = bcc->processEviction(wbLineAddr, lineId, lowerLevelWriteback, startCycle, triggerReq.srcId); //2. if needed, write back line to upper level + return endCycle; // critical path unaffected, but TimingCache needs it + } + + uint64_t processAccess(const MemReq& req, int32_t lineId, uint64_t startCycle, uint64_t* getDoneCycle = NULL) { + assert(lineId != -1); + assert(!getDoneCycle); + //if needed, fetch line or upgrade miss from upper level + uint64_t respCycle = bcc->processAccess(req.lineAddr, lineId, req.type, startCycle, req.srcId, req.flags); + //at this point, the line is in a good state w.r.t. upper levels + return respCycle; + } + + void endAccess(const MemReq& req) { + //Relock child before we unlock ourselves (hand-over-hand) + if (req.childLock) { + futex_lock(req.childLock); + } + bcc->unlock(); + } + + //Inv methods + void startInv() { + bcc->lock(); + } + + uint64_t processInv(Address lineAddr, int32_t lineId, InvType type, bool* reqWriteback, uint64_t startCycle, uint32_t srcId) { + bcc->processInval(lineAddr, lineId, type, reqWriteback); //adjust our own state + bcc->unlock(); + return startCycle; //no extra delay in terminal caches + } + + //Repl policy interface + uint32_t numSharers(uint32_t lineId) {return 0;} //no sharers + bool isValid(uint32_t lineId) {return bcc->isValid(lineId);} +}; + +#endif // COHERENCE_CTRLS_H_ diff --git a/src/config.cpp b/src/config.cpp new file mode 100644 index 00000000..3eac351a --- /dev/null +++ b/src/config.cpp @@ -0,0 +1,352 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#include "config.h" +#include +#include +#include +#include +#include +#include "libconfig.h++" +#include "log.h" + +using std::string; +using std::stringstream; +using std::vector; + +// Restrict use of long long, which libconfig uses as its int64 +typedef long long lc_int64; // NOLINT(runtime/int) + +Config::Config(const char* inFile) { + inCfg = new libconfig::Config(); + outCfg = new libconfig::Config(); + try { + inCfg->readFile(inFile); + } catch (libconfig::FileIOException fioe) { + panic("Input config file %s could not be read", inFile); + } catch (libconfig::ParseException pe) { + panic("Input config file %s could not be parsed, line %d, error: %s", pe.getFile(), pe.getLine(), pe.getError()); + } +} + +Config::~Config() { + delete inCfg; + delete outCfg; +} + +// Helper function: Add "*"-prefixed vars, which are used by our scripts but not zsim, to outCfg +// Returns number of copied vars +static uint32_t copyNonSimVars(libconfig::Setting& s1, libconfig::Setting& s2, std::string prefix) { + uint32_t copied = 0; + for (uint32_t i = 0; i < (uint32_t)s1.getLength(); i++) { + const char* name = s1[i].getName(); + if (name[0] == '*') { + if (s2.exists(name)) panic("Setting %s was read, should be private", (prefix + name).c_str()); + // This could be as simple as: + //s2.add(s1[i].getType()) = s1[i]; + // However, because Setting kinda sucks, we need to go type by type: + libconfig::Setting& ns = s2.add(name, s1[i].getType()); + if (libconfig::Setting::Type::TypeInt == s1[i].getType()) ns = (int) s1[i]; + else if (libconfig::Setting::Type::TypeInt64 == s1[i].getType()) ns = (lc_int64) s1[i]; + else if (libconfig::Setting::Type::TypeBoolean == s1[i].getType()) ns = (bool) s1[i]; + else if (libconfig::Setting::Type::TypeString == s1[i].getType()) ns = (const char*) s1[i]; + else panic("Unknown type for priv setting %s, cannot copy", (prefix + name).c_str()); + copied++; + } + + if (s1[i].isGroup() && s2.exists(name)) { + copied += copyNonSimVars(s1[i], s2[name], prefix + name + "."); + } + } + return copied; +} + +// Helper function: Compares two settings recursively, checking for inclusion +// Returns number of settings without inclusion (given but unused) +static uint32_t checkIncluded(libconfig::Setting& s1, libconfig::Setting& s2, std::string prefix) { + uint32_t unused = 0; + for (uint32_t i = 0; i < (uint32_t)s1.getLength(); i++) { + const char* name = s1[i].getName(); + if (!s2.exists(name)) { + warn("Setting %s not used during configuration", (prefix + name).c_str()); + unused++; + } else if (s1[i].isGroup()) { + unused += checkIncluded(s1[i], s2[name], prefix + name + "."); + } + } + return unused; +} + + + +//Called when initialization ends. Writes output config, and emits warnings for unused input settings +void Config::writeAndClose(const char* outFile, bool strictCheck) { + uint32_t nonSimVars = copyNonSimVars(inCfg->getRoot(), outCfg->getRoot(), std::string("")); + uint32_t unused = checkIncluded(inCfg->getRoot(), outCfg->getRoot(), std::string("")); + + if (nonSimVars) info("Copied %d non-sim var%s to output config", nonSimVars, (nonSimVars > 1)? "s" : ""); + if (unused) { + if (strictCheck) { + panic("%d setting%s not used during configuration", unused, (unused > 1)? "s" : ""); + } else { + warn("%d setting%s not used during configuration", unused, (unused > 1)? 
"s" : ""); + } + } + + try { + outCfg->writeFile(outFile); + } catch (libconfig::FileIOException fioe) { + panic("Output config file %s could not be written", outFile); + } +} + + +bool Config::exists(const char* key) { + return inCfg->exists(key); +} + +//Helper functions +template static const char* getTypeName(); +template<> const char* getTypeName() {return "uint32";} +template<> const char* getTypeName() {return "uint64";} +template<> const char* getTypeName() {return "bool";} +template<> const char* getTypeName() {return "string";} +template<> const char* getTypeName() {return "double";} + +typedef libconfig::Setting::Type SType; +template static SType getSType(); +template<> SType getSType() {return SType::TypeInt;} +template<> SType getSType() {return SType::TypeInt64;} +template<> SType getSType() {return SType::TypeBoolean;} +template<> SType getSType() {return SType::TypeString;} +template<> SType getSType() {return SType::TypeFloat;} + +template static bool getEq(T v1, T v2); +template<> bool getEq(int v1, int v2) {return v1 == v2;} +template<> bool getEq(lc_int64 v1, lc_int64 v2) {return v1 == v2;} +template<> bool getEq(bool v1, bool v2) {return v1 == v2;} +template<> bool getEq(const char* v1, const char* v2) {return strcmp(v1, v2) == 0;} +template<> bool getEq(double v1, double v2) {return v1 == v2;} + +template static void writeVar(libconfig::Setting& setting, const char* key, T val) { + //info("writeVal %s", key); + const char* sep = strchr(key, '.'); + if (sep) { + assert(*sep == '.'); + uint32_t plen = (size_t)(sep-key); + char prefix[plen+1]; + strncpy(prefix, key, plen); + prefix[plen] = 0; + // libconfig strdups all passed strings, so it's fine that prefix is local. + if (!setting.exists(prefix)) { + try { + setting.add((const char*)prefix, SType::TypeGroup); + } catch (libconfig::SettingNameException sne) { + panic("libconfig error adding group setting %s", prefix); + } + } + libconfig::Setting& child = setting[(const char*)prefix]; + writeVar(child, sep+1, val); + } else { + if (!setting.exists(key)) { + try { + setting.add(key, getSType()) = val; + } catch (libconfig::SettingNameException sne) { + panic("libconfig error adding leaf setting %s", key); + } + } else { + //If this panics, what the hell are you doing in the code? Multiple reads and different defaults?? 
+ T origVal = setting[key]; + if (!getEq(val, origVal)) panic("Duplicate writes to out config key %s with different values!", key); + } + } +} + +template static void writeVar(libconfig::Config* cfg, const char* key, T val) { + libconfig::Setting& setting = cfg->getRoot(); + writeVar(setting, key, val); +} + + +template +T Config::genericGet(const char* key, T def) { + T val; + if (inCfg->exists(key)) { + if (!inCfg->lookupValue(key, val)) { + panic("Type error on optional setting %s, expected type %s", key, getTypeName()); + } + } else { + val = def; + } + writeVar(outCfg, key, val); + return val; +} + +template +T Config::genericGet(const char* key) { + T val; + if (inCfg->exists(key)) { + if (!inCfg->lookupValue(key, val)) { + panic("Type error on mandatory setting %s, expected type %s", key, getTypeName()); + } + } else { + panic("Mandatory setting %s (%s) not found", key, getTypeName()) + } + writeVar(outCfg, key, val); + return val; +} + +//Template specializations for access interface +template<> uint32_t Config::get(const char* key) {return (uint32_t) genericGet(key);} +template<> uint64_t Config::get(const char* key) {return (uint64_t) genericGet(key);} +template<> bool Config::get(const char* key) {return genericGet(key);} +template<> const char* Config::get(const char* key) {return genericGet(key);} +template<> double Config::get(const char* key) {return (double) genericGet(key);} + +template<> uint32_t Config::get(const char* key, uint32_t def) {return (uint32_t) genericGet(key, (int)def);} +template<> uint64_t Config::get(const char* key, uint64_t def) {return (uint64_t) genericGet(key, (lc_int64)def);} +template<> bool Config::get(const char* key, bool def) {return genericGet(key, def);} +template<> const char* Config::get(const char* key, const char* def) {return genericGet(key, def);} +template<> double Config::get(const char* key, double def) {return (double) genericGet(key, (double)def);} + +//Get subgroups in a specific key +void Config::subgroups(const char* key, std::vector& grps) { + if (inCfg->exists(key)) { + libconfig::Setting& s = inCfg->lookup(key); + uint32_t n = s.getLength(); //0 if not a group or list + for (uint32_t i = 0; i < n; i++) { + if (s[i].isGroup()) grps.push_back(s[i].getName()); + } + } +} + + +/* Config value parsing functions */ + +//Range parsing, for process masks + +//Helper, from http://oopweb.com/CPP/Documents/CPPHOWTO/Volume/C++Programming-HOWTO-7.html +void Tokenize(const string& str, vector& tokens, const string& delimiters) { + // Skip delimiters at beginning. + string::size_type lastPos = 0; //dsm: DON'T //str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (string::npos != pos || string::npos != lastPos) { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. 
Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } +} + +struct Range { + int32_t min; + int32_t sup; + int32_t step; + + explicit Range(string r) { + vector t; + Tokenize(r, t, ":"); + vector n; + for (auto s : t) { + stringstream ss(s); + uint32_t x = 0; + ss >> x; + if (ss.fail()) panic("%s in range %s is not a valid number", s.c_str(), r.c_str()); + n.push_back(x); + } + switch (n.size()) { + case 1: + min = n[0]; + sup = min + 1; + step = 1; + break; + case 2: + min = n[0]; + sup = n[1]; + step = 1; + break; + case 3: + min = n[0]; + sup = n[1]; + step = n[2]; + break; + default: + panic("Range '%s' can only have 1-3 numbers delimited by ':', %ld parsed", r.c_str(), n.size()); + } + + //Final error-checking + if (min < 0 || step < 0 || sup < 0) panic("Range %s has negative numbers", r.c_str()); + if (step == 0) panic("Range %s has 0 step!", r.c_str()); + if (min >= sup) panic("Range %s has min >= sup!", r.c_str()); + } + + void fill(vector& mask) { + for (int32_t i = min; i < sup; i += step) { + if (i >= (int32_t)mask.size() || i < 0) panic("Range %d:%d:%d includes out-of-bounds %d (mask limit %ld)", min, step, sup, i, mask.size()-1); + mask[i] = true; + } + } +}; + +std::vector ParseMask(const std::string& maskStr, uint32_t maskSize) { + vector mask; + mask.resize(maskSize); + + vector ranges; + Tokenize(maskStr, ranges, " "); + for (auto r : ranges) { + if (r.length() == 0) continue; + Range range(r); + range.fill(mask); + } + return mask; +} + +//List parsing +template +std::vector ParseList(const std::string& listStr) { + vector nums; + Tokenize(listStr, nums, " "); + + vector res; + for (auto n : nums) { + if (n.length() == 0) continue; + stringstream ss(n); + T x; + ss >> x; + if (ss.fail()) panic("%s in list [%s] could not be parsed", n.c_str(), listStr.c_str()); + res.push_back(x); + } + return res; +} + +//Instantiations +template std::vector ParseList(const std::string& listStr); +template std::vector ParseList(const std::string& listStr); +template std::vector ParseList(const std::string& listStr); diff --git a/src/config.h b/src/config.h new file mode 100644 index 00000000..b75bca7f --- /dev/null +++ b/src/config.h @@ -0,0 +1,102 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef CONFIG_H_ +#define CONFIG_H_ + +/* Thin wrapper around libconfig to: + * - Reduce and simplify init code (tailored interface, not type BS, ...) 
+ * - Strict config: type errors, warnings on unused variables, panic on different defaults + * - Produce a full configuration file with all the variables, including defaults (for config parsing, comparison, etc.) + */ + +#include +#include +#include +#include "log.h" + +namespace libconfig { + class Config; + class Setting; +}; + + +class Config { + private: + libconfig::Config* inCfg; + libconfig::Config* outCfg; + + public: + explicit Config(const char* inFile); + ~Config(); + + //Called when initialization ends. Writes output config, and emits warnings for unused input settings + void writeAndClose(const char* outFile, bool strictCheck); + + bool exists(const char* key); + bool exists(const std::string& key) {return exists(key.c_str());} + + //Access interface + //T can be uint32_t, uint64_t, bool, or const char*. Instantiations are in the cpp file + + // Mandatory values (no default, panics if setting does not exist) + template T get(const char* key); + template T get(const std::string& key) {return get(key.c_str());} + + // Optional values (default) + template T get(const char* key, T def); + template T get(const std::string& key, T def) {return get(key.c_str(), def);} + + //Get subgroups in a specific key + void subgroups(const char* key, std::vector& grps); + void subgroups(const std::string& key, std::vector& grps) {subgroups(key.c_str(), grps);} + + private: + template T genericGet(const char* key); + template T genericGet(const char* key, T def); +}; + + +/* Parsing functions used for configuration */ + +std::vector ParseMask(const std::string& maskStr, uint32_t maskSize); + +/* Parses a space-separated list of T's (typically ints, see/add specializtions in .cpp) + * 0-elem lists are OK + * panics on parsing and size-violation errors + */ +template std::vector ParseList(const std::string& listStr); + +// fills remaining elems till maxSize with fillValue +template std::vector ParseList(const std::string& listStr, uint32_t maxSize, uint32_t fillValue) { + std::vector res = ParseList(listStr); + if (res.size() > maxSize) panic("ParseList: Too many elements, max %d, got %ld", maxSize, res.size()); + while (res.size() < maxSize) res.push_back(fillValue); + return res; +} + +void Tokenize(const std::string& str, std::vector& tokens, const std::string& delimiters); + +#endif // CONFIG_H_ diff --git a/src/constants.h b/src/constants.h new file mode 100644 index 00000000..9db17b68 --- /dev/null +++ b/src/constants.h @@ -0,0 +1,47 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef CONSTANTS_H_ +#define CONSTANTS_H_ + +/* Simulator constants/limits go here, defined by macros */ + +// PIN 2.9 (rev39599) can't do more than 2048 threads... +#define MAX_THREADS (2048) + +// How many children caches can each cache track? Note each bank is a separate child. This impacts sharer bit-vector sizes. +#define MAX_CACHE_CHILDREN (256) +//#define MAX_CACHE_CHILDREN (1024) + +// Complex multiprocess runs need multiple clocks, and multiple port domains +#define MAX_CLOCK_DOMAINS (64) +#define MAX_PORT_DOMAINS (64) + +//Maximum IPC of any implemented core. This is used for adaptive events and will not fail silently if you define new, faster processors. +//If you use it, make sure it does not fail silently if violated. +#define MAX_IPC (4) + +#endif // CONSTANTS_H_ + diff --git a/src/contention_sim.cpp b/src/contention_sim.cpp new file mode 100644 index 00000000..8c22849b --- /dev/null +++ b/src/contention_sim.cpp @@ -0,0 +1,418 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#include "contention_sim.h" +#include +#include +#include +#include +#include +#include +#include +#include "log.h" +#include "ooo_core.h" +#include "timing_core.h" +#include "timing_event.h" +#include "zsim.h" + +//Set to 1 to produce a post-mortem analysis log +#define POST_MORTEM 0 +//#define POST_MORTEM 1 + +bool ContentionSim::CompareEvents::operator()(TimingEvent* lhs, TimingEvent* rhs) const { + return lhs->cycle > rhs->cycle; +} + +bool ContentionSim::CompareDomains::operator()(DomainData* d1, DomainData* d2) const { + uint64_t v1 = d1->queuePrio; + uint64_t v2 = d2->queuePrio; + return (v1 > v2); +} + + +void ContentionSim::SimThreadTrampoline(void* arg) { + ContentionSim* csim = static_cast(arg); + uint32_t thid = __sync_fetch_and_add(&csim->threadTicket, 1); + csim->simThreadLoop(thid); +} + +ContentionSim::ContentionSim(uint32_t _numDomains, uint32_t _numSimThreads) { + numDomains = _numDomains; + numSimThreads = _numSimThreads; + threadsDone = 0; + limit = 0; + lastLimit = 0; + inCSim = false; + + domains = gm_calloc(numDomains); + simThreads = gm_calloc(numSimThreads); + + for (uint32_t i = 0; i < numDomains; i++) { + new (&domains[i].pq) PrioQueue(); + domains[i].curCycle = 0; + futex_init(&domains[i].pqLock); + } + + if ((numDomains % numSimThreads) != 0) panic("numDomains(%d) must be a multiple of numSimThreads(%d) for now", numDomains, numSimThreads); + + for (uint32_t i = 0; i < numSimThreads; i++) { + futex_init(&simThreads[i].wakeLock); + futex_lock(&simThreads[i].wakeLock); //starts locked, so first actual call to lock blocks + simThreads[i].firstDomain = i*numDomains/numSimThreads; + simThreads[i].supDomain = (i+1)*numDomains/numSimThreads; + } + + futex_init(&waitLock); + futex_lock(&waitLock); //wait lock must also start locked + + //futex_init(&testLock); + futex_init(&postMortemLock); + + //Launch domain simulation threads + threadTicket = 0; + __sync_synchronize(); + for (uint32_t i = 0; i < numSimThreads; i++) { + PIN_SpawnInternalThread(SimThreadTrampoline, this, 1024*1024, NULL); + } + + lastCrossing = gm_calloc(numDomains*numDomains*MAX_THREADS); //TODO: refine... 
this allocs too much +} + +void ContentionSim::postInit() { + for (uint32_t i = 0; i < zinfo->numCores; i++) { + TimingCore* tcore = dynamic_cast(zinfo->cores[i]); + if (tcore) { + skipContention = false; + return; + } + OOOCore* ocore = dynamic_cast(zinfo->cores[i]); + if (ocore) { + skipContention = false; + return; + } + } + skipContention = true; +} + +void ContentionSim::initStats(AggregateStat* parentStat) { + AggregateStat* objStat = new AggregateStat(false); + objStat->init("contention", "Contention simulation stats"); + for (uint32_t i = 0; i < numDomains; i++) { + std::stringstream ss; + ss << "domain-" << i; + AggregateStat* domStat = new AggregateStat(); + domStat->init(gm_strdup(ss.str().c_str()), "Domain stats"); +#if PROFILE_CROSSINGS + new (&domains[i].profIncomingCrossings) VectorCounter(); + new (&domains[i].profIncomingCrossingSims) VectorCounter(); + new (&domains[i].profIncomingCrossingHist) VectorCounter(); + domains[i].profIncomingCrossings.init("ixe", "Incoming crossing events", numDomains); + domains[i].profIncomingCrossingSims.init("ixs", "Incoming crossings simulated but held", numDomains); + domains[i].profIncomingCrossingHist.init("ixh", "Incoming crossings held count histogram", 33 /*32 means >31*/); + domStat->append(&domains[i].profIncomingCrossings); + domStat->append(&domains[i].profIncomingCrossingSims); + domStat->append(&domains[i].profIncomingCrossingHist); +#endif + new (&domains[i].profTime) ClockStat(); + domains[i].profTime.init("time", "Weave simulation time"); + domStat->append(&domains[i].profTime); + objStat->append(domStat); + } + parentStat->append(objStat); +} + +void ContentionSim::simulatePhase(uint64_t limit) { + if (skipContention) return; //fastpath when there are no cores to simulate + + this->limit = limit; + assert(limit >= lastLimit); + + //info("simulatePhase limit %ld", limit); + for (uint32_t i = 0; i < zinfo->numCores; i++) { + TimingCore* tcore = dynamic_cast(zinfo->cores[i]); + if (tcore) tcore->cSimStart(); + OOOCore* ocore = dynamic_cast(zinfo->cores[i]); + if (ocore) ocore->cSimStart(); + } + + inCSim = true; + __sync_synchronize(); + + //Wake up sim threads + for (uint32_t i = 0; i < numSimThreads; i++) { + futex_unlock(&simThreads[i].wakeLock); + } + + //Sleep until phase is simulated + futex_lock_nospin(&waitLock); + + inCSim = false; + __sync_synchronize(); + + for (uint32_t i = 0; i < zinfo->numCores; i++) { + TimingCore* tcore = dynamic_cast(zinfo->cores[i]); + if (tcore) tcore->cSimEnd(); + OOOCore* ocore = dynamic_cast(zinfo->cores[i]); + if (ocore) ocore->cSimEnd(); + } + + lastLimit = limit; + __sync_synchronize(); +} + +void ContentionSim::enqueue(TimingEvent* ev, uint64_t cycle) { + assert(inCSim); + assert(ev); + assert_msg(cycle >= lastLimit, "Enqueued event before last limit! cycle %ld min %ld", cycle, lastLimit); + //Hacky, but helpful to chase events scheduled too far ahead due to bugs (e.g., cycle -1). 
We should probably formalize this a bit more + assert_msg(cycle < lastLimit+10*zinfo->phaseLength+10000, "Queued event too far into the future, cycle %ld lastLimit %ld", cycle, lastLimit); + + assert_msg(cycle >= domains[ev->domain].curCycle, "Queued event goes back in time, cycle %ld curCycle %ld", cycle, domains[ev->domain].curCycle); + ev->privCycle = cycle; + assert(ev->numParents == 0); + assert(ev->domain != -1); + assert(ev->domain < (int32_t)numDomains); + + domains[ev->domain].pq.enqueue(ev, cycle); +} + +void ContentionSim::enqueueSynced(TimingEvent* ev, uint64_t cycle) { + assert(!inCSim); + assert(ev && ev->domain != -1); + assert(ev->domain < (int32_t)numDomains); + uint32_t domain = ev->domain; + + futex_lock(&domains[domain].pqLock); + + assert_msg(cycle >= lastLimit, "Enqueued (synced) event before last limit! cycle %ld min %ld", cycle, lastLimit); + //Hacky, but helpful to chase events scheduled too far ahead due to bugs (e.g., cycle -1). We should probably formalize this a bit more + assert_msg(cycle < lastLimit+10*zinfo->phaseLength+10000, "Queued (synced) event too far into the future, cycle %ld lastLimit %ld", cycle, lastLimit); + ev->privCycle = cycle; + assert(ev->numParents == 0); + domains[ev->domain].pq.enqueue(ev, cycle); + + futex_unlock(&domains[domain].pqLock); +} + +void ContentionSim::enqueueCrossing(CrossingEvent* ev, uint64_t cycle, uint32_t srcId, uint32_t srcDomain, uint32_t dstDomain, EventRecorder* evRec) { + CrossingStack& cs = evRec->getCrossingStack(); + bool isFirst = cs.empty(); + bool isResp = false; + CrossingEvent* req = NULL; + if (!isFirst) { + CrossingEvent* b = cs.back(); + if (b->srcDomain == (uint32_t)ev->domain && (uint32_t)b->domain == ev->srcDomain) { + //info("XXX response identified %d->%d", ev->srcDomain, ev->domain); + isResp = true; + req = b; + } + } + + if (!isResp) cs.push_back(ev); + else cs.pop_back(); + + if (isResp) { + req->parentEv->addChild(ev, evRec); + } else { + CrossingEventInfo* last = &lastCrossing[(srcId*numDomains + srcDomain)*numDomains + dstDomain]; + uint64_t srcDomCycle = domains[srcDomain].curCycle; + if (last->cycle > srcDomCycle && last->cycle <= cycle) { //NOTE: With the OOO model, last->cycle > cycle is now possible, since requests are issued in instruction order -> ooo + //Chain to previous req + assert_msg(last->cycle <= cycle, "last->cycle (%ld) > cycle (%ld)", last->cycle, cycle); + last->ev->addChild(ev, evRec); + } else { + //We can't queue --- queue directly (synced, we're in phase 1) + assert(cycle >= srcDomCycle); + //info("Queuing xing %ld %ld (lst eve too old at cycle %ld)", cycle, srcDomCycle, last->cycle); + enqueueSynced(ev, cycle); + } + //Store this one as the last req + last->cycle = cycle; + last->ev = ev; + } +} + +void ContentionSim::simThreadLoop(uint32_t thid) { + info("Started contention simulation thread %d", thid); +#if 0 + //Pin + uint32_t nprocs = sysconf(_SC_NPROCESSORS_ONLN); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + //CPU_SET(domain % nprocs, cpuset); //basically no difference + //CPU_SET(0, cpuset); CPU_SET(1, cpuset); CPU_SET(2, cpuset); CPU_SET(3, cpuset); //don't use hyperthreads; confuses the scheduler to no end, does worse + CPU_SET(thid % nprocs, &cpuset); CPU_SET((thid % nprocs) + (nprocs/2), &cpuset); //pin to a single core but can use both its hyperthreads, works best (~20% gain) + //TODO: Must make this optional, or multiple simulations/machine will work horribly + int r = sched_setaffinity(0 /*calling thread, equiv to syscall(SYS_gettid)*/, sizeof(cpuset), 
&cpuset);
+    assert_msg(r == 0, "sched_setaffinity failed (%d)", r);
+#endif
+    while (true) {
+        futex_lock_nospin(&simThreads[thid].wakeLock);
+
+        if (terminate) {
+            break;
+        }
+
+        //info("%d --- phase start", domain);
+        simulatePhaseThread(thid);
+        //info("%d --- phase end", domain);
+
+        uint32_t val = __sync_add_and_fetch(&threadsDone, 1);
+        if (val == numSimThreads) {
+            threadsDone = 0;
+            futex_unlock(&waitLock); //unblock caller
+        }
+    }
+    info("Finished contention simulation thread %d", thid);
+}
+
+void ContentionSim::simulatePhaseThread(uint32_t thid) {
+    uint32_t thDomains = simThreads[thid].supDomain - simThreads[thid].firstDomain;
+    uint32_t numFinished = 0;
+
+    if (thDomains == 1) {
+        DomainData& domain = domains[simThreads[thid].firstDomain];
+        domain.profTime.start();
+        PrioQueue<TimingEvent, PQ_BLOCKS>& pq = domain.pq;
+        while (pq.size() && pq.firstCycle() < limit) {
+            uint64_t domCycle = domain.curCycle;
+            uint64_t cycle;
+            TimingEvent* te = pq.dequeue(cycle);
+            assert(cycle >= domCycle);
+            if (cycle != domCycle) {
+                domCycle = cycle;
+                domain.curCycle = cycle;
+            }
+            te->run(cycle);
+            uint64_t newCycle = pq.size()? pq.firstCycle() : limit;
+            assert(newCycle >= domCycle);
+            if (newCycle != domCycle) domain.curCycle = newCycle;
+#if POST_MORTEM
+            simThreads[thid].logVec.push_back(std::make_pair(cycle, te));
+#endif
+        }
+        domain.curCycle = limit;
+        domain.profTime.end();
+
+#if POST_MORTEM
+        //Post-mortem
+        if (limit % 10000000 == 0) {
+            futex_lock(&postMortemLock); //serialize output
+            uint32_t uniqueEvs = 0;
+            std::unordered_map<TimingEvent*, std::string> evsSeen;
+            for (std::pair<uint64_t, TimingEvent*> p : simThreads[thid].logVec) {
+                uint64_t cycle = p.first;
+                TimingEvent* te = p.second;
+                std::string desc = evsSeen[te];
+                if (desc == "") { //non-existent
+                    std::stringstream ss;
+                    ss << uniqueEvs << " " << typeid(*te).name();
+                    CrossingEvent* ce = dynamic_cast<CrossingEvent*>(te);
+                    if (ce) {
+                        ss << " slack " << (ce->preSlack + ce->postSlack) << " osc " << ce->origStartCycle << " cnt " << ce->simCount;
+                    }
+
+                    evsSeen[te] = ss.str();
+                    uniqueEvs++;
+                    desc = ss.str();
+                }
+                info("[%d] %ld %s", thid, cycle, desc.c_str());
+            }
+            futex_unlock(&postMortemLock);
+        }
+        simThreads[thid].logVec.clear();
+#endif
+
+    } else {
+        //info("XXX %d / %d %d %d", thid, thDomains, simThreads[thid].supDomain, simThreads[thid].firstDomain);
+
+        std::priority_queue<DomainData*, std::vector<DomainData*>, CompareDomains> domPq;
+        for (uint32_t i = simThreads[thid].firstDomain; i < simThreads[thid].supDomain; i++) {
+            domPq.push(&domains[i]);
+        }
+
+        std::vector<DomainData*> sq1;
+        std::vector<DomainData*> sq2;
+
+        std::vector<DomainData*>& stalledQueue = sq1;
+        std::vector<DomainData*>& nextStalledQueue = sq2;
+
+        while (numFinished < thDomains) {
+            while (domPq.size()) {
+                DomainData* domain = domPq.top();
+                domPq.pop();
+                PrioQueue<TimingEvent, PQ_BLOCKS>& pq = domain->pq;
+                if (!pq.size() || pq.firstCycle() > limit) {
+                    numFinished++;
+                    domain->curCycle = limit;
+                } else {
+                    //info("YYY %d %ld %ld %d", numFinished, domPq.size(), domain->curCycle, domain->prio);
+                    uint64_t cycle;
+                    TimingEvent* te = pq.dequeue(cycle);
+                    //uint64_t nextCycle = pq.size()? pq.firstCycle() : cycle;
+                    if (cycle != domain->curCycle) domain->curCycle = cycle;
+                    te->run(cycle);
+                    domain->curCycle = pq.size()?
pq.firstCycle() : limit; + domain->queuePrio = domain->curCycle; + if (domain->prio == 0) domPq.push(domain); + else stalledQueue.push_back(domain); + } + } + + while (stalledQueue.size()) { + DomainData* domain = stalledQueue.back(); + stalledQueue.pop_back(); + PrioQueue& pq = domain->pq; + if (!pq.size() || pq.firstCycle() > limit) { + numFinished++; + domain->curCycle = limit; + } else { + //info("SSS %d %ld %ld", numFinished, stalledQueue.size(), domain->curCycle); + uint64_t cycle; + TimingEvent* te = pq.dequeue(cycle); + if (cycle != domain->curCycle) domain->curCycle = cycle; + te->state = EV_RUNNING; + te->simulate(cycle); + domain->curCycle = pq.size()? pq.firstCycle() : limit; + domain->queuePrio = domain->curCycle; + if (domain->prio == 0) domPq.push(domain); + else nextStalledQueue.push_back(domain); + } + if (domPq.size()) break; + } + if (!stalledQueue.size()) std::swap(stalledQueue, nextStalledQueue); + } + } + + //info("Phase done"); + __sync_synchronize(); +} + +void ContentionSim::finish() { + assert(!terminate); + terminate = true; + __sync_synchronize(); +} + diff --git a/src/contention_sim.h b/src/contention_sim.h new file mode 100644 index 00000000..8331c4ca --- /dev/null +++ b/src/contention_sim.h @@ -0,0 +1,169 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef CONTENTION_SIM_H_ +#define CONTENTION_SIM_H_ + +#include +#include +#include +#include "bithacks.h" +#include "event_recorder.h" +#include "g_std/g_vector.h" +#include "galloc.h" +#include "memory_hierarchy.h" +#include "pad.h" +#include "prio_queue.h" +#include "profile_stats.h" +#include "stats.h" + +//Set to 1 to produce stats of how many event crossings are generated and run. Useful for debugging, but adds overhead. 
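+//When enabled, each domain keeps per-source-domain counters of incoming crossing events (ixe) and of
+//crossings simulated but held (ixs), plus a held-count histogram (ixh); see profileCrossing() below
+//and their registration in ContentionSim::initStats().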
+#define PROFILE_CROSSINGS 0 +//#define PROFILE_CROSSINGS 1 + +class TimingEvent; +class DelayEvent; +class CrossingEvent; + +#define PQ_BLOCKS 1024 + +class ContentionSim : public GlobAlloc { + private: + struct CompareEvents : public std::binary_function { + bool operator()(TimingEvent* lhs, TimingEvent* rhs) const; + }; + + struct CrossingEventInfo { + uint64_t cycle; + CrossingEvent* ev; //only valid if the source's curCycle < cycle (otherwise this may be already executed or recycled) + }; + + CrossingEventInfo* lastCrossing; //indexed by [srcId*doms*doms + srcDom*doms + dstDom] + + struct DomainData : public GlobAlloc { + PrioQueue pq; + + PAD(); + + volatile uint64_t curCycle; + lock_t pqLock; //used on phase 1 enqueues + //lock_t domainLock; //used by simulation thread + + uint32_t prio; + uint64_t queuePrio; + + PAD(); + + ClockStat profTime; + +#if PROFILE_CROSSINGS + VectorCounter profIncomingCrossingSims; + VectorCounter profIncomingCrossings; + VectorCounter profIncomingCrossingHist; +#endif + }; + + struct CompareDomains : public std::binary_function { + bool operator()(DomainData* d1, DomainData* d2) const; + }; + + struct SimThreadData { + lock_t wakeLock; //used to sleep/wake up simulation thread + uint32_t firstDomain; + uint32_t supDomain; //supreme, ie first not included + + std::vector > logVec; + }; + + //RO + DomainData* domains; + SimThreadData* simThreads; + + PAD(); + + uint32_t numDomains; + uint32_t numSimThreads; + bool skipContention; + + PAD(); + + //RW + lock_t waitLock; + volatile uint64_t limit; + volatile uint64_t lastLimit; + volatile bool terminate; + + volatile uint32_t threadsDone; + volatile uint32_t threadTicket; //used only at init + + volatile bool inCSim; //true when inside contention simulation + + PAD(); + + //lock_t testLock; + lock_t postMortemLock; + + public: + ContentionSim(uint32_t _numDomains, uint32_t _numSimThreads); + + void initStats(AggregateStat* parentStat); + + void postInit(); //must be called after the simulator is initialized + + void enqueue(TimingEvent* ev, uint64_t cycle); + void enqueueSynced(TimingEvent* ev, uint64_t cycle); + void enqueueCrossing(CrossingEvent* ev, uint64_t cycle, uint32_t srcId, uint32_t srcDomain, uint32_t dstDomain, EventRecorder* evRec); + + void simulatePhase(uint64_t limit); + + void finish(); + + uint64_t getLastLimit() {return lastLimit;} + + uint64_t getCurCycle(uint32_t domain) { + assert(domain < numDomains); + uint64_t c = domains[domain].curCycle; + assert(((int64_t)c) >= 0); + return c; + } + + void setPrio(uint32_t domain, uint32_t prio) {domains[domain].prio = prio;} + +#if PROFILE_CROSSINGS + void profileCrossing(uint32_t srcDomain, uint32_t dstDomain, uint32_t count) { + domains[dstDomain].profIncomingCrossings.inc(srcDomain); + domains[dstDomain].profIncomingCrossingSims.inc(srcDomain, count); + domains[dstDomain].profIncomingCrossingHist.inc(MIN(count, (unsigned)32)); + } +#endif + + private: + void simThreadLoop(uint32_t thid); + void simulatePhaseThread(uint32_t thid); + + static void SimThreadTrampoline(void* arg); +}; + +#endif // CONTENTION_SIM_H_ diff --git a/src/core.h b/src/core.h new file mode 100644 index 00000000..d523c538 --- /dev/null +++ b/src/core.h @@ -0,0 +1,90 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. 
+ * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef CORE_H_ +#define CORE_H_ + +#include +#include "decoder.h" +#include "g_std/g_string.h" +#include "stats.h" + +struct BblInfo { + uint32_t instrs; + uint32_t bytes; + DynBbl oooBbl[0]; //0 bytes, but will be 1-sized when we have an element (and that element has variable size as well) +}; + +/* Analysis function pointer struct + * As an artifact of having a shared code cache, we need these to be the same for different core types. + */ +struct InstrFuncPtrs { // NOLINT(whitespace) + void (*loadPtr)(THREADID, ADDRINT); + void (*storePtr)(THREADID, ADDRINT); + void (*bblPtr)(THREADID, ADDRINT, BblInfo*); + void (*branchPtr)(THREADID, ADDRINT, BOOL, ADDRINT, ADDRINT); + // Same as load/store functions, but last arg indicated whether op is executing + void (*predLoadPtr)(THREADID, ADDRINT, BOOL); + void (*predStorePtr)(THREADID, ADDRINT, BOOL); + uint64_t type; + uint64_t pad[1]; + //NOTE: By having the struct be a power of 2 bytes, indirect calls are simpler (w/ gcc 4.4 -O3, 6->5 instructions, and those instructions are simpler) +}; + + +//TODO: Switch type to an enum by using sizeof macros... +#define FPTR_ANALYSIS (0L) +#define FPTR_JOIN (1L) +#define FPTR_NOP (2L) + +//Generic core class + +class Core : public GlobAlloc { + private: + uint64_t lastUpdateCycles; + uint64_t lastUpdateInstrs; + + protected: + g_string name; + + public: + explicit Core(g_string& _name) : lastUpdateCycles(0), lastUpdateInstrs(0), name(_name) {} + + virtual uint64_t getInstrs() const = 0; // typically used to find out termination conditions or dumps + virtual uint64_t getPhaseCycles() const = 0; // used by RDTSC faking --- we need to know how far along we are in the phase, but not the total number of phases + virtual uint64_t getCycles() const = 0; + + virtual void initStats(AggregateStat* parentStat) = 0; + virtual void contextSwitch(int32_t gid) = 0; //gid == -1 means descheduled, otherwise this is the new gid + + //Called by scheduler on every leave and join action, before barrier methods are called + virtual void leave() {} + virtual void join() {} + + virtual InstrFuncPtrs GetFuncPtrs() = 0; +}; + +#endif // CORE_H_ + diff --git a/src/core_recorder.cpp b/src/core_recorder.cpp new file mode 100644 index 00000000..37ce9450 --- /dev/null +++ b/src/core_recorder.cpp @@ -0,0 +1,267 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. 
+ * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "core_recorder.h" +#include "timing_event.h" +#include "zsim.h" + +#define DEBUG_MSG(args...) +//#define DEBUG_MSG(args...) info(args) + +class TimingCoreEvent : public TimingEvent { + private: + uint64_t origStartCycle; + uint64_t startCycle; + CoreRecorder* cRec; + + public: + //NOTE: Only the first TimingCoreEvent after a thread join needs to be in a domain, hence the default parameter. Because these are inherently sequential and have a fixed delay, subsequent events can inherit the parent's domain, reducing domain xings and improving slack and performance + TimingCoreEvent(uint64_t _delay, uint64_t _origStartCycle, CoreRecorder* _cRec, int32_t domain = -1) : TimingEvent(0, _delay, domain), origStartCycle(_origStartCycle), cRec(_cRec) {} + + void simulate(uint64_t _startCycle) { + startCycle = _startCycle; + cRec->reportEventSimulated(this); + done(startCycle); + } + + friend class CoreRecorder; +}; + +CoreRecorder::CoreRecorder(uint32_t _domain, g_string& _name) + : domain(_domain), name(_name + "-rec") +{ + prevRespEvent = NULL; + state = HALTED; + gapCycles = 0; + eventRecorder.setGapCycles(gapCycles); + + lastUnhaltedCycle = 0; + totalGapCycles = 0; + totalHaltedCycles = 0; +} + + +uint64_t CoreRecorder::notifyJoin(uint64_t curCycle) { + if (state == HALTED) { + assert(!prevRespEvent); + curCycle = zinfo->globPhaseCycles; //start at beginning of the phase + + totalGapCycles += gapCycles; + gapCycles = 0; + eventRecorder.setGapCycles(gapCycles); + assert(lastUnhaltedCycle <= curCycle); + totalHaltedCycles += curCycle - lastUnhaltedCycle; + + prevRespEvent = new (eventRecorder) TimingCoreEvent(0, curCycle, this, domain); + prevRespCycle = curCycle; + prevRespEvent->setMinStartCycle(curCycle); + prevRespEvent->queue(curCycle); + eventRecorder.setStartSlack(0); + DEBUG_MSG("[%s] Joined, was HALTED, curCycle %ld halted %ld", name.c_str(), curCycle, totalHaltedCycles); + } else if (state == DRAINING) { + assert(curCycle >= zinfo->globPhaseCycles); //should not have gone out of sync... 
+ DEBUG_MSG("[%s] Joined, was DRAINING, curCycle %ld", name.c_str(), curCycle); + } else { + panic("[%s] Invalid state %d on join()", name.c_str(), state); + } + + //Common actions + state = RUNNING; + return curCycle; +} + + +void CoreRecorder::notifyLeave(uint64_t curCycle) { + assert(state == RUNNING); + state = DRAINING; + assert(prevRespEvent); + //Taper off the event + // Cover delay to curCycle + uint64_t delay = curCycle - prevRespCycle; + TimingCoreEvent* ev = new (eventRecorder) TimingCoreEvent(delay, prevRespCycle-gapCycles, this); + ev->setMinStartCycle(prevRespCycle); + prevRespEvent->addChild(ev, eventRecorder); + prevRespEvent = ev; + prevRespCycle = curCycle; + + // Then put a zero-delay event that finishes the sequence + ev = new (eventRecorder) TimingCoreEvent(0, prevRespCycle-gapCycles, this); + ev->setMinStartCycle(prevRespCycle); + prevRespEvent->addChild(ev, eventRecorder); + prevRespEvent = ev; + + DEBUG_MSG("[%s] Left, curCycle %ld", name.c_str(), curCycle); +} + +void CoreRecorder::recordAccess(uint64_t startCycle) { + assert(eventRecorder.numRecords() <= 2); + TimingRecord tr = eventRecorder.getRecord(0); + TimingEvent* origPrevResp = prevRespEvent; + + if (tr.type == PUTS || tr.type == PUTX) { + //info("Handling PUT+GET"); + assert(eventRecorder.numRecords() == 2); + TimingRecord tr1 = eventRecorder.getRecord(1); + assert(tr1.type == GETX || tr1.type == GETS); + assert(startCycle >= prevRespCycle); + assert(tr1.reqCycle >= startCycle); + assert(tr.reqCycle >= startCycle); + + uint64_t delay = startCycle - prevRespCycle; + TimingCoreEvent* ev = new (eventRecorder) TimingCoreEvent(delay, prevRespCycle - gapCycles, this); + ev->setMinStartCycle(prevRespCycle); + prevRespEvent->addChild(ev, eventRecorder); + DelayEvent* dr = new (eventRecorder) DelayEvent(tr.reqCycle-startCycle); + DelayEvent* dr1 = new (eventRecorder) DelayEvent(tr1.reqCycle-startCycle); + dr->setMinStartCycle(startCycle); + dr1->setMinStartCycle(startCycle); + ev->addChild(dr, eventRecorder)->addChild(tr.startEvent, eventRecorder); + ev->addChild(dr1, eventRecorder)->addChild(tr1.startEvent, eventRecorder); + + //tr.endEvent not linked to anything + prevRespEvent = tr1.endEvent; + prevRespCycle = tr1.respCycle; + + } else { + //info("Handling single GET"); + assert(tr.type == GETX || tr.type == GETS); + assert(eventRecorder.numRecords() == 1); + uint64_t delay = tr.reqCycle - prevRespCycle; + TimingEvent* ev = new (eventRecorder) TimingCoreEvent(delay, prevRespCycle - gapCycles, this); + ev->setMinStartCycle(prevRespCycle); + prevRespEvent->addChild(ev, eventRecorder)->addChild(tr.startEvent, eventRecorder); + prevRespEvent = tr.endEvent; + prevRespCycle = tr.respCycle; + } + + origPrevResp->produceCrossings(&eventRecorder); + eventRecorder.getCrossingStack().clear(); + eventRecorder.clearRecords(); +} + + +uint64_t CoreRecorder::cSimStart(uint64_t curCycle) { + if (state == HALTED) return curCycle; //nothing to do + + DEBUG_MSG("[%s] Cycle %ld cSimStart %d", name.c_str(), curCycle, state); + + uint64_t nextPhaseCycle = zinfo->globPhaseCycles + zinfo->phaseLength; + + // If needed, bring us to the current cycle + if (state == RUNNING) { + assert(curCycle >= nextPhaseCycle); + + // Cover delay to curCycle + if (prevRespCycle < curCycle) { + uint64_t delay = curCycle - prevRespCycle; + TimingCoreEvent* ev = new (eventRecorder) TimingCoreEvent(delay, prevRespCycle-gapCycles, this); + ev->setMinStartCycle(prevRespCycle); + prevRespEvent->addChild(ev, eventRecorder); + prevRespEvent = ev; + 
prevRespCycle = curCycle; + } + + // Add an event that STARTS in the next phase, so it never gets simulated on the current phase + TimingCoreEvent* ev = new (eventRecorder) TimingCoreEvent(0, prevRespCycle-gapCycles, this); + ev->setMinStartCycle(prevRespCycle); + prevRespEvent->addChild(ev, eventRecorder); + prevRespEvent = ev; + } else if (state == DRAINING) { // add no event --- that's how we detect we're done draining + if (curCycle < nextPhaseCycle) curCycle = nextPhaseCycle; // bring cycle up + } + return curCycle; +} + +uint64_t CoreRecorder::cSimEnd(uint64_t curCycle) { + if (state == HALTED) return curCycle; //nothing to do + + DEBUG_MSG("[%s] Cycle %ld done state %d", name.c_str(), curCycle, state); + + assert(lastEventSimulated); + + // Adjust curCycle to account for contention simulation delay + + // In our current clock, when did the last event start (1) before contention simulation, and (2) after contention simulation + uint64_t lastEvCycle1 = lastEventSimulated->origStartCycle + gapCycles; //we add gapCycles because origStartCycle is in zll clocks + uint64_t lastEvCycle2 = lastEventSimulated->startCycle; + + assert(lastEvCycle1 <= curCycle); + assert_msg(lastEvCycle2 <= curCycle, "[%s] lec2 %ld cc %ld, state %d", name.c_str(), lastEvCycle2, curCycle, state); + if (unlikely(lastEvCycle1 > lastEvCycle2)) panic("[%s] Contention simulation introduced a negative skew, curCycle %ld, lc1 %ld lc2 %ld", name.c_str(), curCycle, lastEvCycle1, lastEvCycle2); + + uint64_t skew = lastEvCycle2 - lastEvCycle1; + + // Skew clock + // Note that by adding to gapCycles, we keep the zll clock (defined as curCycle - gapCycles) constant. + // We use the zll clock to translate origStartCycle correctly, even if it's coming from several phases back. + curCycle += skew; + gapCycles += skew; + prevRespCycle += skew; + eventRecorder.setGapCycles(gapCycles); + + //NOTE: Suppose that we had a really long event, so long that in the next phase, lastEventSimulated is still the same. In this case, skew will be 0, so we do not need to remove it. + + /*DEBUG_MSG*/ //info("[%s] curCycle %ld zllCurCycle %ld lec1 %ld lec2 %ld skew %ld", name.c_str(), curCycle, curCycle-gapCycles, lastEvCycle1, lastEvCycle2, skew); + + /* Advance the recorder: we set the current dead cycle as the last event's cycle, + * but we mark any live events with some slack (we need the slack to account for events + * that linger a bit longer). 
+ */ + //eventRecorder.advance(curCycle + zinfo->phaseLength + 10000 +100000, lastEvCycle2); + eventRecorder.advance(curCycle - gapCycles + zinfo->phaseLength, lastEventSimulated->origStartCycle); + + if (!lastEventSimulated->getNumChildren()) { + //if we were RUNNING, the phase would have been tapered off + assert_msg(state == DRAINING, "[%s] state %d lastEventSimulated %p (startCycle %ld) curCycle %ld", name.c_str(), state, lastEventSimulated, lastEventSimulated->startCycle, curCycle); + assert(prevRespEvent == lastEventSimulated); + lastUnhaltedCycle = lastEventSimulated->startCycle; //the taper is a 0-delay event + assert(lastEventSimulated->getPostDelay() == 0); + state = HALTED; + DEBUG_MSG("[%s] lastEventSimulated reached (startCycle %ld), DRAINING -> HALTED", name.c_str(), lastEventSimulated->startCycle); + + lastEventSimulated = NULL; + prevRespEvent = NULL; + } + return curCycle; +} + +void CoreRecorder::reportEventSimulated(TimingCoreEvent* ev) { + lastEventSimulated = ev; + eventRecorder.setStartSlack(ev->startCycle - ev->origStartCycle); +} + +//Stats +uint64_t CoreRecorder::getUnhaltedCycles(uint64_t curCycle) const { + uint64_t cycle = MAX(curCycle, zinfo->globPhaseCycles); + uint64_t haltedCycles = totalHaltedCycles + ((state == HALTED)? (cycle - lastUnhaltedCycle) : 0); + return cycle - haltedCycles; +} + +uint64_t CoreRecorder::getContentionCycles() const { + return totalGapCycles + gapCycles; +} + + + diff --git a/src/core_recorder.h b/src/core_recorder.h new file mode 100644 index 00000000..7d4f68e2 --- /dev/null +++ b/src/core_recorder.h @@ -0,0 +1,98 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef CORE_RECORDER_H_ +#define CORE_RECORDER_H_ + +#include "event_recorder.h" +#include "g_std/g_string.h" + +class TimingCoreEvent; + +class CoreRecorder { + private: + typedef enum { + HALTED, //Not scheduled, no events left. Initial state. join() --> RUNNING + RUNNING, //Scheduled. leave() --> DRAINING + DRAINING //Not scheduled, but events remain. join() --> RUNNING; all events done --> HALTED + } State; + + State state; + + /* There are 2 clocks: + * - phase 1 clock = curCycle and is maintained by the bound phase contention-free core model + * - phase 2 clock = curCycle - gapCycles is the zll clock + * We maintain gapCycles, and only get curCycle on function calls. Some of those calls also + * need to change curCycle, so they just return an updated version that the bound phase model + * needs to take. 
However, **we have no idea about curCycle outside of those calls**. + * Defend this invariant with your life or you'll find this horrible to reason about. + */ + uint64_t gapCycles; //phase 2 clock == curCycle - gapCycles + + //Event bookkeeping + EventRecorder eventRecorder; + uint64_t prevRespCycle; + TimingEvent* prevRespEvent; + TimingCoreEvent* lastEventSimulated; + + //Cycle accounting + uint64_t totalGapCycles; //does not include gapCycles + uint64_t totalHaltedCycles; //does not include cycles since last transition to HALTED + uint64_t lastUnhaltedCycle; //set on transition to HALTED + + uint32_t domain; + g_string name; + + public: + CoreRecorder(uint32_t _domain, g_string& _name); + + //Methods called in the bound phase + uint64_t notifyJoin(uint64_t curCycle); //returns th updated curCycle, if it needs updating + void notifyLeave(uint64_t curCycle); + + //This better be inlined 100% of the time, it's called on EVERY access + inline void record(uint64_t startCycle) { + if (unlikely(eventRecorder.numRecords())) recordAccess(startCycle); + } + + //Methods called between the bound and weave phases + uint64_t cSimStart(uint64_t curCycle); //returns updated curCycle + uint64_t cSimEnd(uint64_t curCycle); //returns updated curCycle + + //Methods called in the weave phase + inline void reportEventSimulated(TimingCoreEvent* ev); + + //Misc + inline EventRecorder* getEventRecorder() {return &eventRecorder;} + + //Stats (called fully synchronized) + uint64_t getUnhaltedCycles(uint64_t curCycle) const; + uint64_t getContentionCycles() const; + + private: + void recordAccess(uint64_t startCycle); +}; + +#endif // CORE_RECORDER_H_ diff --git a/src/cpuenum.h b/src/cpuenum.h new file mode 100644 index 00000000..818403a5 --- /dev/null +++ b/src/cpuenum.h @@ -0,0 +1,87 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */
+
+#ifndef CPUENUM_H_
+#define CPUENUM_H_
+
+/* Small routines for core enumeration */
+
+#include "process_tree.h"
+#include "zsim.h"
+
+inline uint32_t cpuenumNumCpus(uint32_t pid) {
+    if (zinfo->perProcessCpuEnum) {
+        const g_vector<bool>& mask = zinfo->procArray[pid]->getMask();
+        uint32_t count = 0;
+        for (bool x : mask) count += x;
+        assert(count);
+        return count;
+    } else {
+        return zinfo->numCores;
+    }
+}
+
+inline std::vector<bool> cpuenumMask(uint32_t pid) {
+    std::vector<bool> res;
+    if (zinfo->perProcessCpuEnum) {
+        res.resize(cpuenumNumCpus(pid));
+        for (uint32_t i = 0; i < res.size(); i++) res[i] = true;
+    } else {
+        const g_vector<bool>& mask = zinfo->procArray[pid]->getMask();
+        res.resize(mask.size());
+        for (uint32_t i = 0; i < res.size(); i++) res[i] = mask[i];
+    }
+    return res;
+}
+
+// Returns the cpu that this cid is scheduled on, taking care of per-process cpuenum
+// Can be called when app is fast-forwarding (cid == -1), it will return the first cpu
+// that can run a thread from the specified pid
+inline uint32_t cpuenumCpu(uint32_t pid, uint32_t cid) {
+    if (zinfo->perProcessCpuEnum) {
+        if (cid > zinfo->numCores) return 0;  // not scheduled; with perProcessCpuEnum, first cpu is always 0
+        const g_vector<bool>& mask = zinfo->procArray[pid]->getMask();
+        uint32_t count = 0;
+        for (uint32_t i = 0; i < mask.size(); i++) {
+            if (i == cid) return count;
+            if (mask[i]) count++;
+        }
+        panic("Something went horribly wrong with the process masks... are they dynamic now?");
+        return -1;
+    } else {
+        if (cid > zinfo->numCores) {  // not scheduled
+            const g_vector<bool>& mask = zinfo->procArray[pid]->getMask();
+            for (uint32_t i = 0; i < mask.size(); i++) {
+                if (mask[i]) return i;  // first core that can run this pid
+            }
+            panic("Empty mask for pid %d?", pid);
+            return -1;
+        } else {
+            return cid;
+        }
+    }
+}
+
+#endif  // CPUENUM_H_
diff --git a/src/cpuid.h b/src/cpuid.h
new file mode 100644
index 00000000..f43b3f86
--- /dev/null
+++ b/src/cpuid.h
@@ -0,0 +1,102 @@
+/** $lic$
+ * Copyright (C) 2012-2014 by Massachusetts Institute of Technology
+ * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University
+ *
+ * This file is part of zsim.
+ *
+ * zsim is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free Software
+ * Foundation, version 2.
+ *
+ * If you use this software in your research, we request that you reference
+ * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of
+ * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the
+ * source of the simulator in any publications that use this software, and that
+ * you send us a citation of your work.
+ *
+ * zsim is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CPUID_H_
+#define CPUID_H_
+
+/* CPUID records gathered from other machines, used to virtualize CPUID
+ * CPUID is a bundle of joy. See these to get started:
+ * - http://www.sandpile.org/x86/cpuid.htm
+ * - http://www.intel.com/content/www/us/en/processors/processor-identification-cpuid-instruction-note.html
+ * Try not to cry.
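+ * (The cpuid_core2 table below is sorted by (eaxIn, ecxIn), matching CpuIdRecord::operator<, and
+ *  ends with an {eaxIn = 0xffffffff} sentinel entry.)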
+ */ + +struct CpuIdRecord { + unsigned eaxIn; + unsigned ecxIn; + unsigned eax, ebx, ecx, edx; + + bool operator< (const CpuIdRecord& other) const { + return (eaxIn < other.eaxIn) || ( (eaxIn == other.eaxIn) && (ecxIn < other.ecxIn) ); + } +}; + +// cpuid values produced by getcpuid (dsm) +CpuIdRecord cpuid_core2[] = { // 2-socket Xeon E5335 (2x4 cores, core2 arch, 2.00GHz) + {0000000000, 0000000000, 0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69}, + {0x00000001, 0000000000, 0x000006f7, 0x05040800, 0x0004e33d, 0xbfebfbff}, + {0x00000002, 0000000000, 0x05b0b101, 0x005657f0, 0000000000, 0x2cb43049}, + {0x00000003, 0000000000, 0000000000, 0000000000, 0000000000, 0000000000}, + {0x00000004, 0000000000, 0x0c000121, 0x01c0003f, 0x0000003f, 0x00000001}, + {0x00000004, 0x00000001, 0x0c000122, 0x01c0003f, 0x0000003f, 0x00000001}, + {0x00000004, 0x00000002, 0x0c004143, 0x03c0003f, 0x00000fff, 0x00000001}, + {0x00000004, 0x00000003, 0000000000, 0000000000, 0000000000, 0000000000}, + {0x00000005, 0000000000, 0x00000040, 0x00000040, 0x00000003, 0x00000020}, + {0x00000006, 0000000000, 0x00000001, 0x00000002, 0x00000001, 0000000000}, + {0x00000007, 0000000000, 0000000000, 0000000000, 0000000000, 0000000000}, + {0x00000008, 0000000000, 0x00000400, 0000000000, 0000000000, 0000000000}, + {0x00000009, 0000000000, 0000000000, 0000000000, 0000000000, 0000000000}, + {0x0000000a, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000000, 0000000000, 0x80000008, 0000000000, 0000000000, 0000000000}, + {0x80000001, 0000000000, 0000000000, 0000000000, 0x00000001, 0x20100800}, + {0x80000002, 0000000000, 0x65746e49, 0x2952286c, 0x6f655820, 0x2952286e}, + {0x80000003, 0000000000, 0x55504320, 0x20202020, 0x20202020, 0x45202020}, + {0x80000004, 0000000000, 0x35333335, 0x20402020, 0x30302e32, 0x007a4847}, + {0x80000005, 0000000000, 0000000000, 0000000000, 0000000000, 0000000000}, + {0x80000006, 0000000000, 0000000000, 0000000000, 0x10008040, 0000000000}, + {0x80000007, 0000000000, 0000000000, 0000000000, 0000000000, 0000000000}, + {0x80000008, 0000000000, 0x00003024, 0000000000, 0000000000, 0000000000}, + {0x80000009, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000000a, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000000b, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000000c, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000000d, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000000e, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000000f, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000010, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000011, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000012, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000013, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000014, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000015, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000016, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000017, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000018, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x80000019, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001a, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001b, 0000000000, 0x07280202, 0000000000, 
0000000000, 0000000000}, + {0x8000001c, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001d, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001d, 0x00000001, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001d, 0x00000002, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001d, 0x00000003, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001d, 0x00000004, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001d, 0x00000005, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001d, 0x00000006, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0x8000001e, 0000000000, 0x07280202, 0000000000, 0000000000, 0000000000}, + {0xffffffff, 0} +}; + +#endif // CPUID_H_ diff --git a/src/ddr_mem.cpp b/src/ddr_mem.cpp new file mode 100644 index 00000000..5771ceea --- /dev/null +++ b/src/ddr_mem.cpp @@ -0,0 +1,734 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "ddr_mem.h" +#include +#include +#include +#include "bithacks.h" +#include "config.h" // for Tokenize +#include "contention_sim.h" +#include "event_recorder.h" +#include "timing_event.h" +#include "zsim.h" + +//#define DEBUG(args...) info(args) +#define DEBUG(args...) 
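+
+/* [Editorial sketch, not part of the original zsim sources] A hypothetical example of how a single
+ * DDRMemory channel could be instantiated. The helper name and all parameter values are made up for
+ * illustration; the "DDR3-1333-CL10" tech string comes from initTech() below, and the col/rank/bank
+ * address-mapping format from the constructor below (row always occupies the remaining MSBs).
+ */
+#if 0  // illustration only, never compiled
+static DDRMemory* buildExampleChannel(uint32_t domain) {
+    g_string name("mem-0");
+    return new DDRMemory(64 /*lineSize, bytes*/, 2048 /*colSize*/, 2 /*ranksPerChannel*/,
+            8 /*banksPerRank*/, 2000 /*sysFreqMHz*/, "DDR3-1333-CL10" /*tech*/,
+            "col:rank:bank" /*addrMapping*/, 10 /*controllerSysLatency, sys cycles*/,
+            32 /*queueDepth*/, 4 /*rowHitLimit*/, true /*deferredWrites*/,
+            false /*closedPage*/, domain, name);
+}
+#endif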
+ +// Recorder-allocated event, represents one read or write request +class DDRMemoryAccEvent : public TimingEvent { + private: + DDRMemory* mem; + Address addr; + bool write; + + public: + DDRMemoryAccEvent(DDRMemory* _mem, bool _isWrite, Address _addr, int32_t domain, uint32_t preDelay, uint32_t postDelay) + : TimingEvent(preDelay, postDelay, domain), mem(_mem), addr(_addr), write(_isWrite) {} + + Address getAddr() const {return addr;} + bool isWrite() const {return write;} + + void simulate(uint64_t startCycle) { + mem->enqueue(this, startCycle); + } +}; + +// Globally allocated event that calls us every tREFI cycles +class RefreshEvent : public TimingEvent, public GlobAlloc { + private: + DDRMemory* mem; + uint32_t refInterval; // in sysCycles + + public: + RefreshEvent(DDRMemory* _mem, uint32_t _refInterval, int32_t domain) : + TimingEvent(0, 0, domain), mem(_mem), refInterval(_refInterval) + { + setMinStartCycle(0); + zinfo->contentionSim->enqueueSynced(this, 0); + } + + void parentDone(uint64_t startCycle) { + panic("This is queued directly"); + } + + void simulate(uint64_t startCycle) { + mem->refresh(startCycle); + requeue(startCycle + refInterval); + } + + // Use glob mem + using GlobAlloc::operator new; + using GlobAlloc::operator delete; +}; + +/* Globally allocated event for scheduling + * + * NOTE: This event plus the bit of logic in DDRMemory that deals with event + * management can be generalized to deal with event-driven classes that need to + * be ticked according to varying constraints. + */ +class SchedEvent : public TimingEvent, public GlobAlloc { + private: + DDRMemory* const mem; + enum State { IDLE, QUEUED, RUNNING, ANNULLED }; + State state; + + public: + SchedEvent* next; // for event freelist + + SchedEvent(DDRMemory* _mem, int32_t domain) : TimingEvent(0, 0, domain), mem(_mem) { + setMinStartCycle(0); + setRunning(); + hold(); + state = IDLE; + next = NULL; + } + + void parentDone(uint64_t startCycle) { + panic("This is queued directly"); + } + + void simulate(uint64_t startCycle) { + if (state == QUEUED) { + state = RUNNING; + uint64_t nextCycle = mem->tick(startCycle); + if (nextCycle) { + requeue(nextCycle); + state = QUEUED; + } else { + state = IDLE; + hold(); + mem->recycleEvent(this); + } + } else { + assert(state == ANNULLED); + state = IDLE; + hold(); + mem->recycleEvent(this); + } + } + + void enqueue(uint64_t cycle) { + assert(state == IDLE); + state = QUEUED; + requeue(cycle); + } + + void annul() { + assert_msg(state == QUEUED, "sched state %d", state); + state = ANNULLED; + } + + // Use glob mem + using GlobAlloc::operator new; + using GlobAlloc::operator delete; +}; + + +/* Init & bound phase functionality */ + +DDRMemory::DDRMemory(uint32_t _lineSize, uint32_t _colSize, uint32_t _ranksPerChannel, uint32_t _banksPerRank, + uint32_t _sysFreqMHz, const char* tech, const char* addrMapping, uint32_t _controllerSysLatency, + uint32_t _queueDepth, uint32_t _rowHitLimit, bool _deferredWrites, bool _closedPage, + uint32_t _domain, g_string& _name) + : lineSize(_lineSize), ranksPerChannel(_ranksPerChannel), banksPerRank(_banksPerRank), + controllerSysLatency(_controllerSysLatency), queueDepth(_queueDepth), rowHitLimit(_rowHitLimit), + deferredWrites(_deferredWrites), closedPage(_closedPage), domain(_domain), name(_name) +{ + sysFreqKHz = 1000 * _sysFreqMHz; + initTech(tech); // sets all tXX and memFreqKHz + if (memFreqKHz >= sysFreqKHz/2) { + panic("You may need to tweak the scheduling code, which works with system cycles." 
\ + "With these frequencies, events (which run on system cycles) can't hit us every memory cycle."); + } + + minRdLatency = controllerSysLatency + memToSysCycle(tCL+tBL-1); + minWrLatency = controllerSysLatency; + preDelay = controllerSysLatency; + postDelayRd = minRdLatency - preDelay; + postDelayWr = 0; + + rdQueue.init(queueDepth); + wrQueue.init(queueDepth); + + info("%s: domain %d, %d ranks/ch %d banks/rank, tech %s, boundLat %d rd / %d wr", + name.c_str(), domain, ranksPerChannel, banksPerRank, tech, minRdLatency, minWrLatency); + + minRespCycle = tCL + tBL + 1; // We subtract tCL + tBL from this on some checks; this avoids overflows + + banks.resize(ranksPerChannel); + for (uint32_t i = 0; i < ranksPerChannel; i++) banks[i].resize(banksPerRank); + + rankActWindows.resize(ranksPerChannel); + for (uint32_t i = 0; i < ranksPerChannel; i++) rankActWindows[i].init(4); // we only model FAW; for TAW (other technologies) change this to 2 + + // We get line addresses, and for a 64-byte line, there are _colSize/(JEDEC_BUS_WIDTH/8) lines/page + uint32_t colBits = ilog2(_colSize/(JEDEC_BUS_WIDTH/8)*64/lineSize); + uint32_t bankBits = ilog2(banksPerRank); + uint32_t rankBits = ilog2(ranksPerChannel); + + // Parse config string, has to be some combination of rank, bank, and col separated by semicolons + // (row is always MSB bits, since we don't actually know how many bits it is to begin with...) + std::vector tokens; + Tokenize(addrMapping, tokens, ":"); + if (tokens.size() != 3) panic("Invalid addrMapping %s, need all row/col/rank tokens separated by colons", addrMapping); + std::reverse(tokens.begin(), tokens.end()); // want lowest bits first + + colMask = rankMask = bankMask = 0; + uint32_t startBit = 0; + auto computeShiftAndMask = [&startBit, addrMapping](const std::string& field, const uint32_t fieldBits, uint32_t& shift, uint32_t& mask) { + if (mask) panic("Repeated field %s in addrMapping %s", field.c_str(), addrMapping); + shift = startBit; + mask = (1 << fieldBits) - 1; + startBit += fieldBits; + }; + for (auto t : tokens) { + if (t == "col") computeShiftAndMask(t, colBits, colShift, colMask); + else if (t == "rank") computeShiftAndMask(t, rankBits, rankShift, rankMask); + else if (t == "bank") computeShiftAndMask(t, bankBits, bankShift, bankMask); + else panic("Invalid token %s in addrMapping %s (only row/col/rank)", t.c_str(), addrMapping); + } + rowShift = startBit; // row has no mask + + info("%s: Address mapping %s row %d:%ld col %d:%d rank %d:%d bank %d:%d", + name.c_str(), addrMapping, 63, rowShift, ilog2(colMask << colShift), colShift, + ilog2(rankMask << rankShift), rankShift, ilog2(bankMask << bankShift), bankShift); + + // Weave phase events + new RefreshEvent(this, memToSysCycle(tREFI), domain); + + nextSchedCycle = -1ul; + nextSchedEvent = NULL; + eventFreelist = NULL; +} + +void DDRMemory::initStats(AggregateStat* parentStat) { + AggregateStat* memStats = new AggregateStat(); + memStats->init(name.c_str(), "Memory controller stats"); + profReads.init("rd", "Read requests"); memStats->append(&profReads); + profWrites.init("wr", "Write requests"); memStats->append(&profWrites); + profTotalRdLat.init("rdlat", "Total latency experienced by read requests"); memStats->append(&profTotalRdLat); + profTotalWrLat.init("wrlat", "Total latency experienced by write requests"); memStats->append(&profTotalWrLat); + profReadHits.init("rdhits", "Read row hits"); memStats->append(&profReadHits); + profWriteHits.init("wrhits", "Write row hits"); memStats->append(&profWriteHits); + 
latencyHist.init("mlh", "latency histogram for memory requests", NUMBINS); memStats->append(&latencyHist); + parentStat->append(memStats); +} + +/* Bound phase interface */ + +uint64_t DDRMemory::access(MemReq& req) { + switch (req.type) { + case PUTS: + case PUTX: + *req.state = I; + break; + case GETS: + *req.state = req.is(MemReq::NOEXCL)? S : E; + break; + case GETX: + *req.state = M; + break; + + default: panic("!?"); + } + + if (req.type == PUTS) { + return req.cycle; //must return an absolute value, 0 latency + } else { + bool isWrite = (req.type == PUTX); + uint64_t respCycle = req.cycle + (isWrite? minWrLatency : minRdLatency); + if (zinfo->eventRecorders[req.srcId]) { + DDRMemoryAccEvent* memEv = new (zinfo->eventRecorders[req.srcId]) DDRMemoryAccEvent(this, + isWrite, req.lineAddr, domain, preDelay, isWrite? postDelayWr : postDelayRd); + memEv->setMinStartCycle(req.cycle); + TimingRecord tr = {req.lineAddr, req.cycle, respCycle, req.type, memEv, memEv}; + zinfo->eventRecorders[req.srcId]->pushRecord(tr); + } + //info("Access to %lx at %ld, %ld latency", req.lineAddr, req.cycle, minLatency); + return respCycle; + } +} + +/* Weave phase functionality */ + +//Address mapping: +// For now, row:col:bank:rank:channel for max parallelism (same as scheme7 from DRAMSim) +// NOTE: channel is external (from SplitAddrMem) +// Change or reorder to define your own mappings +DDRMemory::AddrLoc DDRMemory::mapLineAddr(Address lineAddr) { + AddrLoc l; + l.col = (lineAddr >> colShift) & colMask; + l.rank = (lineAddr >> rankShift) & rankMask; + l.bank = (lineAddr >> bankShift) & bankMask; + l.row = lineAddr >> rowShift; + + //info("0x%lx r%ld:c%d b%d:r%d", lineAddr, l.row, l.col, l.bank, l.rank); + assert(l.rank < ranksPerChannel); + assert(l.bank < banksPerRank); + + return l; +} + +void DDRMemory::enqueue(DDRMemoryAccEvent* ev, uint64_t sysCycle) { + uint64_t memCycle = sysToMemCycle(sysCycle); + DEBUG("%ld: enqueue() addr 0x%lx wr %d", memCycle, ev->getAddr(), ev->isWrite()); + + // Create request + Request ovfReq; + bool overflow = rdQueue.full() || wrQueue.full(); + bool useWrQueue = deferredWrites && ev->isWrite(); + Request* req = overflow? &ovfReq : useWrQueue? 
wrQueue.alloc() : rdQueue.alloc(); + + req->addr = ev->getAddr(); + req->loc = mapLineAddr(ev->getAddr()); + req->write = ev->isWrite(); + + req->arrivalCycle = memCycle; + req->startSysCycle = sysCycle; + + req->ev = ev; + ev->hold(); + + if (overflow) { + overflowQueue.push_back(*req); + } else { + queue(req, memCycle); + + // If needed, schedule an event to handle this new request + if (!req->prev /* first in bank */) { + uint64_t minSchedCycle = std::max(memCycle, minRespCycle - tCL - tBL); + if (nextSchedCycle > minSchedCycle) minSchedCycle = std::max(minSchedCycle, findMinCmdCycle(*req)); + if (nextSchedCycle > minSchedCycle) { + if (nextSchedEvent) nextSchedEvent->annul(); + if (eventFreelist) { + nextSchedEvent = eventFreelist; + eventFreelist = eventFreelist->next; + nextSchedEvent->next = NULL; + } else { + nextSchedEvent = new SchedEvent(this, domain); + } + DEBUG("queued %ld", minSchedCycle); + + // Under memFreq < sysFreq/2, sysToMemCycle translates back to the same memCycle + uint64_t enqSysCycle = std::max(matchingMemToSysCycle(minSchedCycle), sysCycle); + nextSchedEvent->enqueue(enqSysCycle); + nextSchedCycle = minSchedCycle; + } + } + } +} + +void DDRMemory::queue(Request* req, uint64_t memCycle) { + // If it's a write, respond to it immediately + if (req->write) { + auto ev = req->ev; + req->ev = NULL; + + ev->release(); + uint64_t respCycle = memToSysCycle(memCycle) + minWrLatency; + ev->done(respCycle - preDelay - postDelayWr); + } + + req->arrivalCycle = memCycle; // if this comes from the overflow queue, update + + // Test: Skip writes +#if 0 + if (req->write) { + assert(wrQueue.size() == 1); + wrQueue.remove(wrQueue.begin()); + return; + } +#endif + + // Alloc in per-bank queue, in FR order + Bank& bank = banks[req->loc.rank][req->loc.bank]; + InList& q = (deferredWrites && req->write)? bank.wrReqs : bank.rdReqs; + + // Print bak queue? Use to verify FR-FCFS +#if 0 + auto printQ = [&](const char* id) { + info("%8ld: %s r%db%d : %s %s", memCycle, name.c_str(), req->loc.rank, req->loc.bank, (deferredWrites && req->write)? "WQ" : "RQ", id); + Request* pr = q.front(); + while (pr) { + info(" 0x%08lx | %ld | %ld", pr->loc.row, pr->rowHitSeq, pr->arrivalCycle); + pr = pr->next; + } + }; + printQ("PRE"); +#endif + + Request* m = q.back(); + while (m) { + if (m->loc.row == req->loc.row) { + if (m->rowHitSeq < rowHitLimit) { + // queue after last same-row access + req->rowHitSeq = m->rowHitSeq + 1; + q.insertAfter(m, req); + } else { + // queue last to get some fairness + req->rowHitSeq = 0; + q.push_back(req); + } + break; + } + m = m->prev; + } + + // No matches... + if (!m) { + if (bank.open && req->loc.row == bank.openRow && bank.curRowHits < rowHitLimit && q.empty()) { + // ... but row is open (& bank queue empty), bypass everyone + /* NOTE: If the bank queue is not empty, don't go before the + * current request. We assume that the request could have issued + * PRE/ACT commands by now, but those are not recorded till + * trySchedule. If you choose to bypass to the front, you should + * check whether the next request would have issued a PRE or ACT by + * now (o/w you have oracular knowledge...). + */ + req->rowHitSeq = bank.curRowHits + 1; + q.push_front(req); + } else { + // ... 
and row is closed or has too many hits, maintain FCFS + req->rowHitSeq = 0; + q.push_back(req); + } + } +#if 0 + printQ("POST"); +#endif +} + +// For external ticks +uint64_t DDRMemory::tick(uint64_t sysCycle) { + uint64_t memCycle = sysToMemCycle(sysCycle); + assert_msg(memCycle == nextSchedCycle, "%ld != %ld", memCycle, nextSchedCycle); + + uint64_t minSchedCycle = trySchedule(memCycle, sysCycle); + assert(minSchedCycle >= memCycle); + if (!rdQueue.full() && !wrQueue.full() && !overflowQueue.empty()) { + Request& ovfReq = overflowQueue.front(); + bool useWrQueue = deferredWrites && ovfReq.write; + Request* req = useWrQueue? wrQueue.alloc() : rdQueue.alloc(); + *req = ovfReq; + overflowQueue.pop_front(); + + queue(req, memCycle); + + // This request may be schedulable before trySchedule's minSchedCycle + if (!req->prev /*first in bank queue*/) { + uint64_t minQueuedSchedCycle = std::max(memCycle, minRespCycle - tCL - tBL); + if (minSchedCycle > minQueuedSchedCycle) minSchedCycle = std::max(minQueuedSchedCycle, findMinCmdCycle(*req)); + if (minSchedCycle > minQueuedSchedCycle) { + DEBUG("Overflowed request lowered minSchedCycle %ld -> %ld (memCycle %ld)", minSchedCycle, minQueuedSchedCycle, memCycle); + minSchedCycle = minQueuedSchedCycle; + } + } + } + + nextSchedCycle = minSchedCycle; + if (nextSchedCycle == -1ul) { + nextSchedEvent = NULL; + return 0; + } else { + // sysToMemCycle translates this back to nextSchedCycle + uint64_t enqSysCycle = std::max(matchingMemToSysCycle(nextSchedCycle), sysCycle); + return enqSysCycle; + } +} + +void DDRMemory::recycleEvent(SchedEvent* ev) { + assert(ev != nextSchedEvent); + assert(ev->next == NULL); + ev->next = eventFreelist; + eventFreelist = ev; +} + +uint64_t DDRMemory::findMinCmdCycle(const Request& r) const { + const Bank& bank = banks[r.loc.rank][r.loc.bank]; + uint64_t minCmdCycle = std::max(r.arrivalCycle, bank.lastCmdCycle + 1); + if (r.loc.row == bank.openRow && bank.open) { + // Row buffer hit + } else { + // Either row closed, or row buffer miss + uint64_t preCycle; + if (!bank.open) { + preCycle = bank.minPreCycle; + } else { + assert(r.loc.row != bank.openRow); + preCycle = std::max(r.arrivalCycle, bank.minPreCycle); + } + uint64_t actCycle = std::max(r.arrivalCycle, std::max(preCycle + tRP, bank.lastActCycle + tRRD)); + actCycle = std::max(actCycle, rankActWindows[r.loc.rank].minActCycle() + tFAW); + minCmdCycle = actCycle + tRCD; + } + return minCmdCycle; +} + +uint64_t DDRMemory::trySchedule(uint64_t curCycle, uint64_t sysCycle) { + /* Implement FR-FCFS scheduling to maximize bus utilization + * + * This model is issue-centric: We queue our events at the appropriate + * COLUMN ACCESS issue time, and compute constraints on when we can + * actually do the column access. This ensures we put the column access at + * the right time. But be careful... you have more information here than + * you'd have in a cycle-by-cycle model, and it's easy to modify this + * algorithm to have oracular characteristics. If you're writing a shiny + * new scheduler algorithm, think about what you know when. + * + * Here, we're not using future knowledge because requests queue in FR-FCFS + * order at *arrival* time, and we obey the appropriate timing constraints. + */ + + if (rdQueue.empty() && wrQueue.empty()) return -1ul; + if (curCycle + tCL < minRespCycle) return minRespCycle - tCL; // too far ahead + + // Writes have priority if the write queue is getting full... 
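+    // (Specifically: drain writes once the write queue is over 3/4 full and, after a write has issued,
+    //  keep draining while it stays above 1/4 full; writes are also chosen whenever the read queue is empty.)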
+ bool prioWrites = (wrQueue.size() > (3*queueDepth/4)) || (lastCmdWasWrite && wrQueue.size() > queueDepth/4); + bool isWriteQueue = rdQueue.empty() || prioWrites; + + RequestQueue& queue = isWriteQueue? wrQueue : rdQueue; + assert(!queue.empty()); + + Request* r = NULL; + RequestQueue::iterator ir = queue.begin(); + uint64_t minSchedCycle = -1ul; + while (ir != queue.end()) { + //Bank& bank = banks[(*ir)->loc.rank][(*ir)->loc.bank]; + //if ((isWriteQueue? bank.wrReqs : bank.rdReqs).front() == *ir) { + if (!(*ir)->prev) { // FASTAH! + uint64_t minCmdCycle = findMinCmdCycle(**ir); + minSchedCycle = std::min(minSchedCycle, minCmdCycle); + if (minCmdCycle <= curCycle) { + r = *ir; + break; + } + //DEBUG("Skipping 0x%lx, not ready %ld", (*ir)->ev->getAddr(), minCmdCycle); + } else { + //DEBUG("Skipping 0x%lx, not first", (*ir)->ev->getAddr()); + } + ir.inc(); + } + + if (!r) { + /* Because we have an event-driven model that uses the same timing + * constraints to schedule a tick, this rarely happens. For example, + * refreshes trigger these. + */ + DEBUG("%ld : First req ready at %ld", curCycle, minSchedCycle); + return minSchedCycle; // no requests are ready to issue yet + } + + DEBUG("%ld : Found ready request 0x%lx %s %ld (%ld / %ld)", curCycle, r->addr, r->write? "W" : "R", r->arrivalCycle, rdQueue.size(), wrQueue.size()); + + Bank& bank = banks[r->loc.rank][r->loc.bank]; + + // Compute the minimum cycle at which the read or write command can be issued, + // without column access or data bus constraints + uint64_t minCmdCycle = std::max(curCycle, minRespCycle - tCL); + if (lastCmdWasWrite && !r->write) minCmdCycle = std::max(minCmdCycle, minRespCycle + tWTR); + bool rowHit = false; + if (r->loc.row == bank.openRow && bank.open) { + // Row buffer hit + rowHit = true; + } else { + // Either row closed, or row buffer miss + uint64_t preCycle; + bool preIssued = bank.open; + if (!bank.open) { + preCycle = bank.minPreCycle; + } else { + assert(r->loc.row != bank.openRow); + preCycle = std::max(r->arrivalCycle, bank.minPreCycle); + } + + uint64_t actCycle = std::max(r->arrivalCycle, std::max(preCycle + tRP, bank.lastActCycle + tRRD)); + actCycle = std::max(actCycle, rankActWindows[r->loc.rank].minActCycle() + tFAW); + + // Record ACT + bank.open = true; + bank.openRow = r->loc.row; + if (preIssued) bank.minPreCycle = preCycle + tRAS; + rankActWindows[r->loc.rank].addActivation(actCycle); + bank.lastActCycle = actCycle; + + minCmdCycle = std::max(minCmdCycle, actCycle + tRCD); + } + + // Figure out data bus constraints, find actual time at which command is issued + uint64_t cmdCycle = std::max(minCmdCycle, minRespCycle - tCL); + minRespCycle = cmdCycle + tCL + tBL; + lastCmdWasWrite = r->write; + + // Record PRE + // if closed-page, close (auto-precharge) if no more row buffer hits + // if open-page, minPreCycle is used for row buffer misses + if (closedPage && !(r->next && r->next->rowHitSeq != 0)) bank.open = false; + bank.minPreCycle = std::max( + bank.minPreCycle, // for mixed read and write commands, minPreCycle may not be monotonic without this + std::max(bank.lastActCycle + tRAS, // RAS constraint + r->write? 
minRespCycle + tWR : cmdCycle + tRTP // read to precharge for reads, write recovery for writes + )); + + // Record RD or WR + assert(bank.lastCmdCycle < cmdCycle); + bank.lastCmdCycle = cmdCycle; + bank.curRowHits = r->rowHitSeq; + + // Issue response + if (r->ev) { + auto ev = r->ev; + assert(!ev->isWrite() && !r->write); // reads only + + uint64_t doneSysCycle = memToSysCycle(minRespCycle) + controllerSysLatency; + assert(doneSysCycle >= sysCycle); + + ev->release(); + ev->done(doneSysCycle - preDelay - postDelayRd); + + uint32_t scDelay = doneSysCycle - r->startSysCycle; + profReads.inc(); + profTotalRdLat.inc(scDelay); + if (rowHit) profReadHits.inc(); + uint32_t bucket = std::min(NUMBINS-1, scDelay/BINSIZE); + latencyHist.inc(bucket, 1); + } else { + uint32_t scDelay = memToSysCycle(minRespCycle) + controllerSysLatency - r->startSysCycle; + profWrites.inc(); + profTotalWrLat.inc(scDelay); + if (rowHit) profWriteHits.inc(); + } + + DEBUG("Served 0x%lx lat %ld clocks", r->addr, minRespCycle-curCycle); + + // Dequeue this req + queue.remove(ir); + (isWriteQueue? bank.wrReqs : bank.rdReqs).pop_front(); + + return (rdQueue.empty() && wrQueue.empty())? -1ul : minRespCycle - tCL; +} + +void DDRMemory::refresh(uint64_t sysCycle) { + uint64_t memCycle = sysToMemCycle(sysCycle); + uint64_t minRefreshCycle = memCycle; + for (auto& rankBanks : banks) { + for (auto& bank : rankBanks) { + minRefreshCycle = std::max(minRefreshCycle, std::max(bank.minPreCycle, bank.lastCmdCycle)); + } + } + assert(minRefreshCycle >= memCycle); + + uint64_t refreshDoneCycle = minRefreshCycle + tRFC; + assert(tRFC >= tRP); + for (auto& rankBanks : banks) { + for (auto& bank : rankBanks) { + // Close and force the ACT to happen at least at tRFC + // PRE <-tRP-> ACT, so discount tRP + bank.minPreCycle = refreshDoneCycle - tRP; + bank.open = false; + } + } + + DEBUG("Refresh %ld start %ld done %ld", memCycle, minRefreshCycle, refreshDoneCycle); +} + + +/* Tech/Device timing parameters */ + +void DDRMemory::initTech(const char* techName) { + std::string tech(techName); + double tCK; + + // tBL's below are for 64-byte lines; we adjust as needed + + // Please keep this orderly; go from faster to slower technologies + if (tech == "DDR3-1333-CL10") { + // from DRAMSim2/ini/DDR3_micron_16M_8B_x4_sg15.ini (Micron) + tCK = 1.5; // ns; all other in mem cycles + tBL = 4; + tCL = 10; + tRCD = 10; + tRTP = 5; + tRP = 10; + tRRD = 4; + tRAS = 24; + tFAW = 20; + tWTR = 5; + tWR = 10; + tRFC = 74; + tREFI = 7800; + } else if (tech == "DDR3-1066-CL7") { + // from DDR3_micron_16M_8B_x4_sg187.ini + // see http://download.micron.com/pdf/datasheets/dram/ddr3/1Gb_DDR3_SDRAM.pdf, cl7 variant, copied from it; tRRD is widely different, others match + tCK = 1.875; + tBL = 4; + tCL = 7; + tRCD = 7; + tRTP = 4; + tRP = 7; + tRRD = 4; + tRAS = 18; + tFAW = 18; + tWTR = 4; + tWR = 7; + tRFC = 59; + tREFI = 7800; + } else if (tech == "DDR3-1066-CL8") { + // from DDR3_micron_16M_8B_x4_sg187.ini + tCK = 1.875; + tBL = 4; + tCL = 8; + tRCD = 8; + tRTP = 4; + tRP = 8; + tRRD = 4; + tRAS = 20; + tFAW = 20; + tWTR = 4; + tWR = 8; + tRFC = 59; + tREFI = 7800; + } else { + panic("Unknown technology %s, you'll need to define it", techName); + } + + // Check all params were set + assert(tCK > 0.0); + assert(tBL && tCL && tRCD && tRTP && tRP && tRRD && tRAS && tFAW && tWTR && tWR && tRFC && tREFI); + + if (isPow2(lineSize) && lineSize >= 64) { + tBL = lineSize*tBL/64; + } else if (lineSize == 32) { + tBL = tBL/2; + } else { + // If we wanted shorter lines, we'd 
have to start really caring about contention in the command bus; + // even 32 bytes is pushing it, 32B probably calls for coalescing buffers + panic("Unsupported line size %d", lineSize); + } + + memFreqKHz = (uint64_t)(1e9/tCK/1e3); +} + diff --git a/src/ddr_mem.h b/src/ddr_mem.h new file mode 100644 index 00000000..55155ac0 --- /dev/null +++ b/src/ddr_mem.h @@ -0,0 +1,292 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef DDR_MEM_H_ +#define DDR_MEM_H_ + +#include + +#include "g_std/g_string.h" +#include "intrusive_list.h" +#include "memory_hierarchy.h" +#include "pad.h" +#include "stats.h" + + +/* Helper data structures */ + +/* Efficiently track the activation window: A circular buffer that stores the + * next allowed cycle we're allowed to issue an activation + */ +class ActWindow { + private: + g_vector buf; + uint32_t idx; + + public: + void init(uint32_t size) { + buf.resize(size); + for (uint32_t i = 0; i < size; i++) buf[i] = 0; + idx = 0; + } + + inline uint64_t minActCycle() const { + return buf[idx]; + } + + inline void addActivation(uint64_t actCycle) { + assert(buf[idx] <= actCycle); // o/w we have violated tTAW/tFAW and more... + + // We need to reorder rank ACT commands, which may happen somewhat out of order + // Typically, acts come in order or nearly in order, so doing this is pretty efficient + // (vs e.g. scanning all last few acts to figure out the minumum constraint) + uint32_t cur = idx; + while (buf[dec(cur)] > actCycle) { + buf[cur] = buf[dec(cur)]; + cur = dec(cur); + if (cur == idx) break; // we're the oldest in the window + } + buf[cur] = actCycle; + + idx = inc(idx); + } + + private: + inline uint32_t inc(uint32_t i) const { return (i < buf.size()-1)? i+1 : 0; } + inline uint32_t dec(uint32_t i) const { return i? 
i-1 : buf.size()-1; } +}; + +// Read or write queues, ordered/inserted by arrival time, out-of-order finish +template +class RequestQueue { + private: + struct Node : InListNode { + T elem; + }; + InList reqList; // FIFO + InList freeList; // LIFO (higher locality) + + public: + void init(size_t size) { + assert(reqList.empty() && freeList.empty()); + Node* buf = gm_calloc(size); + for (uint32_t i = 0; i < size; i++) { + new (&buf[i]) Node(); + freeList.push_back(&buf[i]); + } + } + + inline bool empty() const { return reqList.empty(); } + inline bool full() const { return freeList.empty(); } + inline size_t size() const { return reqList.size(); } + + inline T* alloc() { + assert(!full()); + Node* n = freeList.back(); + freeList.pop_back(); + reqList.push_back(n); + return &n->elem; + } + + struct iterator { + Node* n; + explicit inline iterator(Node* _n) : n(_n) {} + inline void inc() {n = n->next;} // overloading prefix/postfix too messy + inline T* operator*() const { return &(n->elem); } + inline bool operator==(const iterator& it) const { return it.n == n; } + inline bool operator!=(const iterator& it) const { return it.n != n; } + }; + + inline iterator begin() const {return iterator(reqList.front());} + inline iterator end() const {return iterator(NULL);} + + inline void remove(iterator i) { + assert(i.n); + reqList.remove(i.n); + freeList.push_back(i.n); + } +}; + +class DDRMemoryAccEvent; +class SchedEvent; + +// Single-channel controller. For multiple channels, use multiple controllers. +class DDRMemory : public MemObject { + private: + + struct AddrLoc { + uint64_t row; + uint32_t bank; + uint32_t rank; + uint32_t col; + }; + + struct Request : InListNode { + Address addr; + AddrLoc loc; + bool write; + + uint64_t rowHitSeq; // sequence number used to throttle max # row hits + + // Cycle accounting + uint64_t arrivalCycle; // in memCycles + uint64_t startSysCycle; // in sysCycles + + // Corresponding event to send a response to + // Writes get a response immediately, so this is NULL for them + DDRMemoryAccEvent* ev; + }; + + struct Bank { + uint64_t openRow; + bool open; // false indicates a PRE has been issued + + // Timing constraints + uint64_t minPreCycle; // if !open, time of last PRE; if open, min cycle PRE can be issued + uint64_t lastActCycle; // cycle of last ACT command + uint64_t lastCmdCycle; // RD/WR command, used for refreshes only + + uint64_t curRowHits; // row hits on the currently opened row + + InList rdReqs; + InList wrReqs; + }; + + // Global timing constraints + /* We wake up at minSchedCycle, issue one or more requests, and + * reschedule ourselves at the new minSchedCycle if any requests remain + * unserved. 
+ */ + uint64_t minSchedCycle; // TODO: delayed commands still not implemented + // Minimum cycle at which the next response may arrive + // Equivalent to first cycle that the data bus can be used + uint64_t minRespCycle; + bool lastCmdWasWrite; + + static const uint32_t JEDEC_BUS_WIDTH = 64; + const uint32_t lineSize, ranksPerChannel, banksPerRank; + const uint32_t controllerSysLatency; // in sysCycles + const uint32_t queueDepth; + const uint32_t rowHitLimit; // row hits not prioritized in FR-FCFS beyond this point + const bool deferredWrites; + const bool closedPage; + const uint32_t domain; + + // DRAM timing parameters -- initialized in initTech() + // All parameters are in memory clocks (multiples of tCK) + uint32_t tBL; // burst length (== tTrans) + uint32_t tCL; // CAS latency + uint32_t tRCD; // ACT to CAS + uint32_t tRTP; // RD to PRE + uint32_t tRP; // PRE to ACT + uint32_t tRRD; // ACT to ACT + uint32_t tRAS; // ACT to PRE + uint32_t tFAW; // No more than 4 ACTs per rank in this window + uint32_t tWTR; // end of WR burst to RD command + uint32_t tWR; // end of WR burst to PRE + uint32_t tRFC; // Refresh to ACT (refresh leaves rows closed) + uint32_t tREFI; // Refresh interval + + // Address mapping information + uint32_t colShift, colMask; + uint32_t rankShift, rankMask; + uint32_t bankShift, bankMask; + uint64_t rowShift; // row's always top + + uint32_t minRdLatency; + uint32_t minWrLatency; + uint32_t preDelay, postDelayRd, postDelayWr; + + RequestQueue rdQueue, wrQueue; + std::deque overflowQueue; + + g_vector< g_vector > banks; // indexed by rank, bank + g_vector rankActWindows; + + // Event scheduling + SchedEvent* nextSchedEvent; + uint64_t nextSchedCycle; + SchedEvent* eventFreelist; + + const g_string name; + + // R/W stats + PAD(); + Counter profReads, profWrites; + Counter profTotalRdLat, profTotalWrLat; + Counter profReadHits, profWriteHits; // row buffer hits + VectorCounter latencyHist; + static const uint32_t BINSIZE = 10, NUMBINS = 100; + PAD(); + + //In KHz, though it does not matter so long as they are consistent and fine-grain enough (not Hz because we multiply + //uint64_t cycles by this; as it is, KHzs are 20 bits, so we can simulate ~40+ bits (a few trillion system cycles, around an hour)) + uint64_t sysFreqKHz, memFreqKHz; + + // sys<->mem cycle xlat functions. 
We get and must return system cycles, but all internal logic is in memory cycles + // will do the right thing so long as you multiply first + inline uint64_t sysToMemCycle(uint64_t sysCycle) { return sysCycle*memFreqKHz/sysFreqKHz+1; } + inline uint64_t memToSysCycle(uint64_t memCycle) { return (memCycle+1)*sysFreqKHz/memFreqKHz; } + + // Produces a sysCycle that, when translated back using sysToMemCycle, will produce the same memCycle + // Requires memFreq < sysFreq/2 + inline uint64_t matchingMemToSysCycle(uint64_t memCycle) { + // The -sysFreqKHz/memFreqKHz/2 cancels the +1 in sysToMemCycle in integer arithmetic --- you can prove this with inequalities + return (2*memCycle-1)*sysFreqKHz/memFreqKHz/2; + } + + public: + DDRMemory(uint32_t _lineSize, uint32_t _colSize, uint32_t _ranksPerChannel, uint32_t _banksPerRank, + uint32_t _sysFreqMHz, const char* tech, const char* addrMapping, uint32_t _controllerSysLatency, + uint32_t _queueDepth, uint32_t _rowHitLimit, bool _deferredWrites, bool _closedPage, + uint32_t _domain, g_string& _name); + + void initStats(AggregateStat* parentStat); + const char* getName() {return name.c_str();} + + // Bound phase interface + uint64_t access(MemReq& req); + + // Weave phase interface + void enqueue(DDRMemoryAccEvent* ev, uint64_t cycle); + void refresh(uint64_t sysCycle); + + // Scheduling event interface + uint64_t tick(uint64_t sysCycle); + void recycleEvent(SchedEvent* ev); + + private: + AddrLoc mapLineAddr(Address lineAddr); + + void queue(Request* req, uint64_t memCycle); + + inline uint64_t trySchedule(uint64_t curCycle, uint64_t sysCycle); + uint64_t findMinCmdCycle(const Request& r) const; + + void initTech(const char* tech); +}; + + +#endif // DDR_MEM_H_ diff --git a/src/debug.h b/src/debug.h new file mode 100644 index 00000000..a7778342 --- /dev/null +++ b/src/debug.h @@ -0,0 +1,38 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef DEBUG_H_ +#define DEBUG_H_ + +//This header has common debugging datastructure defs. + +/* Describes the addresses at which libzsim.so is loaded. GDB needs this. 
*/ +struct LibInfo { + void* textAddr; + void* bssAddr; + void* dataAddr; +}; + +#endif // DEBUG_H_ diff --git a/src/debug_harness.cpp b/src/debug_harness.cpp new file mode 100644 index 00000000..bb69fe09 --- /dev/null +++ b/src/debug_harness.cpp @@ -0,0 +1,62 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "debug_harness.h" +#include +#include +#include +#include +#include "log.h" +#include "str.h" + +//For funky macro stuff +#define QUOTED_(x) #x +#define QUOTED(x) QUOTED_(x) + +/* This file is pretty much self-contained, and has minimal external dependencies. + * Please keep it this way, and ESPECIALLY don't include Pin headers since there + * seem to be conflicts between those and some system headers. + */ + +int launchXtermDebugger(int targetPid, LibInfo* libzsimAddrs) { + int childPid = fork(); + if (childPid == 0) { + std::string targetPidStr = Str(targetPid); + char symbolCmdStr[2048]; + snprintf(symbolCmdStr, sizeof(symbolCmdStr), "add-symbol-file %s %p -s .data %p -s .bss %p", QUOTED(ZSIM_PATH), libzsimAddrs->textAddr, libzsimAddrs->dataAddr, libzsimAddrs->bssAddr); + + const char* const args[] = {"xterm", "-e", "gdb", "-p", targetPidStr.c_str(), + "-ex", "set confirm off", //we know what we're doing in the following 2 commands + "-ex", symbolCmdStr, + "-ex", "handle SIGTRAP nostop noprint", // For some reason we receive a lot of spurious sigtraps + "-ex", "set confirm on", //reenable confirmations + "-ex", "c", //start running + NULL}; + execvp(args[0], (char* const*)args); + panic("shouldn't reach this..."); + } else { + return childPid; + } +} diff --git a/src/debug_harness.h b/src/debug_harness.h new file mode 100644 index 00000000..dc1f5ef8 --- /dev/null +++ b/src/debug_harness.h @@ -0,0 +1,38 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef DEBUG_HARNESS_H_ +#define DEBUG_HARNESS_H_ + +#include "debug.h" + +/* Launch gdb automatically in a separate xterm window to debug the current process. + * I'm doing this because I'm sick to death of debugging manually (wait 20 secs, attach + * to PID, copy the libzsim.so symbol file command, etc etc). + * Returns PID of children. Must be called from harness, since we can't fork from a pintool. + */ +int launchXtermDebugger(int targetPid, LibInfo* libzsimAddrs); + +#endif // DEBUG_HARNESS_H_ diff --git a/src/debug_zsim.cpp b/src/debug_zsim.cpp new file mode 100644 index 00000000..ff049474 --- /dev/null +++ b/src/debug_zsim.cpp @@ -0,0 +1,98 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "debug_zsim.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "log.h" + +/* This file is pretty much self-contained, and has minimal external dependencies. + * Please keep it this way, and ESPECIALLY don't include Pin headers since there + * seem to be conflicts between those and some system headers. 
+ */ + +static int pp_callback(dl_phdr_info* info, size_t size, void* data) { + if (strstr(info->dlpi_name, "libzsim.so")) { + int fd; + Elf* e; + Elf_Scn* scn; + if ((fd = open (info->dlpi_name, O_RDONLY , 0)) < 0) + panic("Opening %s failed", info->dlpi_name); + elf_version(EV_CURRENT); + if ((e = elf_begin(fd, ELF_C_READ, NULL)) == NULL) + panic("elf_begin() failed"); + size_t shstrndx; //we need this to get the section names + if (elf_getshdrstrndx(e, &shstrndx) != 0) + panic("elf_getshdrstrndx() failed"); + + LibInfo* offsets = static_cast(data); + offsets->textAddr = NULL; + offsets->dataAddr = NULL; + offsets->bssAddr = NULL; + + scn = NULL; + while ((scn = elf_nextscn(e, scn)) != NULL) { + GElf_Shdr shdr; + if (gelf_getshdr(scn, &shdr) != &shdr) + panic("gelf_getshdr() failed"); + char* name = elf_strptr(e, shstrndx , shdr.sh_name); + //info("Section %s %lx %lx", name, shdr.sh_addr, shdr.sh_offset); + //info("Section %s %lx %lx\n", name, info->dlpi_addr + shdr.sh_addr, info->dlpi_addr + shdr.sh_offset); + void* sectionAddr = reinterpret_cast(info->dlpi_addr + shdr.sh_addr); + if (strcmp(".text", name) == 0) { + offsets->textAddr = sectionAddr; + } else if (strcmp(".data", name) == 0) { + offsets->dataAddr = sectionAddr; + } else if (strcmp(".bss", name) == 0) { + offsets->bssAddr = sectionAddr; + } + } + elf_end(e); + close(fd); + + //Check that we got all the section addresses; it'd be extremely weird if we didn't + assert(offsets->textAddr && offsets->dataAddr && offsets->bssAddr); + + return 1; //stops iterating + } + return 0; //continues iterating +} + +void getLibzsimAddrs(LibInfo* libzsimAddrs) { + int ret = dl_iterate_phdr(pp_callback, libzsimAddrs); + if (ret != 1) panic("libzsim.so not found"); +} + + +void notifyHarnessForDebugger(int harnessPid) { + kill(harnessPid, SIGUSR1); + sleep(1); //this is a bit of a hack, but ensures the debugger catches us +} diff --git a/src/debug_zsim.h b/src/debug_zsim.h new file mode 100644 index 00000000..00f8b1da --- /dev/null +++ b/src/debug_zsim.h @@ -0,0 +1,41 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef DEBUG_ZSIM_H_ +#define DEBUG_ZSIM_H_ + +#include "debug.h" + +/* Gather libzsim addresses and initialize a libinfo structure. + * This is needed to essentially replicate the line that PIN prints when + * called with pause_tool. It uses libelf, but PIN is linked to it already + * (I bet that PIN does pretty much the same thing). 
+ */ +void getLibzsimAddrs(LibInfo* libzsimAddrs); + +/* Signal the harness process that we're ready to be debugged */ +void notifyHarnessForDebugger(int harnessPid); + +#endif // DEBUG_ZSIM_H_ diff --git a/src/decoder.cpp b/src/decoder.cpp new file mode 100644 index 00000000..663e9f12 --- /dev/null +++ b/src/decoder.cpp @@ -0,0 +1,1487 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "decoder.h" +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "locks.h" +#include "log.h" + +extern "C" { +#include "xed-interface.h" +} + +//XED expansion macros (enable us to type opcodes at a reasonable speed) +#define XC(cat) (XED_CATEGORY_##cat) +#define XO(opcode) (XED_ICLASS_##opcode) + +//PORT defines. You might want to change these to affect scheduling +#define PORT_0 (0x1) +#define PORT_1 (0x2) +#define PORT_2 (0x4) +#define PORT_3 (0x8) +#define PORT_4 (0x10) +#define PORT_5 (0x20) + +#define PORTS_015 (PORT_0 | PORT_1 | PORT_5) + +void DynUop::clear() { + memset(this, 0, sizeof(DynUop)); // NOTE: This may break if DynUop becomes non-POD +} + +Decoder::Instr::Instr(INS _ins) : ins(_ins), numLoads(0), numInRegs(0), numOutRegs(0), numStores(0) { + uint32_t numOperands = INS_OperandCount(ins); + for (uint32_t op = 0; op < numOperands; op++) { + bool read = INS_OperandRead(ins, op); + bool write = INS_OperandWritten(ins, op); + assert(read || write); + if (INS_OperandIsMemory(ins, op)) { + if (read) loadOps[numLoads++] = op; + if (write) storeOps[numStores++] = op; + } else if (INS_OperandIsReg(ins, op) && INS_OperandReg(ins, op)) { //it's apparently possible to get INS_OperandIsReg to be true and an invalid reg ... WTF Pin? + REG reg = INS_OperandReg(ins, op); + assert(reg); // can't be invalid + reg = REG_FullRegName(reg); // eax -> rax, etc; o/w we'd miss a bunch of deps! + if (read) inRegs[numInRegs++] = reg; + if (write) outRegs[numOutRegs++] = reg; + } + } + + //By convention, we move flags regs to the end + reorderRegs(inRegs, numInRegs); + reorderRegs(outRegs, numOutRegs); +} + +static inline bool isFlagsReg(uint32_t reg) { + return (reg == REG_EFLAGS || reg == REG_FLAGS || reg == REG_MXCSR); +} + +void Decoder::Instr::reorderRegs(uint32_t* array, uint32_t regs) { + if (regs == 0) return; + //Unoptimized bubblesort -- when arrays are this short, regularity wins over O(n^2). 
+ uint32_t swaps; + do { + swaps = 0; + for (uint32_t i = 0; i < regs-1; i++) { + if (isFlagsReg(array[i]) && !isFlagsReg(array[i+1])) { + std::swap(array[i], array[i+1]); + swaps++; + } + } + } while (swaps > 0); +} + +//Helper function +static std::string regsToString(uint32_t* regs, uint32_t numRegs) { + std::string str = ""; //if efficiency was a concern, we'd use a stringstream + if (numRegs) { + str += "("; + for (uint32_t i = 0; i < numRegs - 1; i++) { + str += REG_StringShort((REG)regs[i]) + ", "; + } + str += REG_StringShort((REG)regs[numRegs - 1]) + ")"; + } + return str; +} + +void Decoder::reportUnhandledCase(Instr& instr, const char* desc) { + warn("Unhandled case: %s | %s | loads=%d stores=%d inRegs=%d %s outRegs=%d %s", desc, INS_Disassemble(instr.ins).c_str(), + instr.numLoads, instr.numStores, instr.numInRegs, regsToString(instr.inRegs, instr.numInRegs).c_str(), + instr.numOutRegs, regsToString(instr.outRegs, instr.numOutRegs).c_str()); +} + +void Decoder::emitLoad(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t destReg) { + assert(idx < instr.numLoads); + uint32_t op = instr.loadOps[idx]; + uint32_t baseReg = INS_OperandMemoryBaseReg(instr.ins, op); + uint32_t indexReg = INS_OperandMemoryIndexReg(instr.ins, op); + + if (destReg == 0) destReg = REG_LOAD_TEMP + idx; + + DynUop uop; + uop.clear(); + uop.rs[0] = baseReg; + uop.rs[1] = indexReg; + uop.rd[0] = destReg; + uop.type = UOP_LOAD; + uop.portMask = PORT_2; + uops.push_back(uop); //FIXME: The interface should support in-place grow... +} + +void Decoder::emitStore(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t srcReg) { + assert(idx < instr.numStores); + uint32_t op = instr.storeOps[idx]; + uint32_t baseReg = INS_OperandMemoryBaseReg(instr.ins, op); + uint32_t indexReg = INS_OperandMemoryIndexReg(instr.ins, op); + + if (srcReg == 0) srcReg = REG_STORE_TEMP + idx; + + uint32_t addrReg; + + //Emit store address uop + //NOTE: Although technically one uop would suffice with <=1 address register, + //stores always generate 2 uops. The store address uop is especially important, + //as in Nehalem loads don't issue after all prior store addresses have been resolved. 
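+    // The store-address uop below issues on port 3; the store-data uop reads addrReg, so it depends on it and issues on port 4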
+ addrReg = REG_STORE_ADDR_TEMP + idx; + + DynUop addrUop; + addrUop.clear(); + addrUop.rs[0] = baseReg; + addrUop.rs[1] = indexReg; + addrUop.rd[0] = addrReg; + addrUop.lat = 1; + addrUop.portMask = PORT_3; + addrUop.type = UOP_STORE_ADDR; + uops.push_back(addrUop); + + //Emit store uop + DynUop uop; + uop.clear(); + uop.rs[0] = addrReg; + uop.rs[1] = srcReg; + uop.portMask = PORT_4; + uop.type = UOP_STORE; + uops.push_back(uop); +} + + +void Decoder::emitLoads(Instr& instr, DynUopVec& uops) { + for (uint32_t i = 0; i < instr.numLoads; i++) { + emitLoad(instr, i, uops); + } +} + +void Decoder::emitStores(Instr& instr, DynUopVec& uops) { + for (uint32_t i = 0; i < instr.numStores; i++) { + emitStore(instr, i, uops); + } +} + +void Decoder::emitFence(DynUopVec& uops, uint32_t lat) { + DynUop uop; + uop.clear(); + uop.lat = lat; + uop.portMask = PORT_4; //to the store queue + uop.type = UOP_FENCE; + uops.push_back(uop); +} + +void Decoder::emitExecUop(uint32_t rs0, uint32_t rs1, uint32_t rd0, uint32_t rd1, DynUopVec& uops, uint32_t lat, uint8_t ports, uint8_t extraSlots) { + DynUop uop; + uop.clear(); + uop.rs[0] = rs0; + uop.rs[1] = rs1; + uop.rd[0] = rd0; + uop.rd[1] = rd1; + uop.lat = lat; + uop.type = UOP_GENERAL; + uop.portMask = ports; + uop.extraSlots = extraSlots; + uops.push_back(uop); +} + +void Decoder::emitBasicMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports) { + if (instr.numLoads + instr.numInRegs > 1 || instr.numStores + instr.numOutRegs != 1) { + reportUnhandledCase(instr, "emitBasicMove"); + } + //Note that we can have 0 loads and 0 input registers. In this case, we are loading from an immediate, and we set the input register to 0 so there is no dependence + uint32_t inReg = (instr.numInRegs == 1)? instr.inRegs[0] : 0; + if (!instr.numLoads && !instr.numStores) { //reg->reg + emitExecUop(inReg, 0, instr.outRegs[0], 0, uops, lat, ports); + } else if (instr.numLoads && !instr.numStores) { //mem->reg + emitLoad(instr, 0, uops, instr.outRegs[0]); + } else if (!instr.numLoads && instr.numStores) { //reg->mem + emitStore(instr, 0, uops, inReg); + } else { //mem->mem + emitLoad(instr, 0, uops); + emitStore(instr, 0, uops, REG_LOAD_TEMP /*chain with load*/); + } +} + +void Decoder::emitXchg(Instr& instr, DynUopVec& uops) { + if (instr.numLoads) { // mem <-> reg + assert(instr.numLoads == 1 && instr.numStores == 1); + assert(instr.numInRegs == 1 && instr.numOutRegs == 1); + assert(instr.inRegs[0] == instr.outRegs[0]); + + emitLoad(instr, 0, uops); + emitExecUop(instr.inRegs[0], 0, REG_EXEC_TEMP, 0, uops, 1, PORTS_015); //r -> temp + emitExecUop(REG_LOAD_TEMP, 0, instr.outRegs[0], 0, uops, 1, PORTS_015); // load -> r + emitStore(instr, 0, uops, REG_EXEC_TEMP); //temp -> out + if (!INS_LockPrefix(instr.ins)) emitFence(uops, 14); //xchg has an implicit lock prefix (TODO: Check we don't introduce two fences...) 
+ } else { // reg <-> reg + assert(instr.numInRegs == 2 && instr.numOutRegs == 2); + assert(instr.inRegs[0] == instr.outRegs[0]); + assert(instr.inRegs[1] == instr.outRegs[1]); + + emitExecUop(instr.inRegs[0], 0, REG_EXEC_TEMP, 0, uops, 1, PORTS_015); + emitExecUop(instr.inRegs[1], 0, instr.outRegs[0], 0, uops, 1, PORTS_015); + emitExecUop(REG_EXEC_TEMP, 0, instr.outRegs[1], 0, uops, 1, PORTS_015); + } +} + + +void Decoder::emitConditionalMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports) { + uint32_t initialUops = uops.size(); + assert(instr.numOutRegs == 1); //always move to reg + assert(instr.numStores == 0); + + if (instr.numLoads) { + assert(instr.numLoads == 1); + assert(instr.numInRegs == 1); + uint32_t flagsReg = instr.inRegs[0]; + emitExecUop(flagsReg, 0, REG_EXEC_TEMP, 0, uops, lat, ports); + emitLoad(instr, 0, uops); + uint32_t numUops = uops.size(); + assert(numUops - initialUops == 2); + //We need to make the load depend on the result. This is quite crude, but works: + uops[numUops - 2].rs[1] = uops[numUops - 1].rs[1]; //comparison uop gets source of load (possibly 0) + uops[numUops - 1].rs[1] = REG_EXEC_TEMP; //load uop is made to depend on comparison uop + //TODO: Make this follow codepath below + load + } else { + assert(instr.numInRegs == 2); + assert(instr.numOutRegs == 1); + uint32_t flagsReg = instr.inRegs[1]; + //Since this happens in 2 instructions, we'll assume we need to read the output register + emitExecUop(flagsReg, instr.inRegs[0], REG_EXEC_TEMP, 0, uops, 1, ports); + emitExecUop(instr.outRegs[0], REG_EXEC_TEMP, instr.outRegs[0], 0, uops, lat, ports); + } +} + +void Decoder::emitCompareAndExchange(Instr& instr, DynUopVec& uops) { + emitLoads(instr, uops); + + uint32_t srcs = instr.numLoads + instr.numInRegs; + uint32_t dsts = instr.numStores + instr.numOutRegs; + + uint32_t srcRegs[srcs + 2]; + uint32_t dstRegs[dsts + 2]; + populateRegArrays(instr, srcRegs, dstRegs); + + assert(srcs == 3); + assert(dsts == 3); + + //reportUnhandledCase(instr, "XXXX"); + //info("%d %d %d | %d %d %d", srcRegs[0], srcRegs[1], srcRegs[2], dstRegs[0], dstRegs[1], dstRegs[2]); + + uint32_t rflags = dstRegs[2]; + uint32_t rax = dstRegs[1]; //note: can be EAX, etc + assert(srcRegs[2] == rax); //if this fails, pin has changed the register orderings... + + //Compare destination (first operand) w/ RAX. If equal, copy source (second operand) into destination and set the zero flag; o/w copy destination into RAX + if (!instr.numLoads) { + //2 swaps, implemented in 2 stages: first, and all sources with rflags.zf; then or results pairwise. This is pure speculation, but matches uops required. 
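+        // (i.e., first AND each source with the flags result of the compare, then OR the partial results pairwise, as the uops below do)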
+ emitExecUop(srcRegs[0], rax, REG_EXEC_TEMP, rflags, uops, 1, PORTS_015); //includes compare + emitExecUop(srcRegs[1], rflags, REG_EXEC_TEMP+1, 0, uops, 2, PORTS_015); + emitExecUop(srcRegs[2], rflags, REG_EXEC_TEMP+2, 0, uops, 2, PORTS_015); + + emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[0], 0, uops, 2, PORTS_015); + emitExecUop(REG_EXEC_TEMP+1, REG_EXEC_TEMP+2, dstRegs[1] /*rax*/, 0, uops, 2, PORTS_015); + } else { + //6 uops (so 3 exec), and critical path is 4 (for rax), GO FIGURE + emitExecUop(srcRegs[0], rax, REG_EXEC_TEMP, rflags, uops, 2, PORTS_015); + emitExecUop(srcRegs[1], rflags, dstRegs[0], 0, uops, 2, PORTS_015); //let's assume we can do a fancy conditional store + emitExecUop(srcRegs[2], REG_EXEC_TEMP, dstRegs[1] /*rax*/, 0, uops, 2, PORTS_015); //likewise + } + + //NOTE: While conceptually srcRegs[0] == dstRegs[0], when it's a memory location they map to different temporary regs + + emitStores(instr, uops); +} + + + +void Decoder::populateRegArrays(Instr& instr, uint32_t* srcRegs, uint32_t* dstRegs) { + uint32_t curSource = 0; + for (uint32_t i = 0; i < instr.numLoads; i++) { + srcRegs[curSource++] = REG_LOAD_TEMP + i; + } + for (uint32_t i = 0; i < instr.numInRegs; i++) { + srcRegs[curSource++] = instr.inRegs[i]; + } + srcRegs[curSource++] = 0; + srcRegs[curSource++] = 0; + + uint32_t curDest = 0; + for (uint32_t i = 0; i < instr.numStores; i++) { + dstRegs[curDest++] = REG_STORE_TEMP + i; + } + for (uint32_t i = 0; i < instr.numOutRegs; i++) { + dstRegs[curDest++] = instr.outRegs[i]; + } + dstRegs[curDest++] = 0; + dstRegs[curDest++] = 0; +} + +void Decoder::emitBasicOp(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports, uint8_t extraSlots, bool reportUnhandled) { + emitLoads(instr, uops); + + uint32_t srcs = instr.numLoads + instr.numInRegs; + uint32_t dsts = instr.numStores + instr.numOutRegs; + + uint32_t srcRegs[srcs + 2]; + uint32_t dstRegs[dsts + 2]; + populateRegArrays(instr, srcRegs, dstRegs); + + if (reportUnhandled && (srcs > 2 || dsts > 2)) reportUnhandledCase(instr, "emitBasicOp"); //We're going to be ignoring some dependencies + + emitExecUop(srcRegs[0], srcRegs[1], dstRegs[0], dstRegs[1], uops, lat, ports, extraSlots); + + emitStores(instr, uops); +} + +void Decoder::emitChainedOp(Instr& instr, DynUopVec& uops, uint32_t numUops, uint32_t* latArray, uint8_t* portsArray) { + emitLoads(instr, uops); + + uint32_t srcs = instr.numLoads + instr.numInRegs; + uint32_t dsts = instr.numStores + instr.numOutRegs; + + uint32_t srcRegs[srcs + 2]; + uint32_t dstRegs[dsts + 2]; + populateRegArrays(instr, srcRegs, dstRegs); + + assert(numUops > 1); + //if (srcs != numUops + 1) reportUnhandledCase(instr, "emitChainedOps"); + assert(srcs + 2 >= numUops + 1); // note equality is not necessary in case one or more operands are immediates + + emitExecUop(srcRegs[0], srcRegs[1], REG_EXEC_TEMP, 0, uops, latArray[0], portsArray[0]); + for (uint32_t i = 1; i < numUops-1; i++) { + emitExecUop(REG_EXEC_TEMP, srcRegs[i+1], REG_EXEC_TEMP, 0, uops, latArray[i], portsArray[i]); + } + emitExecUop(REG_EXEC_TEMP, srcRegs[numUops-1], dstRegs[0], dstRegs[1], uops, latArray[numUops-1], portsArray[numUops-1]); + + emitStores(instr, uops); +} + +//Some convert ops are implemented in 2 uops, even though they could just use one given src/dst reg constraints +void Decoder::emitConvert2Op(Instr& instr, DynUopVec& uops, uint32_t lat1, uint32_t lat2, uint8_t ports1, uint8_t ports2) { + if (instr.numStores > 0 || instr.numLoads > 1 || instr.numOutRegs != 1 || instr.numLoads + 
instr.numInRegs != 1) { + reportUnhandledCase(instr, "convert"); + } else { + //May have single load, has single output + uint32_t src; + if (instr.numLoads) { + emitLoads(instr, uops); + src = REG_LOAD_TEMP; + } else { + src = instr.inRegs[0]; + } + uint32_t dst = instr.outRegs[0]; + emitExecUop(src, 0, REG_EXEC_TEMP, 0, uops, lat1, ports1); + emitExecUop(REG_EXEC_TEMP, 0, dst, 0, uops, lat2, ports2); + } +} + + +void Decoder::emitMul(Instr& instr, DynUopVec& uops) { + uint32_t dsts = instr.numStores + instr.numOutRegs; + if (dsts == 3) { + emitLoads(instr, uops); + + uint32_t srcs = instr.numLoads + instr.numInRegs; + + uint32_t srcRegs[srcs + 2]; + uint32_t dstRegs[dsts + 2]; + populateRegArrays(instr, srcRegs, dstRegs); + + assert(srcs <= 2); + + emitExecUop(srcRegs[0], srcRegs[1], dstRegs[0], REG_EXEC_TEMP, uops, 3, PORT_1); + emitExecUop(srcRegs[0], srcRegs[1], dstRegs[1], REG_EXEC_TEMP+1, uops, 3, PORT_1); + emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[2], 0, uops, 1, PORTS_015); + + emitStores(instr, uops); + } else { + emitBasicOp(instr, uops, 3, PORT_1); + } +} + +void Decoder::emitDiv(Instr& instr, DynUopVec& uops) { + uint32_t srcs = instr.numLoads + instr.numInRegs; + uint32_t dsts = instr.numStores + instr.numOutRegs; + + /* div and idiv are microsequenced, with a variable number of uops on all ports, and have fixed + * input and output regs (rdx:rax is the input, rax is the quotient and rdx is the remainder). + * Also, the number of uops and latency depends on the data. We approximate this with a 4-uop + * sequence that sorta kinda emulates the typical latency. + */ + + uint32_t srcRegs[srcs + 2]; + uint32_t dstRegs[dsts + 2]; + populateRegArrays(instr, srcRegs, dstRegs); + + //assert(srcs == 3); //there is a variant of div that uses only 2 regs --> see below + //assert(dsts == 3); + assert(instr.numInRegs > 1); + + uint32_t width = INS_OperandWidth(instr.ins, 1); + uint32_t lat = 0; + switch (width) { + case 8: + lat = 15; + break; + case 16: + lat = 19; + break; + case 32: + lat = 23; + break; + case 64: + lat = 63; + break; + default: + panic("emitDiv: Invalid reg size"); + } + uint8_t extraSlots = lat-1; + if (srcs == 3 && dsts == 3) { + emitLoads(instr, uops); + + emitExecUop(srcRegs[0], srcRegs[1], REG_EXEC_TEMP, 0, uops, lat, PORTS_015, extraSlots); + emitExecUop(srcRegs[0], srcRegs[2], REG_EXEC_TEMP+1, 0, uops, lat, PORTS_015, extraSlots); + emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[0], dstRegs[1], uops, 1, PORTS_015); //quotient and remainder + emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[2], 0, uops, 1, PORTS_015); //flags + + emitStores(instr, uops); + } else if (srcs <= 2 && dsts <= 2) { + emitBasicOp(instr, uops, lat, PORTS_015, extraSlots); + } else { + reportUnhandledCase(instr, "emitDiv"); + } +} + +//Helper function +static bool dropRegister(uint32_t targetReg, uint32_t* regs, uint32_t& numRegs) { + for (uint32_t i = 0; i < numRegs; i++) { + uint32_t reg = regs[i]; + if (reg == targetReg) { + //Shift rest of regs + for (uint32_t j = i; j < numRegs - 1; j++) regs[j] = regs[j+1]; + numRegs--; + return true; + } + } + return false; +} + +void Decoder::dropStackRegister(Instr& instr) { + bool dropIn = dropRegister(REG_RSP, instr.inRegs, instr.numInRegs); + bool dropOut = dropRegister(REG_RSP, instr.outRegs, instr.numOutRegs); + if (!dropIn && !dropOut) /*reportUnhandledCase(instr, "dropStackRegister (no RSP found)")*/; + else reportUnhandledCase(instr, "dropStackRegister (RSP found)"); +} + + +bool Decoder::decodeInstr(INS ins, DynUopVec& 
uops) { + uint32_t initialUops = uops.size(); + bool inaccurate = false; + xed_category_enum_t category = (xed_category_enum_t) INS_Category(ins); + xed_iclass_enum_t opcode = (xed_iclass_enum_t) INS_Opcode(ins); + + Instr instr(ins); + + bool isLocked = false; + if (INS_LockPrefix(instr.ins)) { + isLocked = true; + emitFence(uops, 0); //serialize the initial load w.r.t. all prior stores + } + + + switch (category) { + //NOPs are optimized out in the execution pipe, but they still grab a ROB entry + case XC(NOP): + case XC(WIDENOP): + emitExecUop(0, 0, 0, 0, uops, 1, PORTS_015); + break; + + /* Moves */ + case XC(DATAXFER): + switch (opcode) { + case XO(BSWAP): + emitBasicMove(instr, uops, 1, PORT_1); + break; + case XO(MOV): + emitBasicMove(instr, uops, 1, PORTS_015); + break; + case XO(MOVAPS): + case XO(MOVAPD): + case XO(MOVUPS): + case XO(MOVUPD): + case XO(MOVSS): + case XO(MOVSD): + case XO(MOVSD_XMM): + case XO(MOVHLPS): + case XO(MOVLHPS): + case XO(MOVDDUP): + case XO(MOVSHDUP): + case XO(MOVSLDUP): + emitBasicMove(instr, uops, 1, PORT_5); + break; + case XO(MOVHPS): + case XO(MOVHPD): + case XO(MOVLPS): + case XO(MOVLPD): + //A bit unclear... could be 2 or 3 cycles, and current microbenchmarks are not enough to tell + emitBasicOp(instr, uops, /*2*/ 1, PORT_5); + break; + case XO(MOVMSKPS): + case XO(MOVMSKPD): + emitBasicMove(instr, uops, 1, PORT_0); + break; + case XO(MOVD): + case XO(MOVQ): + case XO(MOVDQA): + case XO(MOVDQU): + case XO(MOVDQ2Q): + case XO(MOVQ2DQ): + emitBasicMove(instr, uops, 1, PORTS_015); //like mov + break; + case XO(MOVSX): + case XO(MOVSXD): + case XO(MOVZX): + emitBasicMove(instr, uops, 1, PORTS_015); //like mov + break; + case XO(XCHG): + emitXchg(instr, uops); + break; + default: + //TODO: MASKMOVQ, MASKMOVDQ, MOVBE (Atom only), MOVNTxx variants (nontemporal), MOV_CR and MOV_DR (privileged?), VMOVxxxx variants (AVX) + inaccurate = true; + emitBasicMove(instr, uops, 1, PORTS_015); + } + break; + + case XC(CMOV): + emitConditionalMove(instr, uops, 1, PORTS_015); + break; + case XC(FCMOV): + emitConditionalMove(instr, uops, 1, PORT_0); + break; + + /* Barebones arithmetic instructions */ + case XC(BINARY): + { + if (opcode == XO(ADC) || opcode == XO(SBB)) { + uint32_t lats[] = {1, 1}; + uint8_t ports[] = {PORTS_015, PORTS_015}; + emitChainedOp(instr, uops, 2, lats, ports); + } else if (opcode == XO(MUL) || opcode == XO(IMUL)) { + emitMul(instr, uops); + } else if (opcode == XO(DIV) || opcode == XO(IDIV)) { + emitDiv(instr, uops); + } else { + //ADD, SUB, CMP, DEC, INC, NEG are 1 cycle + emitBasicOp(instr, uops, 1, PORTS_015); + } + } + break; + case XC(BITBYTE): + { + uint32_t opLat = 1; + switch (opcode) { + case XO(BSF): + case XO(BSR): + opLat = 3; + break; + //TODO: EXTRQ, INSERTQ, LZCNT + default: {} //BT, BTx, SETcc ops are 1 cycle + } + emitBasicOp(instr, uops, opLat, PORTS_015); + } + break; + case XC(LOGICAL): + //AND, OR, XOR, TEST are 1 cycle + emitBasicOp(instr, uops, 1, PORTS_015); + break; + case XC(ROTATE): + { + uint32_t opLat = 1; //ROR, ROL 1 cycle + if (opcode == XO(RCR) || opcode == XO(RCL)) opLat = 2; + emitBasicOp(instr, uops, opLat, PORT_0 | PORT_5); + } + break; + case XC(SHIFT): + { + if (opcode == XO(SHLD)|| opcode == XO(SHRD)) { + uint32_t lats[] = {2, opcode == XO(SHLD)? 
1u : 2u}; //SHRD takes 4 cycles total, SHLD takes 3 + uint8_t ports[] = {PORTS_015, PORTS_015}; + emitChainedOp(instr, uops, 2, lats, ports); + } else { + uint32_t opLat = 1; //SHR SHL SAR are 1 cycle + emitBasicOp(instr, uops, opLat, PORT_0 | PORT_5); + } + } + break; + case XC(DECIMAL): //pack/unpack BCD, these seem super-deprecated + { + uint32_t opLat = 1; + switch (opcode) { + case XO(AAA): + case XO(AAS): + case XO(DAA): + case XO(DAS): + opLat = 3; + break; + case XO(AAD): + opLat = 15; + break; + case XO(AAM): + opLat = 20; + break; + default: + panic("Invalid opcode for this class"); + } + emitBasicOp(instr, uops, opLat, PORTS_015); + } + break; + case XC(FLAGOP): + switch (opcode) { + case XO(LAHF): + case XO(SAHF): + emitBasicOp(instr, uops, 1, PORTS_015); + break; + case XO(CLC): + case XO(STC): + case XO(CMC): + emitBasicOp(instr, uops, 1, PORTS_015); + break; + case XO(CLD): + emitExecUop(0, 0, REG_EXEC_TEMP, 0, uops, 2, PORTS_015); + emitExecUop(REG_EXEC_TEMP, 0, REG_RFLAGS, 0, uops, 2, PORTS_015); + break; + case XO(STD): + emitExecUop(0, 0, REG_EXEC_TEMP, 0, uops, 3, PORTS_015); + emitExecUop(REG_EXEC_TEMP, 0, REG_RFLAGS, 0, uops, 2, PORTS_015); + break; + default: + inaccurate = true; + } + break; + + case XC(SEMAPHORE): //atomic ops, these must involve memory + //reportUnhandledCase(instr, "SEM"); + //emitBasicOp(instr, uops, 1, PORTS_015); + + switch (opcode) { + case XO(CMPXCHG): + case XO(CMPXCHG8B): + //case XO(CMPXCHG16B): //not tested... + emitCompareAndExchange(instr, uops); + break; + case XO(XADD): + { + uint32_t lats[] = {2, 2}; + uint8_t ports[] = {PORTS_015, PORTS_015}; + emitChainedOp(instr, uops, 2, lats, ports); + } + break; + default: + inaccurate = true; + } + break; + + /* FP, SSE and other extensions */ + case /*XC(X)87_ALU*/ XC(X87_ALU): + //emitBasicOp(instr, uops, 1, PORTS_015); + break; + + case XED_CATEGORY_3DNOW: + //emitBasicOp(instr, uops, 1, PORTS_015); + break; + + case XC(MMX): + //emitBasicOp(instr, uops, 1, PORTS_015); + break; + + case XC(SSE): + { + //TODO: Multi-uop BLENDVXX, DPXX + + uint32_t lat = 1; + uint8_t ports = PORTS_015; + uint8_t extraSlots = 0; + switch (opcode) { + case XO(ADDPD): + case XO(ADDPS): + case XO(ADDSD): + case XO(ADDSS): + case XO(SUBPD): + case XO(SUBPS): + case XO(SUBSD): + case XO(SUBSS): + case XO(ADDSUBPD): + case XO(ADDSUBPS): + lat = 3; + ports = PORT_1; + break; + + case XO(BLENDPS): + case XO(BLENDPD): + case XO(SHUFPS): + case XO(SHUFPD): + case XO(UNPCKHPD): + case XO(UNPCKHPS): + case XO(UNPCKLPD): + case XO(UNPCKLPS): + lat = 1; + ports = PORT_5; + break; + + case XO(CMPPD): + case XO(CMPPS): + case XO(CMPSD): + case XO(CMPSS): + lat = 3; + ports = PORT_1; + break; + + case XO(COMISD): + case XO(COMISS): + case XO(UCOMISD): + case XO(UCOMISS): + lat = 1+2; //writes rflags, always crossing xmm -> int domains + ports = PORT_1; + break; + + case XO(DIVPS): + case XO(DIVSS): + lat = 7; //from mubench + ports = PORT_0; + extraSlots = lat - 1; //non-pipelined + break; + case XO(DIVPD): + case XO(DIVSD): + lat = 7; //from mubench + ports = PORT_0; //non-pipelined + extraSlots = lat - 1; + break; + + case XO(MAXPD): + case XO(MAXPS): + case XO(MAXSD): + case XO(MAXSS): + case XO(MINPD): + case XO(MINPS): + case XO(MINSD): + case XO(MINSS): + lat = 3; + ports = PORT_1; + break; + + case XO(MULSS): + case XO(MULPS): + lat = 4; + ports = PORT_0; + break; + case XO(MULSD): + case XO(MULPD): + lat = 5; + ports = PORT_0; + break; + + case XO(RCPPS): + case XO(RCPSS): + lat = 3; + ports = PORT_1; + break; + + 
case XO(ROUNDPD): + case XO(ROUNDPS): + case XO(ROUNDSD): + case XO(ROUNDSS): + lat = 3; + ports = PORT_1; + break; + + case XO(RSQRTPS): + case XO(RSQRTSS): + lat = 3; + ports = PORT_1; + extraSlots = 1; //from mubench, has reciprocal thput of 2 + break; + + case XO(SQRTSS): + case XO(SQRTPS): + lat = 7; //from mubench + ports = PORT_0; + extraSlots = lat-1; //unpiped + break; + + case XO(SQRTSD): + case XO(SQRTPD): + lat = 7; //from mubench + ports = PORT_0; + extraSlots = lat-1; //unpiped + break; + + case XO(POPCNT): + case XO(CRC32): + lat = 3; + ports = PORT_1; + break; + + //Packed arith; these are rare, so I'm implementing only what I've seen used (and simple variants) + case XO(PADDB): + case XO(PADDD): + case XO(PADDQ): + case XO(PADDSB): + case XO(PADDSW): + case XO(PADDUSB): + case XO(PADDUSW): + case XO(PADDW): + case XO(PSUBB): + case XO(PSUBD): + case XO(PSUBQ): + case XO(PSUBSB): + case XO(PSUBSW): + case XO(PSUBUSB): + case XO(PSUBUSW): + case XO(PSUBW): + + case XO(PALIGNR): + + case XO(PCMPEQB): + case XO(PCMPEQD): + case XO(PCMPEQQ): + case XO(PCMPEQW): + case XO(PCMPGTB): + case XO(PCMPGTD): + case XO(PCMPGTW): + + case XO(PUNPCKHBW): + case XO(PUNPCKHDQ): + case XO(PUNPCKHQDQ): + case XO(PUNPCKHWD): + case XO(PUNPCKLBW): + case XO(PUNPCKLDQ): + case XO(PUNPCKLQDQ): + case XO(PUNPCKLWD): + + case XO(PSHUFB): + case XO(PSHUFD): + case XO(PSHUFHW): + case XO(PSHUFLW): + lat = 1; + ports = PORT_0 | PORT_5; + break; + + case XO(PCMPGTQ): //weeeird, only packed comparison that's done differently + lat = 3; + ports = PORT_1; + break; + + case XO(PMOVMSKB): + lat = 2+2; + ports = PORT_0; + break; + + default: + inaccurate = true; + } + emitBasicOp(instr, uops, lat, ports, extraSlots); + } + break; + + case XC(STTNI): //SSE 4.2 + break; + + case XC(CONVERT): //part of SSE + switch (opcode) { + case XO(CVTPD2PS): + case XO(CVTSD2SS): + emitConvert2Op(instr, uops, 2, 2, PORT_1, PORT_5); + break; + case XO(CVTPS2PD): + emitConvert2Op(instr, uops, 1, 1, PORT_0, PORT_5); + break; + case XO(CVTSS2SD): + emitBasicOp(instr, uops, 1, PORT_0); + break; + case XO(CVTDQ2PS): + case XO(CVTPS2DQ): + case XO(CVTTPS2DQ): + emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); + break; + case XO(CVTDQ2PD): + case XO(CVTPD2DQ): + case XO(CVTTPD2DQ): + emitConvert2Op(instr, uops, 2, 2+2 /*domain change*/, PORT_1, PORT_5); + break; + case XO(CVTPI2PS): + case XO(CVTPS2PI): + case XO(CVTTPS2PI): + emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); + break; + case XO(CVTPI2PD): + case XO(CVTPD2PI): + case XO(CVTTPD2PI): + emitConvert2Op(instr, uops, 2, 2+2 /*domain change*/, PORT_1, PORT_0 | PORT_5); + break; + case XO(CVTSI2SS): + case XO(CVTSS2SI): + case XO(CVTTSS2SI): + emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); + break; + case XO(CVTSI2SD): + emitConvert2Op(instr, uops, 2, 2+2 /*domain change*/, PORT_1, PORT_0); + break; + case XO(CVTSD2SI): + case XO(CVTTSD2SI): + emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); + break; + case XO(CBW): + case XO(CWDE): + case XO(CDQE): + emitBasicOp(instr, uops, 1, PORTS_015); + break; + case XO(CWD): + case XO(CDQ): + case XO(CQO): + emitBasicOp(instr, uops, 1, PORT_0 | PORT_5); + break; + + default: // AVX converts + inaccurate = true; + } + break; + + case XC(AVX): + //TODO: Whatever, Nehalem has no AVX + break; + + case XC(BROADCAST): //part of AVX + //TODO: Same as AVX + break; + + case XC(AES): + break; + + case XC(PCLMULQDQ): //CLMUL extension (carryless multiply, generally related to AES-NI) + break; + + case XC(XSAVE): 
+ case XC(XSAVEOPT): //hold your horses, it's optimized!! (AVX) + break; + + /* Control flow ops (branches, jumps) */ + case XC(COND_BR): + case XC(UNCOND_BR): + // We model all branches and jumps with a latency of 1. Far jumps are really expensive, but they should be exceedingly rare (from Intel's manual, they are used for call gates, task switches, etc.) + emitBasicOp(instr, uops, 1, PORT_5); + if (opcode == XO(JMP_FAR)) inaccurate = true; + break; + + /* Stack operations */ + case XC(CALL): + case XC(RET): + /* Call and ret are both unconditional branches and stack operations; however, Pin does not list RSP as source or destination for them */ + //dropStackRegister(instr); //stack engine kills accesses to RSP + emitBasicOp(instr, uops, 1, PORT_5); + if (opcode != XO(CALL_NEAR) && opcode != XO(RET_NEAR)) inaccurate = true; //far call/ret or irets are far more complex + break; + + case XC(POP): + case XC(PUSH): + //Again, RSP is not included here, so no need to remove it. + switch (opcode) { + case XO(POP): + case XO(PUSH): + //Basic PUSH/POP are just moves. They are always to/from memory, so PORTS is irrelevant + emitBasicMove(instr, uops, 1, PORTS_015); + break; + case XO(POPF): + case XO(POPFD): + case XO(POPFQ): + //Java uses POPFx/PUSHFx variants. POPF is complicated, 8 uops... microsequenced + inaccurate = true; + emitBasicOp(instr, uops, 14, PORTS_015); + break; + case XO(PUSHF): + case XO(PUSHFD): + case XO(PUSHFQ): + //This one we can handle... 2 exec uops + store and reciprocal thput of 1 + { + uint32_t lats[] = {1, 1}; + uint8_t ports[] = {PORTS_015, PORTS_015}; + emitChainedOp(instr, uops, 2, lats, ports); + } + break; + + default: + inaccurate = true; + } + break; + + /* Prefetches */ + case XC(PREFETCH): + //A prefetch is just a load that doesn't feed into any register (or REG_TEMP in this case) + //NOTE: Not exactly, because this will serialize future loads under TSO + emitLoads(instr, uops); + break; + + /* Stuff on the system side (some of these are privileged) */ + case XC(INTERRUPT): + case XC(SYSCALL): + case XC(SYSRET): + case XC(IO): + break; + + case XC(SYSTEM): + //TODO: Privileged ops are not included + /*switch(opcode) { + case XO(RDTSC): + case XO(RDTSCP): + opLat = 24; + break; + case XO(RDPMC): + opLat = 40; + break; + default: ; + }*/ + break; + + case XC(SEGOP): + //TODO: These are privileged, right? They are expensive but rare anyhow + break; + + case XC(VTX): //virtualization, hmmm + //TODO + break; + + + /* String ops (I'm reading the manual and they seem just like others... wtf?) 
*/ + case XC(STRINGOP): + switch (opcode) { + case XO(STOSB): + case XO(STOSW): + case XO(STOSD): + case XO(STOSQ): + //mov [rdi] <- rax + //add rdi, 8 + //emitBasicOp(instr, uops, 1, PORTS_015); //not really, this emits the store later and there's no dep (the load is direct to reg) + emitStore(instr, 0, uops, REG_RAX); + emitExecUop(REG_RDI, 0, REG_RDI, 0, uops, 1, PORTS_015); + break; + case XO(LODSB): + case XO(LODSW): + case XO(LODSD): + case XO(LODSQ): + //mov rax <- [rsi] + //add rsi, 8 + emitLoad(instr, 0, uops, REG_RAX); + emitExecUop(REG_RSI, 0, REG_RSI, 0, uops, 1, PORTS_015); + break; + case XO(MOVSB): + case XO(MOVSW): + case XO(MOVSD): + case XO(MOVSQ): + //lodsX + stosX + emitLoad(instr, 0, uops, REG_RAX); + emitStore(instr, 0, uops, REG_RAX); + emitExecUop(REG_RSI, 0, REG_RSI, 0, uops, 1, PORTS_015); + emitExecUop(REG_RDI, 0, REG_RDI, 0, uops, 1, PORTS_015); + break; + case XO(CMPSB): + case XO(CMPSW): + case XO(CMPSD): + case XO(CMPSQ): + //load [rsi], [rdi], compare them, and add the other 2 + //Agner's tables say all exec uops can go anywhere, but I'm betting the comp op only goes in port5 + emitLoad(instr, 0, uops, REG_LOAD_TEMP); + emitLoad(instr, 0, uops, REG_LOAD_TEMP+1); + emitExecUop(REG_LOAD_TEMP, REG_LOAD_TEMP+1, REG_RFLAGS, 0, uops, 1, PORT_5); + emitExecUop(REG_RSI, 0, REG_RSI, 0, uops, 1, PORTS_015); + emitExecUop(REG_RDI, 0, REG_RDI, 0, uops, 1, PORTS_015); + break; + default: //SCAS and other dragons I have not seen yet + inaccurate = true; + } + break; + case XC(IOSTRINGOP): + //TODO: These seem to make sense with REP, which Pin unfolds anyway. Are they used al all? + break; + + /* Stuff not even the Intel guys know how to classify :P */ + case XC(MISC): + if (opcode == XO(LEA)) { + emitBasicOp(instr, uops, 1, PORT_1); + } else if (opcode == XO(PAUSE)) { + //Pause is weird. It takes 9 cycles, issues 5 uops (to be treated like a complex instruction and put a wrench on the decoder?), + //and those uops are issued to PORT_015. No idea about how individual uops are sized, but in ubenchs I cannot put even an ADD + //between pauses for free, so I'm assuming it's 9 solid cycles total. + emitExecUop(0, 0, 0, 0, uops, 9, PORTS_015, 8); //9, longest first + emitExecUop(0, 0, 0, 0, uops, 5, PORTS_015, 4); //NOTE: latency does not matter + emitExecUop(0, 0, 0, 0, uops, 5, PORTS_015, 4); + emitExecUop(0, 0, 0, 0, uops, 4, PORTS_015, 3); + emitExecUop(0, 0, 0, 0, uops, 4, PORTS_015, 3); + } + /*switch (opcode) { + case CPUID: + case ENTER: + case LEAVE: + case LEA: + case LFENCE: + case MFENCE: + case SFENCE: + case MONITOR: + case MWAIT: + case UD2: + case XLAT: + }*/ + //TODO + break; + + default: {} + //panic("Invalid instruction category"); + } + + //Try to produce something approximate... + if (uops.size() - initialUops == isLocked? 
1 : 0) { //if it's locked, we have the initial fence for an empty instr + emitBasicOp(instr, uops, 1, PORTS_015, 0, false /* don't report unhandled cases */); + inaccurate = true; + } + + //NOTE: REP instructions are unrolled by PIN, so they are accurately simulated (they are treated as predicated in Pin) + //See section "Optimizing Instrumentation of REP Prefixed Instructions" on the Pin manual + + //Add ld/st fence to all locked instructions + if (isLocked) { + //inaccurate = true; //this is now fairly accurate + emitFence(uops, 9); //locked ops introduce an additional uop and cache locking takes 14 cycles/instr per the perf counters; latencies match with 9 cycles of fence latency + } + + assert(uops.size() - initialUops < MAX_UOPS_PER_INSTR); + //assert_msg(uops.size() - initialUops < MAX_UOPS_PER_INSTR, "%ld -> %ld uops", initialUops, uops.size()); + return inaccurate; +} + +// See Agner Fog's uarch doc, macro-op fusion for Core 2 / Nehalem +bool Decoder::canFuse(INS ins) { + xed_iclass_enum_t opcode = (xed_iclass_enum_t) INS_Opcode(ins); + if (!(opcode == XO(CMP) || opcode == XO(TEST))) return false; + //Discard if immediate + for (uint32_t op = 0; op < INS_OperandCount(ins); op++) if (INS_OperandIsImmediate(ins, op)) return false; + + //OK so far, let's check the branch + INS nextIns = INS_Next(ins); + if (!INS_Valid(nextIns)) return false; + xed_iclass_enum_t nextOpcode = (xed_iclass_enum_t) INS_Opcode(nextIns); + xed_category_enum_t nextCategory = (xed_category_enum_t) INS_Category(nextIns); + if (nextCategory != XC(COND_BR)) return false; + if (!INS_IsDirectBranch(nextIns)) return false; //according to PIN's API, this s only true for PC-rel near branches + + switch (nextOpcode) { + case XO(JZ): //or JZ + case XO(JNZ): //or JNE + case XO(JB): + case XO(JBE): + case XO(JNBE): //or JA + case XO(JNB): //or JAE + case XO(JL): + case XO(JLE): + case XO(JNLE): //or JG + case XO(JNL): //or JGE + return true; + case XO(JO): + case XO(JNO): + case XO(JP): + case XO(JNP): + case XO(JS): + case XO(JNS): + return opcode == XO(TEST); //CMP cannot fuse with these + default: + return false; //other instrs like LOOP don't fuse + } +} + +bool Decoder::decodeFusedInstrs(INS ins, DynUopVec& uops) { + //assert(canFuse(ins)); //this better be true :) + + Instr instr(ins); + Instr branch(INS_Next(ins)); + + //instr should have 2 inputs (regs/mem), and 1 output (rflags), and branch should have 2 inputs (rip, rflags) and 1 output (rip) + + if (instr.numOutRegs != 1 || instr.outRegs[0] != REG_RFLAGS || + branch.numOutRegs != 1 || branch.outRegs[0] != REG_RIP) + { + reportUnhandledCase(instr, "decodeFusedInstrs"); + reportUnhandledCase(branch, "decodeFusedInstrs"); + } else { + instr.outRegs[1] = REG_RIP; + instr.numOutRegs++; + } + + emitBasicOp(instr, uops, 1, PORT_5); + return false; //accurate +} + + +#ifdef BBL_PROFILING + +//All is static for now... 
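+//Bookkeeping used by the profiling below: bblIdx hands out a unique index per
+//decoded BBL (under bblIdxLock), bblCount[] is incremented atomically by
+//profileBbl() each time a BBL executes, and bblApproxOpcodes[] records the XED
+//iforms of the inaccurately decoded instructions in that BBL, so that
+//dumpBblProfile() can weight each iform by how often it actually ran.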
+#define MAX_BBLS (1<<24) //16M + +static lock_t bblIdxLock = 0; +static uint64_t bblIdx = 0; + +static uint64_t bblCount[MAX_BBLS]; +static std::vector* bblApproxOpcodes[MAX_BBLS]; + +#endif + +BblInfo* Decoder::decodeBbl(BBL bbl, bool oooDecoding) { + uint32_t instrs = BBL_NumIns(bbl); + uint32_t bytes = BBL_Size(bbl); + BblInfo* bblInfo; + + if (oooDecoding) { + //Decode BBL + uint32_t approxInstrs = 0; + uint32_t curIns = 0; + DynUopVec uopVec; + +#ifdef BBL_PROFILING + std::vector approxOpcodes; + + //XED decoder init + xed_state_t dstate; + xed_decoded_inst_t xedd; + xed_state_zero(&dstate); + xed_state_init(&dstate, XED_MACHINE_MODE_LONG_64, XED_ADDRESS_WIDTH_64b, XED_ADDRESS_WIDTH_64b); + xed_decoded_inst_zero_set_mode(&xedd, &dstate); +#endif + + //Gather some info about instructions needed to model decode stalls + std::vector instrAddr; + std::vector instrBytes; + std::vector instrUops; + std::vector instrDesc; + + //Decode + for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins)) { + bool inaccurate = false; + uint32_t prevUops = uopVec.size(); + if (Decoder::canFuse(ins)) { + inaccurate = Decoder::decodeFusedInstrs(ins, uopVec); + instrAddr.push_back(INS_Address(ins)); + instrBytes.push_back(INS_Size(ins)); + instrUops.push_back(uopVec.size() - prevUops); + instrDesc.push_back(ins); + + ins = INS_Next(ins); //skip the JMP + + instrAddr.push_back(INS_Address(ins)); + instrBytes.push_back(INS_Size(ins)); + instrUops.push_back(0); + instrDesc.push_back(ins); + + curIns+=2; + } else { + inaccurate = Decoder::decodeInstr(ins, uopVec); + + instrAddr.push_back(INS_Address(ins)); + instrBytes.push_back(INS_Size(ins)); + instrUops.push_back(uopVec.size() - prevUops); + instrDesc.push_back(ins); + + curIns++; + } +#ifdef PROFILE_ALL_INSTRS + inaccurate = true; //uncomment to profile everything +#endif + if (inaccurate) { + approxInstrs++; +#ifdef BBL_PROFILING + xed_decoded_inst_zero_keep_mode(&xedd); //need to do this per instruction + xed_iform_enum_t iform = XED_IFORM_INVALID; + uint8_t buf[16]; + //Using safecopy, we bypass pagefault uglyness due to out-of-bounds accesses + size_t insBytes = PIN_SafeCopy(buf, INS_Address(ins), 15); + xed_error_enum_t err = xed_decode(&xedd, buf, insBytes); + if (err != XED_ERROR_NONE) { + panic("xed_decode failed: %s", xed_error_enum_t2str(err)); + } else { + iform = xed_decoded_inst_get_iform_enum(&xedd); + } + approxOpcodes.push_back((uint32_t)iform); +#endif + //info("Approx decoding: %s", INS_Disassemble(ins).c_str()); + } + } + assert(curIns == instrs); + + //Instr predecoder and decode stage modeling; we assume clean slate between BBLs, which is typical because + //optimizing compilers 16B-align most branch targets (and if it doesn't happen, the error introduced is fairly small) + + //1. 
Predecoding + uint32_t predecCycle[instrs]; + uint32_t pcyc = 0; + uint32_t psz = 0; + uint32_t pcnt = 0; + uint32_t pblk = 0; + + ADDRINT startAddr = (INS_Address(instrDesc[0]) >> 4) << 4; + + for (uint32_t i = 0; i < instrs; i++) { + INS ins = instrDesc[i]; + ADDRINT addr = INS_Address(ins); + uint32_t bytes = INS_Size(ins); + uint32_t block = (addr - startAddr) >> 4; + psz += bytes; + pcnt++; + if (psz > 16 /*leftover*/|| pcnt > 6 /*max predecs*/|| block > pblk /*block switch*/) { + psz = bytes; + pcnt = 1; + pblk = block; + pcyc++; + } + + //Length-changing prefix introduce a 6-cycle penalty regardless; + //In 64-bit mode, only operand size prefixes are LCPs; addr size prefixes are fine + // UPDATE (dsm): This was introducing significant errors in some benchmarks (e.g., astar) + // Turns out, only SOME LCPs (false LCPs) cause this delay + // see http://www.jaist.ac.jp/iscenter-new/mpc/altix/altixdata/opt/intel/vtune/doc/users_guide/mergedProjects/analyzer_ec/mergedProjects/reference_olh/pentiumm_hh/pentiummy_hh/lipsmy/instructions_that_require_slow_decoding.htm + // At this point I'm going to assume that gcc is smart enough to not produce these + //if (INS_OperandSizePrefix(ins)) pcyc += 6; + + predecCycle[i] = pcyc; + //info("PREDEC %2d: 0x%08lx %2d %d %d %d", i, instrAddr[i], instrBytes[i], instrUops[i], block, predecCycle[i]); + } + + //2. Decoding + //4-1-1-1 rules: Small decoders can only take instructions that produce 1 uop AND are at most 7 bytes long + uint32_t uopIdx = 0; + + uint32_t dcyc = 0; + uint32_t dsimple = 0; + uint32_t dcomplex = 0; + + for (uint32_t i = 0; i < instrs; i++) { + if (instrUops[i] == 0) continue; //fused branch + + uint32_t pcyc = predecCycle[i]; + if (pcyc > dcyc) { + dcyc = pcyc; + dsimple = 0; + dcomplex = 0; + } + + bool simple = (instrUops[i] == 1) && (instrBytes[i] < 8); + + if ((simple && dsimple + dcomplex == 4) || (!simple && dcomplex == 1)) { // Do: (!simple /*&& dcomplex == 1*/) to be conservative? 
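+ //Out of decoders this cycle: either the 4 decode slots are already used up by
+ //simple+complex instructions, or the single complex decoder is taken. Advance
+ //to the next decode cycle and reset both counts.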
+ dcyc++; + dsimple = 0; + dcomplex = 0; + } + + if (simple) dsimple++; + else dcomplex++; + + //info(" DEC %2d: 0x%08lx %2d %d %d %d (%d %d)", i, instrAddr[i], instrBytes[i], instrUops[i], simple, dcyc, dcomplex, dsimple); + + for (uint32_t j = 0; j < instrUops[i]; j++) { + uopVec[uopIdx + j].decCycle = dcyc; + } + + uopIdx += instrUops[i]; + } + + assert(uopIdx == uopVec.size()); + + //Allocate + uint32_t objBytes = offsetof(BblInfo, oooBbl) + DynBbl::bytes(uopVec.size()); + bblInfo = static_cast(gm_malloc(objBytes)); // can't use type-safe interface + + //Initialize ooo part + DynBbl& dynBbl = bblInfo->oooBbl[0]; + dynBbl.addr = BBL_Address(bbl); + dynBbl.uops = uopVec.size(); + dynBbl.approxInstrs = approxInstrs; + for (uint32_t i = 0; i < dynBbl.uops; i++) dynBbl.uop[i] = uopVec[i]; + +#ifdef BBL_PROFILING + futex_lock(&bblIdxLock); + dynBbl.bblIdx = bblIdx++; + assert(dynBbl.bblIdx < MAX_BBLS); + if (approxInstrs) { + bblApproxOpcodes[dynBbl.bblIdx] = new std::vector(approxOpcodes); // copy + } + //info("DECODED BBL IDX %d", bblIdx); + + futex_unlock(&bblIdxLock); +#endif + } else { + bblInfo = gm_malloc(); + } + + //Initialize generic part + bblInfo->instrs = instrs; + bblInfo->bytes = bytes; + + return bblInfo; +} + + +#ifdef BBL_PROFILING +void Decoder::profileBbl(uint64_t bblIdx) { + assert(bblIdx < MAX_BBLS); + __sync_fetch_and_add(&bblCount[bblIdx], 1); +} + +void Decoder::dumpBblProfile() { + uint32_t numOpcodes = xed_iform_enum_t_last() + 1; + uint64_t approxOpcodeCount[numOpcodes]; + for (uint32_t i = 0; i < numOpcodes; i++) approxOpcodeCount[i] = 0; + for (uint32_t i = 0; i < bblIdx; i++) { + if (bblApproxOpcodes[i]) for (uint32_t& j : *bblApproxOpcodes[i]) approxOpcodeCount[j] += bblCount[i]; + } + + std::ofstream out("approx_instrs.stats"); + out << std::setw(16) << "Category" << std::setw(16) << "Iclass" << std::setw(32) << "Iform" << std::setw(16) << "Count" << std::endl; + for (uint32_t i = 0; i < numOpcodes; i++) { + if (approxOpcodeCount[i]) { + //out << xed_iclass_enum_t2str((xed_iclass_enum_t)i) << "\t " << approxOpcodeCount[i] << std::endl; + xed_iform_enum_t iform = (xed_iform_enum_t)i; + xed_category_enum_t cat = xed_iform_to_category(iform); + xed_iclass_enum_t iclass = xed_iform_to_iclass(iform); + + out << std::setw(16) << xed_category_enum_t2str(cat) << std::setw(16) << xed_iclass_enum_t2str(iclass) << std::setw(32) << xed_iform_enum_t2str(iform) << std::setw(16) << approxOpcodeCount[i] << std::endl; + } + } + + //Uncomment to dump a bbl profile + //for (uint32_t i = 0; i < bblIdx; i++) out << std::setw(8) << i << std::setw(8) << bblCount[i] << std::endl; + + out.close(); +} + +#endif + diff --git a/src/decoder.h b/src/decoder.h new file mode 100644 index 00000000..f7e496ac --- /dev/null +++ b/src/decoder.h @@ -0,0 +1,191 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef DECODER_H_ +#define DECODER_H_ + +#include +#include +#include "pin.H" + +// Uncomment to get a count of BBLs run. This is currently used to get a distribution of inaccurate instructions decoded that are actually run +// NOTE: This is not multiprocess-safe +// #define BBL_PROFILING +// #define PROFILE_ALL_INSTRS + +// uop reg limits +#define MAX_UOP_SRC_REGS 2 +#define MAX_UOP_DST_REGS 2 + +/* NOTE this uses stronly typed enums, a C++11 feature. This saves a bunch of typecasts while keeping UopType enums 1-byte long. + * If you use gcc < 4.6 or some other compiler, either go back to casting or lose compactness in the layout. + */ +enum UopType : uint8_t {UOP_GENERAL, UOP_LOAD, UOP_STORE, UOP_STORE_ADDR, UOP_FENCE}; + +struct DynUop { + uint16_t rs[MAX_UOP_SRC_REGS]; + uint16_t rd[MAX_UOP_DST_REGS]; + uint16_t lat; + uint16_t decCycle; + UopType type; //1 byte + uint8_t portMask; + uint8_t extraSlots; //FU exec slots + uint8_t pad; //pad to 4-byte multiple + + void clear(); +}; // 16 bytes. TODO(dsm): check performance with wider operands + +struct DynBbl { +#ifdef BBL_PROFILING + uint64_t bblIdx; +#endif + uint64_t addr; + uint32_t uops; + uint32_t approxInstrs; + DynUop uop[1]; + + static uint32_t bytes(uint32_t uops) { + return offsetof(DynBbl, uop) + sizeof(DynUop)*uops /*wtf... offsetof doesn't work with uop[uops]*/; + } + + void init(uint64_t _addr, uint32_t _uops, uint32_t _approxInstrs) { + // NOTE: this is a POD type, so we don't need to call a constructor; otherwise, we should use placement new + uops = _uops; + approxInstrs = _approxInstrs; + } +}; + +struct BblInfo; // defined in core.h + +/* These are absolute maximums per instruction. If there is some non-conforming instruction, either increase these limits or + * treat it as a special case. + */ +#define MAX_INSTR_LOADS 4 +#define MAX_INSTR_REG_READS 4 +#define MAX_INSTR_REG_WRITES 4 +#define MAX_INSTR_STORES 4 + +#define MAX_UOPS_PER_INSTR 12 // technically, even full decoders produce 1-4 uops; we increase this for common microsequenced instructions (e.g. xchg). + +/* Temporary register offsets */ +#define REG_LOAD_TEMP (REG_LAST + 1) // REG_LAST defined by PIN +#define REG_STORE_TEMP (REG_LOAD_TEMP + MAX_INSTR_LOADS) +#define REG_STORE_ADDR_TEMP (REG_STORE_TEMP + MAX_INSTR_STORES) +#define REG_EXEC_TEMP (REG_STORE_ADDR_TEMP + MAX_INSTR_STORES) + +#define MAX_REGISTERS (REG_EXEC_TEMP + 64) + +typedef std::vector DynUopVec; + +//Nehalem-style decoder. 
Fully static for now +class Decoder { + private: + struct Instr { + INS ins; + + uint32_t loadOps[MAX_INSTR_LOADS]; + uint32_t numLoads; + + //These contain the register indices; by convention, flags registers are stored last + uint32_t inRegs[MAX_INSTR_REG_READS]; + uint32_t numInRegs; + uint32_t outRegs[MAX_INSTR_REG_WRITES]; + uint32_t numOutRegs; + + uint32_t storeOps[MAX_INSTR_STORES]; + uint32_t numStores; + + explicit Instr(INS _ins); + + private: + //Put registers in some canonical order -- non-flags first + void reorderRegs(uint32_t* regArray, uint32_t numRegs); + }; + + public: + //If oooDecoding is true, produces a DynBbl with DynUops that can be used in OOO cores + static BblInfo* decodeBbl(BBL bbl, bool oooDecoding); + +#ifdef BBL_PROFILING + static void profileBbl(uint64_t bblIdx); + static void dumpBblProfile(); +#endif + + private: + //Return true if inaccurate decoding, false if accurate + static bool decodeInstr(INS ins, DynUopVec& uops); + + /* Every emit function can produce 0 or more uops; it returns the number of uops. These are basic templates to make our life easier */ + + //By default, these emit to temporary registers that depend on the index; this can be overriden, e.g. for moves + static void emitLoad(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t destReg = 0); + static void emitStore(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t srcReg = 0); + + //Emit all loads and stores for this uop + static void emitLoads(Instr& instr, DynUopVec& uops); + static void emitStores(Instr& instr, DynUopVec& uops); + + //Emits a load-store fence uop + static void emitFence(DynUopVec& uops, uint32_t lat); + + static void emitExecUop(uint32_t rs0, uint32_t rs1, uint32_t rd0, uint32_t rd1, + DynUopVec& uops, uint32_t lat, uint8_t ports, uint8_t extraSlots = 0); + + /* Instruction emits */ + + static void emitBasicMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports); + static void emitConditionalMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports); + + // 1 "exec" uop, 0-2 inputs, 0-2 outputs + static void emitBasicOp(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports, + uint8_t extraSlots = 0, bool reportUnhandled = true); + + // >1 exec uops in a chain: each uop takes 2 inputs, produces 1 output to the next op + // in the chain; the final op writes to the 0-2 outputs + static void emitChainedOp(Instr& instr, DynUopVec& uops, uint32_t numUops, + uint32_t* latArray, uint8_t* portsArray); + + // Some convert ops need 2 chained exec uops, though they have a single input and output + static void emitConvert2Op(Instr& instr, DynUopVec& uops, uint32_t lat1, uint32_t lat2, + uint8_t ports1, uint8_t ports2); + + /* Specific cases */ + static void emitXchg(Instr& instr, DynUopVec& uops); + static void emitMul(Instr& instr, DynUopVec& uops); + static void emitDiv(Instr& instr, DynUopVec& uops); + + static void emitCompareAndExchange(Instr&, DynUopVec&); + + /* Other helper functions */ + static void reportUnhandledCase(Instr& instr, const char* desc); + static void populateRegArrays(Instr& instr, uint32_t* srcRegs, uint32_t* dstRegs); + static void dropStackRegister(Instr& instr); + + /* Macro-op (ins) fusion */ + static bool canFuse(INS ins); + static bool decodeFusedInstrs(INS ins, DynUopVec& uops); +}; + +#endif // DECODER_H_ diff --git a/src/detailed_mem.cpp b/src/detailed_mem.cpp new file mode 100644 index 00000000..4593b0fe --- /dev/null +++ b/src/detailed_mem.cpp @@ -0,0 +1,1437 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts 
Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "detailed_mem.h" +#include "zsim.h" +#include "tick_event.h" +#include + +MemRankBase::MemRankBase(uint32_t _myId, uint32_t _parentId, uint32_t _bankCount) { + myId = _myId; + parentId = _parentId; + bankCount = _bankCount; + + bankinfo = gm_calloc(bankCount); + lastType = gm_calloc(bankCount); + lastRow = gm_calloc(bankCount); + lastActCycle = gm_calloc(bankCount); + lastRdWrCycle = gm_calloc(bankCount); + lastPreCycle = gm_calloc(bankCount); + tFAWCycle = gm_calloc(bankCount); + + lastBank = 0; + lastAccessCycle = 0; + lastRefreshCycle = 0; + refreshNum = 0; + accessInRefresh = 0; + tFAWIndex = 0; + + activateCount = 0; + prechargeCount = 0; + readBurstCount = 0; + writeBurstCount = 0; + + idlePowerDownCycle = 0; + actvPowerDownCycle = 0; + idleStandbyCycle = 0; + + prevIdlePowerDownCycle = 0; + prevActvPowerDownCycle = 0; + prevIdleStandbyCycle = 0; +} + +MemRankBase::~MemRankBase() { + gm_free(bankinfo); + gm_free(lastRow); + gm_free(lastType); + gm_free(lastActCycle); + gm_free(lastRdWrCycle); + gm_free(lastPreCycle); + gm_free(tFAWCycle); +} + +void MemRankBase::access(uint64_t accessCycle, uint64_t issuedCycle, uint32_t row, uint32_t col, uint32_t bank, MemAccessType type) { + // If the difference between read latency and write latency is large, + // a latter access may overtake the prior one by the scheduling in intraIssueCycleble. + // assert(lastAccessCycle < accessCycle); + lastAccessCycle = std::max(lastAccessCycle, accessCycle); + assert(lastRdWrCycle[bank] < issuedCycle); + lastRdWrCycle[bank] = issuedCycle; + lastRow[bank] = row; + lastType[bank] = type; + lastBank = bank; + + if (type == READ) { + IncReadBurstCount(); + } else { + IncWriteBurstCount(); + } +} + +void MemRankBase::refresh(uint64_t lastCycle) { + for (uint32_t i = 0; i < bankCount; i++) { + bankinfo[i] = false; + } + assert(lastRefreshCycle < lastCycle); + lastRefreshCycle = lastCycle; +} + +uint32_t MemRankBase::GetActiveBankCount(void) { + uint32_t count = 0; + for (uint32_t i = 0; i < bankCount; i++) { + count += (bankinfo[i] == true)? 
1 : 0; + } + return count; +} + +void MemRankBase::SaveBackgroundCycles(void) { + prevIdlePowerDownCycle = idlePowerDownCycle; + prevActvPowerDownCycle = actvPowerDownCycle; + prevIdleStandbyCycle = idleStandbyCycle; +} + + +MemChannelBase::MemChannelBase(uint32_t _myId, MemParam *_mParam) { + myId = _myId; + mParam = _mParam; + accessLog.reserve(mParam->accessLogDepth); + + uint32_t rankCount = mParam->rankCount; + ranks.resize(rankCount); + for(uint32_t i = 0; i< rankCount; i++) { + ranks[i] = new MemRankBase(i, myId, mParam->bankCount); + } +} + +MemChannelBase::~MemChannelBase(void) { + for(uint32_t i = 0; i< mParam->rankCount; i++) { + delete ranks[i]; + } +} + +bool MemChannelBase::IsRowBufferHit(uint32_t row, uint32_t rank, uint32_t bank) { + return ((ranks[rank]->GetBankOpen(bank) == true) && (ranks[rank]->GetLastRow(bank) == row)); +} + + +uint32_t MemChannelBase::UpdateRefreshNum(uint32_t rank, uint64_t arrivalCycle) { + ////////////////////////////////////////////////////////////////////// + // Auto Refresh Final Version //////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// + // + // ## tRPab = 0 in Close Policy + // + // < - - - - - - tREFI - - - - - > + // | tRPab tRFC tRPab tRFC + //-----------|---|-------|------------------------|---|-------|----- + // * * (*) (*)->*~~~~* * + // | |~~~>* | | | | | + // A1 A2 | B1 B2 B1 B2 C + // | + // accessCycle + // <-> + // diff (refOverlap) + // + // + // A: Access (A2) comes before refresh and the last access (A1) is + // in the same refresh period + // => No refresh penalty for A1 (normal access) + // => if A2 latency overlaps on refresh period (tRPab+tRFC), + // all the access are shifted to the end of refresh (Case B) + // => refOverlap is added to the head of B1 as additional constraint + // to get pseudo refresh shift by A2 latency + // B: 2 or more access (B1, B2) come in refresh period + // => B1 is shifted to the end of Refresh and B2 is shifted + // to the end of B1 access. Even if B2 is after refresh, + // B2 is shifted to the end of B1 access. 
+ // C: Beyond tREFI from previous access + // => Count refnum and multiply the time & power + // + ////////////////////////////////////////////////////////////////////// + uint64_t lastRefreshCycle = ranks[rank]->GetLastRefreshCycle(); + uint32_t refreshNum = 0; + if (arrivalCycle >= lastRefreshCycle) { + refreshNum = (arrivalCycle - lastRefreshCycle)/mParam->tREFI; + } + uint32_t totalNum = ranks[rank]->GetRefreshNum() + refreshNum; + ranks[rank]->SetRefreshNum(totalNum); + return refreshNum; +} + +uint64_t MemChannelBase::UpdateLastRefreshCycle(uint32_t rank, uint64_t arrivalCycle, uint32_t refreshNum) { + uint64_t lastRefreshCycle = ranks[rank]->GetLastRefreshCycle(); + if (refreshNum > 0) { + // Updating Activate & Precharge count / rank from each bank + for (uint32_t j = 0; j < mParam->bankCount; j++) { + if (ranks[rank]->GetBankOpen(j) == true) + ranks[rank]->IncPrechargeCount(); + } + lastRefreshCycle += mParam->tREFI * refreshNum; + ranks[rank]->refresh(lastRefreshCycle);// banks are closed + } + return lastRefreshCycle; +} + +void MemChannelBase::UpdateDataBusCycle(uint64_t start, uint64_t end) { + std::pair tmpPair = std::make_pair(start, end); + accessLog.push_back(tmpPair); + sort(accessLog.begin(), accessLog.end()); + if (accessLog.size() > mParam->accessLogDepth) { + accessLog.erase(accessLog.begin()); + accessLog[0].first = 0; + } +} + +uint64_t MemChannelBase::CalcIntraIssueCycle(bool rowHit, uint32_t rank, MemAccessType type, uint64_t arrivalCycle, uint32_t refreshNum) { + uint32_t lastBank = ranks[rank]->GetLastBank(); + uint32_t lastType = ranks[rank]->GetLastType(lastBank); + + // Check last access Cycle is overlapped in ref period (tRPab + tRFC) + uint32_t refCycle = mParam->GetRefreshCycle(); + uint64_t lastAccessCycle = ranks[rank]->GetLastAccessCycle(); + uint64_t lastRefreshCycle = ranks[rank]->GetLastRefreshCycle(); + uint64_t refOverlap = lastAccessCycle - lastRefreshCycle; + if (refreshNum == 0) { + // This is not the first access after refresh. 
+ ranks[rank]->SetAccessInRefresh(0); + } + if ((lastRefreshCycle != 0) && + (refCycle >= refOverlap) && (lastAccessCycle >= lastRefreshCycle) ) { + //2nd access is during refresh + uint32_t accessInRefresh = ranks[rank]->GetAccessInRefresh(); + ranks[rank]->SetAccessInRefresh(accessInRefresh+1); + } else { + ranks[rank]->SetAccessInRefresh(0); + } + + // When Access comes during refresh period + uint32_t accessInRefresh = ranks[rank]->GetAccessInRefresh(); + if (accessInRefresh != 0) { + uint64_t issuableCycle = lastRefreshCycle + refCycle + refOverlap; + issuableCycle = std::max(issuableCycle, arrivalCycle); + if (accessInRefresh >= 2) { //2nd access in refresh + issuableCycle += mParam->GetRdWrDelay(type, lastType); + } + return issuableCycle; + } + + // Get constraint for same Rank and different Rank access + uint64_t intraIssuableCycle = arrivalCycle; + uint64_t lastIssuedCycle = ranks[rank]->GetLastRdWrCycle(lastBank); + if (lastIssuedCycle != 0) { + intraIssuableCycle = lastIssuedCycle; + if (rowHit == true) + intraIssuableCycle += mParam->GetRdWrDelay(type, lastType); + else + intraIssuableCycle += 1;// for command bus conflict + intraIssuableCycle = std::max(intraIssuableCycle, arrivalCycle); + } + return intraIssuableCycle; +} + +uint64_t MemChannelBase::CalcInterIssueCycle(MemAccessType type, uint64_t arrivalCycle) { + // find out the slot + uint32_t tWait = mParam->GetDataLatency(type); + uint32_t tSlot = mParam->GetDataSlot(type) + mParam->tRTRS; + uint64_t tStart = arrivalCycle + tWait; + uint64_t tEnd = tStart + tSlot; + for(uint32_t i = 0; i < accessLog.size(); i++) { + uint64_t busStart = accessLog[i].first; + uint64_t busEnd = accessLog[i].second + mParam->tRTRS; + if (((busStart < tEnd) && (tEnd <= busEnd)) || ((busStart <= tStart) && (tStart < busEnd))) { + tStart = busEnd; + tEnd = tStart + tSlot; + } else if (busStart > tEnd) { + break; + } + } + return tStart - tWait; +} + +uint64_t MemChannelBase::CalcActConst(uint32_t rank, uint32_t bank, uint64_t issuableCycle) { + uint64_t updateCycle = issuableCycle; + if (ranks[rank]->GetLastActCycle(bank) == 0) + return updateCycle; + + // tRC Constraint Check + uint64_t currentBankActCycle = ranks[rank]->GetLastActCycle(bank); + uint64_t tRC_const = currentBankActCycle + mParam->tRC; + updateCycle = std::max(updateCycle, tRC_const); + + // tRP Constraint Check + uint64_t lastPreCycle = ranks[rank]->GetLastPreCycle(bank); + if (lastPreCycle != 0) { + uint64_t tRP_const = lastPreCycle + mParam->tRP; + updateCycle = std::max(updateCycle, tRP_const); + } + + // tRRD Constraint Check + uint64_t latestActCycle = 0; + for(uint32_t i = 0; i < mParam->bankCount; i++) { + uint64_t bankActCycle = ranks[rank]->GetLastActCycle(i); + latestActCycle = std::max(latestActCycle, bankActCycle); + } + uint64_t tRRD_const = latestActCycle + mParam->tRRD; + updateCycle = std::max(updateCycle, tRRD_const); + + // tFAW Constraint Check + uint64_t tFAW_const = ranks[rank]->GetFAWCycle() + mParam->tFAW; + updateCycle = std::max(updateCycle, tFAW_const); + + return updateCycle; +} + +uint64_t MemChannelBase::CalcPreConst(uint32_t rank, uint32_t bank, MemAccessType type, uint64_t issuableCycle) { + uint64_t updateCycle = issuableCycle; + + // read/write to precharge Constraint Check + uint64_t lastRdWrCycle = ranks[rank]->GetLastRdWrCycle(bank); + uint64_t tRW_const = lastRdWrCycle + mParam->GetPreDelay(type); + updateCycle = std::max(updateCycle, tRW_const); + + // tRAS Constraint Check + uint64_t lastActCycle = ranks[rank]->GetLastActCycle(bank); + 
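+ // A bank has to stay open for at least tRAS after its ACT before the precharge
+ // can be issued, hence the lower bound below.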
uint64_t tRAS_const = lastActCycle + mParam->tRAS; + updateCycle = std::max(updateCycle, tRAS_const); + + return updateCycle; +} + +uint64_t MemChannelBase::CalcRdWrConst(uint32_t rank, MemAccessType type, uint64_t issuableCycle) { + uint64_t updateCycle = issuableCycle; + uint32_t lastBank = ranks[rank]->GetLastBank(); + uint32_t lastType = ranks[rank]->GetLastType(lastBank); + + // read/write to read/write Constraint Check + uint64_t lastIssuedCycle = ranks[rank]->GetLastRdWrCycle(lastBank); + uint64_t rdwr_const = lastIssuedCycle + mParam->GetRdWrDelay(type, lastType); + updateCycle = std::max(updateCycle, rdwr_const); + + return updateCycle; +} + +void MemChannelBase::IssueActivate(uint32_t rank, uint32_t bank, uint64_t issuedCycle) { + ranks[rank]->SetFAWCycle(issuedCycle); + ranks[rank]->SetLastActCycle(bank, issuedCycle); + ranks[rank]->SetBankOpen(bank); + ranks[rank]->IncActivateCount(); +} + +void MemChannelBase::IssuePrecharge(uint32_t rank, uint32_t bank, uint64_t issuedCycle, bool continuous) { + ranks[rank]->SetLastPreCycle(bank, issuedCycle); + ranks[rank]->SetBankClose(bank); + if (continuous == false) + ranks[rank]->IncPrechargeCount(); +} + +uint64_t MemChannelBase::LatencySimulate(Address lineAddr, uint64_t arrivalCycle, uint64_t lastPhaseCycle, MemAccessType type) { + uint32_t row, col, rank, bank; + AddressMap(lineAddr, row, col, rank, bank); + + uint32_t refreshNum = UpdateRefreshNum(rank, arrivalCycle); + // Require to call here. Between RefreshNum is updated, but LastRefreshNum is not Updated. + uint32_t pd_penalty = GetPowerDownPenalty(rank, arrivalCycle); + UpdatePowerDownCycle(rank, arrivalCycle, lastPhaseCycle, refreshNum); + UpdateLastRefreshCycle(rank, arrivalCycle, refreshNum); + + // save rowBufferHit at this point + bool rowHit = IsRowBufferHit(row, rank, bank); + + uint64_t preIssueCycle = (uint64_t)-1; + uint64_t actIssueCycle = (uint64_t)-1; + bool continuous = false; + if (mParam->IsOpenRowBufPolicy()) { + // rowbuffer hit -> intra constraint for read write command + if (rowHit == false) { + uint64_t issueCycle = CalcIntraIssueCycle(rowHit, rank, type, + arrivalCycle, refreshNum); + if (ranks[rank]->GetBankOpen(bank) == true) { + MemAccessType lastType = ranks[rank]->GetLastType(bank); + preIssueCycle = CalcPreConst(rank, bank, lastType, issueCycle); + assert(preIssueCycle >= issueCycle); + IssuePrecharge(rank, bank, preIssueCycle); + actIssueCycle = preIssueCycle + mParam->tRP; + } else { + // Issue only Activate after refresh + actIssueCycle = issueCycle; + } + } + } else { // Closed-row Policy + assert(rowHit == false); + continuous = CheckContinuousAccess(arrivalCycle, rank, bank, row); + if (continuous == false) { + actIssueCycle = CalcIntraIssueCycle(rowHit, rank, type, + arrivalCycle, refreshNum); + } + } + if (actIssueCycle != (uint64_t)-1) { + actIssueCycle = CalcActConst(rank, bank, actIssueCycle); + IssueActivate(rank, bank, actIssueCycle); + assert(actIssueCycle >= arrivalCycle); + } + + // Find Read Write command issue slot + uint64_t rdwrStart = arrivalCycle; + if (actIssueCycle == (uint64_t)-1) { + // read/write to read/write constraint check + if (continuous == true) { + rdwrStart = CalcRdWrConst(rank, type, arrivalCycle); + } else { + // open page only + assert(rowHit == true); + assert(mParam->IsOpenRowBufPolicy()); + rdwrStart = CalcIntraIssueCycle(rowHit, rank, type, + arrivalCycle, refreshNum); + } + } else { + rdwrStart = actIssueCycle + mParam->tRCD; + rdwrStart = CalcRdWrConst(rank, type, rdwrStart); + } + assert(rdwrStart >= 
arrivalCycle); + uint64_t rdwrIssueCycle = CalcInterIssueCycle(type, rdwrStart); + assert(rdwrIssueCycle >= arrivalCycle); + uint64_t issueDelay = rdwrIssueCycle - arrivalCycle; + uint32_t dataDelay = mParam->GetDataDelay(type); + + // total delay from the request arrival from CPU + uint64_t latency = issueDelay + dataDelay + pd_penalty; + uint64_t latency_mem = latency + (mParam->tTrans - mParam->tTransCrit); + + // Update Current Read/Write Command Information + uint64_t accessCycle = arrivalCycle + latency_mem; + ranks[rank]->access(accessCycle, rdwrIssueCycle, row, col, bank, type); + + // last, issue precharge in close policy + if (mParam->IsCloseRowBufPolicy()) { + // In close policy Precharge is issued in each access + preIssueCycle = CalcPreConst(rank, bank, type, rdwrIssueCycle); + assert(preIssueCycle >= rdwrIssueCycle); + IssuePrecharge(rank, bank, preIssueCycle, continuous); + } + + // save access cycle for inter constraint + uint64_t busEndCycle = arrivalCycle + latency_mem; + uint64_t busStartCycle = busEndCycle - mParam->GetDataSlot(type); + UpdateDataBusCycle(busStartCycle, busEndCycle); + + return latency; +} + +uint32_t MemChannelBase::GetPowerDownPenalty(uint32_t rank, uint64_t arrivalCycle) { + uint32_t penalty = 0; + if (mParam->powerDownCycle != 0) { + uint64_t lastAccessCycle = ranks[rank]->GetLastAccessCycle(); + uint64_t lastPowerDownCycle = lastAccessCycle + mParam->powerDownCycle; + if (arrivalCycle > lastPowerDownCycle) { // check if arrival cycle needs to be issuedCycle + penalty = mParam->tXP; + } + } + return penalty; +} + +void MemChannelBase::UpdatePowerDownCycle(uint32_t rank, uint64_t arrivalCycle, uint64_t lastPhaseCycle, uint32_t refreshNum) { + uint32_t powerDownCycle = mParam->powerDownCycle; + if (powerDownCycle == 0) + return; + + uint64_t lastAccessCycle = ranks[rank]->GetLastAccessCycle(); + uint64_t lastPowerDownCycle = lastAccessCycle + powerDownCycle; + if (lastPowerDownCycle < lastPhaseCycle) { + lastPowerDownCycle = lastPhaseCycle; + powerDownCycle = 0; + } + + uint32_t bankCount = mParam->bankCount; + uint32_t actbanknum = ranks[rank]->GetActiveBankCount(); + uint32_t idlbanknum = bankCount - actbanknum; + + uint64_t idle_pd_cycle = ranks[rank]->GetIdlePowerDownCycle(); + uint64_t actv_pd_cycle = ranks[rank]->GetActvPowerDownCycle(); + uint64_t idle_sb_cycle = ranks[rank]->GetIdleStandbyCycle(); + + if (lastAccessCycle == 0 && lastPhaseCycle == 0) {// This is a first Access for the rank + idle_pd_cycle += arrivalCycle; + } else if (arrivalCycle <= lastAccessCycle) { + // add to actv_sb_cycle, so nothing to do here. 
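+ // (Any cycles not booked into idle_pd/actv_pd/idle_sb end up counted as active
+ // standby: GetBackGroundEnergy() derives actvStandbyCycle as the remainder.)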
+ } else if (arrivalCycle > lastAccessCycle && arrivalCycle <= lastPowerDownCycle) { + uint64_t diffPowerDownCycle = arrivalCycle - lastAccessCycle; + if (mParam->IsCloseRowBufPolicy()) { + idle_sb_cycle += diffPowerDownCycle; + } else {// Open Page Policy + idle_sb_cycle += idlbanknum * diffPowerDownCycle / bankCount; + } + } else { + uint64_t powerDownDuration = arrivalCycle - lastPowerDownCycle; + if (mParam->IsCloseRowBufPolicy()) { + idle_pd_cycle += powerDownDuration; + actv_pd_cycle += 0; + idle_sb_cycle += powerDownCycle; + + } else {// Open Page Policy + if (refreshNum == 0) { + idle_pd_cycle += idlbanknum * powerDownDuration / bankCount; + actv_pd_cycle += actbanknum * powerDownDuration / bankCount; + idle_sb_cycle += idlbanknum * powerDownCycle / bankCount; + } else { + uint32_t tREFI = mParam->tREFI; + uint64_t lastRefreshCycle = ranks[rank]->GetLastRefreshCycle(); + uint64_t refreshEndCycle1 = (lastRefreshCycle + tREFI); + uint64_t refreshEndCycle2 = (lastRefreshCycle + tREFI * refreshNum); + + assert_msg(arrivalCycle >= refreshEndCycle2, + "arrivalCycle(%ld) must be greater or equal than refreshEndCycle2(%ld)", + arrivalCycle, refreshEndCycle2); + + uint64_t diffArrivalCycle = arrivalCycle - refreshEndCycle2; + uint64_t diffRefreshCycle = 0; + if (refreshEndCycle1 > lastPowerDownCycle) { + diffRefreshCycle = refreshEndCycle1 - lastPowerDownCycle; + } + idle_pd_cycle += ((idlbanknum * diffRefreshCycle) + + (((refreshNum - 1) * tREFI ) + diffArrivalCycle)) / bankCount; + actv_pd_cycle += (actbanknum * diffRefreshCycle) / bankCount; + idle_sb_cycle += idlbanknum * powerDownCycle / bankCount; + } + } + } + assert_msg(arrivalCycle >= (idle_pd_cycle + actv_pd_cycle + idle_sb_cycle), + "PowerDown calc Error. arrival=%ld, idle_pd=%ld, actv_pd=%ld, idle_sb=%ld", + arrivalCycle, idle_pd_cycle, actv_pd_cycle, idle_sb_cycle); + ranks[rank]->SetIdlePowerDownCycle(idle_pd_cycle); + ranks[rank]->SetActvPowerDownCycle(actv_pd_cycle); + ranks[rank]->SetIdleStandbyCycle(idle_sb_cycle); +} + +void MemChannelBase::PeriodicUpdatePower(uint64_t phaseCycle, uint64_t lastPhaseCycle) { + for(uint32_t i = 0; i < mParam->rankCount; i++) { + if (ranks[i]->GetLastAccessCycle() < phaseCycle) { + uint32_t refreshNum = UpdateRefreshNum(i, phaseCycle); + UpdatePowerDownCycle(i, phaseCycle, lastPhaseCycle, refreshNum); + UpdateLastRefreshCycle(i, phaseCycle, refreshNum); + } + } +} + +bool MemChannelBase::CheckContinuousAccess(uint64_t arrivalCycle, uint32_t rank, uint32_t bank, uint32_t row) { + ////////////////////////////////////////////////////////////////////// + // Continuous Case in Close Policy /////////////////////////////////// + ////////////////////////////////////////////////////////////////////// + // # If next access comes before PRE, MEMC will not issue PRE and deal + // # it as continuous (Limited Open Policy = w/o Precharge) access + // + // 1.last access is Write + // + // ACT WRT PRE + // last --|-----|-------------------|-- + // tRCD tCWD tTrans tWR | + // <---> <--> ******** <--->| + // < - - - - >| continuousLatency + // WRT <---->PRE + // current ------*-------|-------------------|-- + // (write) | tCWD tTrans tWR + // | <--> ******** <---> + // | + // arrivalCycle + // continuousLatency + // W->R const <--------> + // - - - - >RD PRE + // current ------*----------|-----------|--- + // (read) | tCAS tTrans + // | <--> ******** + // | + // arrivalCycle + // + // 2.last access is Read + // + // ACT RD<-------->PRE + // last --|-----|----------|---------- + // tRCD tCAS tTrans + // <---> 
<--> ******** + // + // R->W continuousLatency + // - - - >WRT <-------> PRE + // current ------*-------|-------------------|-- + // (write) | tCWD tTrans tWR + // | <---> ******** <---> + // | + // arrivalCycle + // + // RD PRE + // current ------*-------|----------|-- + // (read) | tCAS tTrans + // | <--> ******** + // | <------> + // arrivalCycle continuousLatency + ////////////////////////////////////////////////////////////////////// + if (mParam->mergeContinuous == false) + return false; + + uint64_t lastPreCycle = ranks[rank]->GetLastPreCycle(bank); + if ((arrivalCycle < lastPreCycle) && + (ranks[rank]->GetLastRow(bank) == row)) { // w/o ACT + return true; + } else { + return false; + } +} + +// See also MemControllerBase::ReturnChannel +void MemChannelBase::AddressMap(Address addr, uint32_t& row, uint32_t& col, uint32_t& rank, uint32_t& bank) { + // FIXME (dsm): This is needlessly complex. See how addressing is done in DDRMemory (along with sizing) + // + // Address is cache line address. it has already shifted for containg process id. + // interleaveType == 0: | Row | ColH | Bank | Rank | Chnl | ColL | DataBus | + // interleaveType == 1: | Row | ColH | Rank | Bank | Chnl | ColL | DataBus | + // interleaveType == 2: | Row | Bank | ColH | Rank | Chnl | ColL | DataBus | + // interleaveType == 3: | Row | Rank | ColH | Bank | Chnl | ColL | DataBus | + // interleaveType == 4: | Row | Bank | Rank | ColH | Chnl | ColL | DataBus | + // interleaveType == 5: | Row | Rank | Bank | ColH | Chnl | ColL | DataBus | + // interleaveType == 6: | Row | Rank | Bank | Chnl | Column | DataBus | + // interleaveType == 7: | Row | Rank | Chnl | Bank | Column | DataBus | + // interleaveType == 8: | Row | Chnl | Rank | Bank | Column | DataBus | + + uint32_t colLowWidth = 0; + uint32_t colLow = 0; + if (mParam->channelDataWidthLog < mParam->byteOffsetWidth) { + colLowWidth = mParam->byteOffsetWidth - mParam->channelDataWidthLog; + colLow = addr & ((1L << colLowWidth) - 1); + addr >>= colLowWidth; + } + + uint32_t chnl = (uint32_t)-1; + if (mParam->interleaveType >= 0 && mParam->interleaveType <= 5) { + // for non-power of 2 channels + chnl = addr % mParam->channelCount; + addr /= mParam->channelCount; + } + + uint32_t colHighWidth = mParam->colAddrWidth - colLowWidth; + uint32_t colHigh = 0; + if (mParam->interleaveType >= 4) { + colHigh = addr & ((1L << colHighWidth) - 1); + col = (colHigh << colLowWidth) | colLow; + addr >>= colHighWidth; + } + + switch(mParam->interleaveType) { + case 0: + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + colHigh = addr & ((1L << colHighWidth) - 1); + addr >>= colHighWidth; + col = (colHigh << colLowWidth) | colLow; + break; + case 1: + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + colHigh = addr & ((1L << colHighWidth) - 1); + addr >>= colHighWidth; + col = (colHigh << colLowWidth) | colLow; + break; + case 2: + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + colHigh = addr & ((1L << colHighWidth) - 1); + addr >>= colHighWidth; + col = (colHigh << colLowWidth) | colLow; + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + break; + case 3: + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + colHigh = addr & ((1L << colHighWidth) - 1); + addr >>= colHighWidth; + col = 
(colHigh << colLowWidth) | colLow; + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + break; + case 4: + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + break; + case 5: + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + break; + case 6: + chnl = addr % mParam->channelCount; + addr /= mParam->channelCount; + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + break; + case 7: + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + chnl = addr % mParam->channelCount; + addr /= mParam->channelCount; + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + break; + case 8: + bank = addr & ((1L << mParam->bankWidth) - 1); + addr >>= mParam->bankWidth; + rank = addr & ((1L << mParam->rankWidth) - 1); + addr >>= mParam->rankWidth; + chnl = addr % mParam->channelCount; + addr /= mParam->channelCount; + break; + } + + assert(myId == chnl); + + row = addr; + //row != addr & ((1L<rowAddrWidth)-1); + // row address may contains large number, even if it exceed memory capacity size. + // Becase memory model receives PID + VA as a access address. + // But it's OK. row address is only used for checking row buffer hit, + // and no address translation is almost same as ideotically address translation. +} + +uint64_t MemChannelBase::GetActivateCount(void) { + uint64_t actCount = 0; + for (uint32_t i = 0; i < mParam->rankCount; i++) { + actCount += ranks[i]->GetActivateCount(); + } + return actCount; +} + +uint64_t MemChannelBase::GetPrechargeCount(void) { + uint64_t preCount = 0; + for (uint32_t i = 0; i < mParam->rankCount; i++) { + preCount += ranks[i]->GetPrechargeCount(); + } + return preCount; +} + +uint64_t MemChannelBase::GetRefreshCount(void) { + uint64_t refnum = 0; + for(uint32_t i = 0; i < mParam->rankCount; i++) { + refnum += ranks[i]->GetRefreshNum(); + } + return refnum; +} + +uint64_t MemChannelBase::GetBurstEnergy(void) { + uint64_t writeBurstCount = 0; + uint64_t readBurstCount = 0; + for(uint32_t i = 0; i < mParam->rankCount; i++) { + writeBurstCount += ranks[i]->GetWriteBurstCount(); + readBurstCount += ranks[i]->GetReadBurstCount(); + } + + uint64_t burstPower = 0; + uint64_t burstPower1; + assert_msg((mParam->IDD_VDD1.IDD4W >= mParam->IDD_VDD1.IDD3N), "IDD4W must be larger or equal than IDD3N"); + assert_msg((mParam->IDD_VDD1.IDD4R >= mParam->IDD_VDD1.IDD3N), "IDD4R must be larger or equal than IDD3N"); + burstPower1 = writeBurstCount * (mParam->IDD_VDD1.IDD4W - mParam->IDD_VDD1.IDD3N) * mParam->tTrans; + burstPower1 += readBurstCount * (mParam->IDD_VDD1.IDD4R - mParam->IDD_VDD1.IDD3N) * mParam->tTrans; + burstPower += burstPower1 * mParam->VDD1; + burstPower *= mParam->chipCountPerRank; + burstPower /= 1000; // uW -> mW + return burstPower; +} + +uint64_t MemChannelBase::GetActPreEnergy(void) { + uint64_t actPreCount = GetActivateCount(); + uint64_t actPrePower = 0; + uint64_t actPrePower1; + assert_msg((mParam->tRC >= mParam->tRAS), "tRC must be larger or equal than tRAS"); + actPrePower1 = actPreCount * ( (mParam->IDD_VDD1.IDD0 * mParam->tRC) + - ((mParam->IDD_VDD1.IDD3N * mParam->tRAS) + + (mParam->IDD_VDD1.IDD2N * (mParam->tRC - mParam->tRAS)))); + actPrePower += actPrePower1 * 
mParam->VDD1; + actPrePower *= mParam->chipCountPerRank; + actPrePower /= 1000; // uW -> mW + return actPrePower; +} + +uint64_t MemChannelBase::GetRefreshEnergy(void) { + uint64_t refnum = GetRefreshCount(); + uint64_t refreshPower = 0; + uint64_t refreshPower1; + assert_msg((mParam->IDD_VDD1.IDD5 >= mParam->IDD_VDD1.IDD3N), "IDD5 must be larger or equal than IDD3N"); + refreshPower1 = refnum * (mParam->IDD_VDD1.IDD5 - mParam->IDD_VDD1.IDD3N) * mParam->tRFC; + refreshPower += refreshPower1 * mParam->VDD1; + + refreshPower *= mParam->chipCountPerRank; + refreshPower /= 1000; // uW -> mW + return refreshPower; +} + +uint64_t MemChannelBase::GetBackGroundEnergy(uint64_t memCycle, uint64_t lastMemCycle, bool bInstant) { + assert(lastMemCycle < memCycle); + uint64_t tickCycle = bInstant ? (memCycle - lastMemCycle) : memCycle; + + uint64_t backgroundPower = 0; + for(uint32_t i = 0; i < mParam->rankCount; i++) { + uint64_t lastAccessCycle = ranks[i]->GetLastAccessCycle(); + uint64_t idlePowerDownCycle; + uint64_t actvPowerDownCycle; + uint64_t idleStandbyCycle; + if (mParam->powerDownCycle == 0) { + idlePowerDownCycle = 0; + actvPowerDownCycle = 0; + idleStandbyCycle= 0; + } else if (bInstant == false) { + idlePowerDownCycle = ranks[i]->GetIdlePowerDownCycle(); + actvPowerDownCycle = ranks[i]->GetActvPowerDownCycle(); + idleStandbyCycle = ranks[i]->GetIdleStandbyCycle(); + } else { + if (lastAccessCycle < lastMemCycle) {// No Access + idlePowerDownCycle = tickCycle; + actvPowerDownCycle = 0; + idleStandbyCycle = 0; + } else { + idlePowerDownCycle = ranks[i]->GetIdlePowerDownCycle(); + idlePowerDownCycle -= ranks[i]->GetPrevIdlePowerDownCycle(); + actvPowerDownCycle = ranks[i]->GetActvPowerDownCycle(); + actvPowerDownCycle -= ranks[i]->GetPrevActvPowerDownCycle(); + idleStandbyCycle = ranks[i]->GetIdleStandbyCycle(); + idleStandbyCycle -= ranks[i]->GetPrevIdleStandbyCycle(); + } + ranks[i]->SaveBackgroundCycles(); + } + uint64_t actvStandbyCycle = tickCycle - idlePowerDownCycle - actvPowerDownCycle - idleStandbyCycle; + assert_msg(tickCycle >= (idlePowerDownCycle + actvPowerDownCycle + idleStandbyCycle), + "Power down cycle calculation error. 
bInstant = %d, memCycle=%ld, idlePowerDown=%ld, actvPowerDown=%ld, idleStandby=%ld", + bInstant, tickCycle, idlePowerDownCycle, actvPowerDownCycle, idleStandbyCycle); + uint64_t idlePowerDown = mParam->VDD1 * (idlePowerDownCycle * mParam->IDD_VDD1.IDD2P) / tickCycle; + uint64_t actPowerDown = mParam->VDD1 * (actvPowerDownCycle * mParam->IDD_VDD1.IDD3P) / tickCycle; + uint64_t idleStandby = mParam->VDD1 * (idleStandbyCycle * mParam->IDD_VDD1.IDD2N) / tickCycle; + uint64_t actvStandby = mParam->VDD1 * (actvStandbyCycle * mParam->IDD_VDD1.IDD3N) / tickCycle; + backgroundPower += (idlePowerDown + actPowerDown + idleStandby + actvStandby); + } + backgroundPower *= mParam->chipCountPerRank; + backgroundPower /= 1000;// uW -> mW + return backgroundPower; +} + + +//////////////////////////////////////////////////////////////////////// +// Default Memory Scheduler Class +MemSchedulerDefault::MemSchedulerDefault(uint32_t id, MemParam* mParam, MemChannelBase* mChnl) + : MemSchedulerBase(id, mParam, mChnl) +{ + prioritizedAccessType = READ; + wrQueueSize = mParam->schedulerQueueCount; + wrQueueHighWatermark = mParam->schedulerQueueCount * 2 / 3; + wrQueueLowWatermark = mParam->schedulerQueueCount * 1 / 3; +} + +MemSchedulerDefault::~MemSchedulerDefault() {} + +bool MemSchedulerDefault::CheckSetEvent(MemAccessEventBase* ev) { + // Write Queue Hit Check + g_vector::iterator it; + for(it = wrQueue.begin(); it != wrQueue.end(); it++) { + if (it->second == ev->getAddr()) { + if (ev->getType() == WRITE) { + wrQueue.erase(it); + wrQueue.push_back(MemSchedQueueElem(NULL, ev->getAddr())); + } + return true; + } + } + + // Write Done Queue Hit Check + for(it = wrDoneQueue.begin(); it != wrDoneQueue.end(); it++) { + if (it->second == ev->getAddr()) { + if (ev->getType() == READ) { + // Update LRU + wrDoneQueue.erase(it); + wrDoneQueue.push_back(MemSchedQueueElem(NULL, ev->getAddr())); + } else { // Write + // Update for New Data + wrDoneQueue.erase(it); + wrQueue.push_back(MemSchedQueueElem(NULL, ev->getAddr())); + } + return true; + } + } + + // No Hit + if (ev->getType() == READ) { + rdQueue.push_back(MemSchedQueueElem(ev, ev->getAddr())); + } else { // Write + wrQueue.push_back(MemSchedQueueElem(NULL, ev->getAddr())); + if (wrQueue.size() + wrDoneQueue.size() == wrQueueSize) { + // Overflow case + if (wrDoneQueue.empty() == false) { + wrDoneQueue.erase(wrDoneQueue.begin()); + } else { + // FIXME: Need to handle this - HK + warn("Write Buffer Overflow!!"); + } + } + } + return false; +} + +bool MemSchedulerDefault::GetEvent(MemAccessEventBase*& ev, Address& addr, MemAccessType& type) { + bool bRet = false; + + // Check Priority + if (wrQueue.size() >= wrQueueHighWatermark) + prioritizedAccessType = WRITE; // Write Priority + else if (wrQueue.size() <= wrQueueLowWatermark) + prioritizedAccessType = READ; // Read Priority + + //info("Id%d: Read Queue = %ld, Write Queue = %ld, Schedule = %d", + //myId, rdQueue.size(), wrQueue.size(), prioritizedAccessType); + + uint32_t idx; + g_vector::iterator it; + if (prioritizedAccessType == READ) { + bRet = FindBestRequest(&rdQueue, idx); + if (bRet) { + it = rdQueue.begin() + idx; + ev = it->first; + addr = ev->getAddr(); + type = ev->getType(); + rdQueue.erase(it); + } + } + + if (!bRet) { // Write Priority or No Read Entry + bRet = FindBestRequest(&wrQueue, idx); + if (bRet) { + it = wrQueue.begin() + idx; + ev = NULL; + addr = it->second; + type = WRITE; + wrQueue.erase(it); + wrDoneQueue.push_back(MemSchedQueueElem(NULL, addr)); + } + } + + return bRet; +} + +bool 
MemSchedulerDefault::FindBestRequest(g_vector *queue, uint32_t& idx) { + idx = 0; + uint32_t tmpIdx = 0; + g_vector::iterator it; + for(it = queue->begin(); it!= queue->end(); it++) { + Address addr = it->second; + uint32_t row, col, rank, bank; + mChnl->AddressMap(addr, row, col, rank, bank); + if (mChnl->IsRowBufferHit(row, rank, bank) == true) { + idx = tmpIdx; + break; + } + tmpIdx++; + } + + return !queue->empty(); +} + + +// Main Memory Class +MemControllerBase::MemControllerBase(g_string _memCfg, uint32_t _cacheLineSize, uint32_t _sysFreqMHz, uint32_t _domain, g_string& _name) { + name = _name; + domain = _domain; + info("%s: domain %d", name.c_str(), domain); + + lastPhaseCycle = 0; + lastAccessedCycle = 0; + cacheLineSize = _cacheLineSize; + + futex_init(&updateLock); + + mParam = new MemParam(); + mParam->LoadConfig(_memCfg, _cacheLineSize); + + // Calculate Frequency + sysFreqKHz = _sysFreqMHz * 1000; + memFreqKHz = 1e9 / mParam->tCK / 1e3; + info("MemControllerBase: sysFreq = %ld KHz memFreq = %ld KHz", sysFreqKHz, memFreqKHz); + + if (mParam->schedulerQueueCount != 0) { + //Processor tick, memory ticks only every Nth cycle where N is proc:mem freq ratio + // for Memory Scheduler + nextSysTick = std::max((uint64_t)1, memToSysCycle(1)); + } else { + // for periodic performance report + // for avoiding tick scheduler limitation + nextSysTick = usecToSysCycle(10);// once every 10us + } + reportPeriodCycle = usecToSysCycle(mParam->reportPhase); + + // setup controller parameters + memMinLatency[0] = memToSysCycle(mParam->GetDataLatency(0));// Read + memMinLatency[1] = memToSysCycle(mParam->GetDataLatency(1));// Write + if (mParam->schedulerQueueCount == 0) { + minLatency[0] = mParam->GetDataLatency(0);// Read + minLatency[1] = mParam->GetDataLatency(1);// Write + } else { + minLatency[0] = 1;// scheduler queue hit case + minLatency[1] = 1;// scheduler queue hit case + } + minLatency[0] = memToSysCycle(minLatency[0]) + mParam->controllerLatency; + minLatency[1] = memToSysCycle(minLatency[1]) + mParam->controllerLatency; + preDelay[0] = minLatency[0] / 2; + preDelay[1] = minLatency[1] / 2; + postDelay[0] = minLatency[0] - preDelay[0]; + postDelay[1] = minLatency[1] - preDelay[1]; + info("Latency: read minLatency is %d, write minLatency is %d", minLatency[0], minLatency[1]); + + memset(&lastPower, 0, sizeof(powerValue)); + lastAccesses = 0; + maxBandwidth = 0; + minBandwidth = (uint64_t)-1; + + chnls.resize(mParam->channelCount); + sches.resize(mParam->channelCount); + for(uint32_t i = 0; i < mParam->channelCount; i++) { + chnls[i] = new MemChannelBase (i, mParam); + sches[i] = new MemSchedulerDefault(i, mParam, chnls[i]); + } + + if (mParam->schedulerQueueCount != 0) { + TickEvent* tickEv = new TickEvent(this, domain); + tickEv->queue(0); //start the sim at time 0 + info("MemControllerBase::tick() will be call in each %ld sysCycle", nextSysTick); + } + + addrTraceLog = NULL; + if (mParam->addrTrace == true) { + g_string gzFileName = g_string("ZsimMemAddrTrace_") + name.c_str() + ".gz"; + addrTraceLog = gzopen(gzFileName.c_str(), "wb1"); + if (addrTraceLog == NULL) + panic("Fail to open file %s for addrTraceLog.", gzFileName.c_str()); + } +} + +MemControllerBase::~MemControllerBase() { + if (mParam != NULL) { + for(uint32_t i = 0; i < mParam->channelCount; i++) { + delete chnls[i]; + delete sches[i]; + } + delete mParam; + } +} + +void MemControllerBase::enqueue(MemAccessEventBase* ev, uint64_t cycle) { + if (mParam->schedulerQueueCount == 0) { + MemAccessType type = ev->getType(); + 
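+ // No scheduler queue: model the DRAM access inline. The cycle handed to the
+ // detailed timing model backs out preDelay[type] and adds the controller
+ // latency; the FIXME below notes it should arguably snap to the next memory
+ // clock edge.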
uint64_t startCycle = cycle - preDelay[type] + mParam->controllerLatency; + // FIXME: Shouldn't we use the next memCycle following startCycle as the + // starting cycle of the dram request? + uint64_t latency = LatencySimulate(ev->getAddr(), startCycle, type); + ev->done(cycle + latency - minLatency[type] + mParam->controllerLatency); + return; + } + + // Write Queue Hit Check + uint32_t channel = ReturnChannel(ev->getAddr()); + bool bRet = sches[channel]->CheckSetEvent(ev); + if (ev->getType() == READ) { + if (bRet) + ev->done(cycle - minLatency[0] + mParam->controllerLatency); + else + ev->hold(); + } else { // Write + // Write must be enqueued. + ev->done(cycle - minLatency[1] + mParam->controllerLatency); + } + + return; +} + +uint32_t MemControllerBase::tick(uint64_t sysCycle) { + // tick will be called each memCycle + // for memory scheduler + if (mParam->schedulerQueueCount != 0) { + TickScheduler(sysCycle); + } + + return nextSysTick; +} + +void MemControllerBase::TickScheduler(uint64_t sysCycle) { + for(uint32_t i = 0; i < mParam->channelCount; i++) { + MemAccessEventBase* ev = NULL; + Address addr = 0; + MemAccessType type = READ; + bool bRet = sches[i]->GetEvent(ev, addr, type); + if (bRet) { + uint64_t latency = LatencySimulate(addr, sysCycle, type); + if (type == READ) { + // Write has already ev->done + ev->release(); + ev->done(sysCycle - minLatency[0] + latency); + } + } + } +} + +uint64_t MemControllerBase::access(MemReq& req) { + switch (req.type) { + case PUTS: + case PUTX: + *req.state = I; + break; + case GETS: + *req.state = E; + break; + case GETX: + *req.state = M; + break; + + default: panic("!?"); + } + + if (req.type == PUTS) + return req.cycle; + + MemAccessType accessType = (req.type == PUTS || req.type == PUTX) ? WRITE : READ; + uint64_t respCycle = req.cycle + minLatency[accessType]; + assert(respCycle >= req.cycle); + + if ((req.type != PUTS) && zinfo->eventRecorders[req.srcId]) { + Address addr = req.lineAddr; + MemAccessEventBase* memEv = + new (zinfo->eventRecorders[req.srcId]) + MemAccessEventBase(this, accessType, addr, domain, preDelay[accessType], postDelay[accessType]); + memEv->setMinStartCycle(req.cycle); + TimingRecord tr = {addr, req.cycle, respCycle, req.type, memEv, memEv}; + zinfo->eventRecorders[req.srcId]->pushRecord(tr); + } + return respCycle; +} + +void MemControllerBase::initStats(AggregateStat* parentStat) { + AggregateStat* memStats = new AggregateStat(); + memStats->init(name.c_str(), "Memory controller stats"); + + profActivate.init("act", "Activate command Times"); + memStats->append(&profActivate); + profReads.init("rd", "Read request command Times"); + memStats->append(&profReads); + profWrites.init("wr", "Write request command Times"); + memStats->append(&profWrites); + profPrecharge.init("pre", "Precharge command Times"); + memStats->append(&profPrecharge); + profRefresh.init("ref", "Refresh command Times"); + memStats->append(&profRefresh); + + if (mParam->accAvgPowerReport == true) { + AggregateStat* apStats = new AggregateStat(); + apStats->init("ap", "Cumulative Average Power Report"); + profAccAvgPower[0].init("total", "Total average power (mW)"); + profAccAvgPower[1].init("actpre", "ActPre average power (mW)"); + profAccAvgPower[2].init("burst", "Burst average power (mW)"); + profAccAvgPower[3].init("refr", "Refersh average power (mW)"); + profAccAvgPower[4].init("bgnd", "Background average power (mW)"); + profAccAvgPower[5].init("dq", "DQ average power (mW)"); + profAccAvgPower[6].init("term", "Terminate average power 
(mW)"); + for(uint32_t i = 0; i < pwCounterNum; i++) + apStats->append(&profAccAvgPower[i]); + memStats->append(apStats); + } + + if (mParam->curAvgPowerReport == true) { + AggregateStat* cpStats = new AggregateStat(); + cpStats->init("cp", "Current Average Power Report"); + profCurAvgPower[0].init("total", "Total instant power (mW)"); + profCurAvgPower[1].init("actpre", "ActPre instant power (mW)"); + profCurAvgPower[2].init("burst", "Burst instant power (mW)"); + profCurAvgPower[3].init("refr", "Refersh instant power (mW)"); + profCurAvgPower[4].init("bgnd", "Background instant power (mW)"); + profCurAvgPower[5].init("dq", "DQ instant power (mW)"); + profCurAvgPower[6].init("term", "Terminate instant power (mW)"); + for(uint32_t i = 0; i < pwCounterNum; i++) + cpStats->append(&profCurAvgPower[i]); + memStats->append(cpStats); + } + + if (mParam->bandwidthReport == true) { + AggregateStat* bwStats = new AggregateStat(); + bwStats->init("bw", "Bandwidth Report"); + profBandwidth[0].init("all", "Cumulative Average bandwidth (MB/s)"); + profBandwidth[1].init("cur", "Current Average bandwidth (MB/s)"); + profBandwidth[2].init("max", "Maximum bandwidth (MB/s)"); + profBandwidth[3].init("min", "Minimum bandwidth (MB/s)"); + for(uint32_t i = 0; i < bwCounterNum; i++) + bwStats->append(&profBandwidth[i]); + memStats->append(bwStats); + } + + profTotalRdLat.init("rdlat", "Total latency experienced by read requests"); + memStats->append(&profTotalRdLat); + profTotalWrLat.init("wrlat", "Total latency experienced by write requests"); + memStats->append(&profTotalWrLat); + + lhBinSize = 10; + lhNumBins = 200; + latencyHist.init("mlh","latency histogram for memory requests", lhNumBins); + memStats->append(&latencyHist); + + parentStat->append(memStats); +} + +void MemControllerBase::updateStats(void) { + uint64_t sysCycle = zinfo->globPhaseCycles; + uint64_t realTime = sysToMicroSec(sysCycle); + uint64_t lastRealTime = sysToMicroSec(lastPhaseCycle); + if (mParam->accAvgPowerReport == true || mParam->curAvgPowerReport == true) + EstimatePowers(sysCycle); + if (mParam->bandwidthReport == true) + EstimateBandwidth(realTime, lastRealTime); + UpdateCmdCounters(); + lastPhaseCycle = sysCycle; +} + +void MemControllerBase::finish(void) { + // This function will be called at the last process termination. + uint64_t minCycle = usecToSysCycle(1); + uint64_t endCycle = std::max(zinfo->globPhaseCycles, minCycle); + uint64_t realTime = sysToMicroSec(endCycle); + uint64_t lastRealTime = sysToMicroSec(lastPhaseCycle); + + if (mParam->anyReport == true) + info("=== %s: Final Performance Report @ %ld usec (duration is %ld usec) ===", + name.c_str(), realTime, realTime - lastRealTime); + EstimatePowers(endCycle, true); + EstimateBandwidth(realTime, lastRealTime, true); + UpdateCmdCounters(); + + if (addrTraceLog != NULL) + gzclose(addrTraceLog); +} + +// See also MemChannelBase::AddressMap +uint64_t MemControllerBase::ReturnChannel(Address addr) { + + // addr is cache line address. it has already shifted for containg process id. 
+ + uint32_t colLowWidth = 0; + if (mParam->channelDataWidthLog < mParam->byteOffsetWidth) { + colLowWidth = mParam->byteOffsetWidth - mParam->channelDataWidthLog; + addr >>= colLowWidth; + } + + uint64_t result = addr; + + //for non-power of 2 channels, simply shift and get modulo + switch (mParam->interleaveType) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + // Cache block interleave + result %= mParam->channelCount; + break; + case 6: + result >>= (mParam->colAddrWidth - colLowWidth); + result %= mParam->channelCount; + break; + case 7: + result >>= (mParam->colAddrWidth - colLowWidth); + result >>= mParam->bankWidth; + result %= mParam->channelCount; + break; + case 8: + result >>= (mParam->colAddrWidth - colLowWidth); + result >>= mParam->bankWidth; + result >>= mParam->rankWidth; + result %= mParam->channelCount; + break; + default: + panic("Invalid interleaveType!"); + } + return result; +} + +uint64_t MemControllerBase::LatencySimulate(Address lineAddr, uint64_t sysCycle, MemAccessType type) { + uint32_t channel = ReturnChannel(lineAddr); + uint64_t memCycle = sysToMemCycle(sysCycle); + uint64_t lastMemCycle = sysToMemCycle(lastPhaseCycle); + uint64_t memLatency = chnls[channel]->LatencySimulate(lineAddr, memCycle, lastMemCycle, type); + uint64_t sysLatency = memToSysCycle(memLatency); + assert_msg(sysLatency >= (memMinLatency[type]), + "Memory Model returned lower latency than memMinLatency! latency = %ld, memMinLatency = %d", + sysLatency, memMinLatency[type]); + uint32_t bin = std::min(sysLatency/lhBinSize, (uint64_t)(lhNumBins-1)); + latencyHist.inc(bin); + + if (addrTraceLog != NULL) + gzwrite(addrTraceLog, (char*)&lineAddr, sizeof(uint64_t)); + + if (type == WRITE) { + profWrites.atomicInc(); + profTotalWrLat.atomicInc(sysLatency); + } else { // READ + profReads.atomicInc(); + profTotalRdLat.atomicInc(sysLatency); + } + + lastAccessedCycle = sysCycle; + + return sysLatency; +} + +void MemControllerBase::UpdateCmdCounters(void) { + uint64_t activateCnt = 0; + uint64_t prechargeCnt = 0; + uint64_t refreshCnt = 0; + for(uint32_t i = 0; i < mParam->channelCount; i++) { + activateCnt += chnls[i]->GetActivateCount(); + prechargeCnt += chnls[i]->GetPrechargeCount(); + refreshCnt += chnls[i]->GetRefreshCount(); + } + profActivate.set(activateCnt); + profPrecharge.set(prechargeCnt); + profRefresh.set(refreshCnt); +} + +void MemControllerBase::EstimatePowers(uint64_t sysCycle, bool finish) { + uint64_t memCycle = sysToMemCycle(sysCycle); + uint64_t lastMemCycle = sysToMemCycle(lastPhaseCycle); + uint64_t instCycle = memCycle - lastMemCycle; + assert(memCycle > lastMemCycle); + + // 1/10V * 1/100mA = uW / 1000 = mW + // dq & terminate : uW + powerValue accPower; + memset(&accPower, 0, sizeof(powerValue)); + powerValue curPower; + memset(&curPower, 0, sizeof(powerValue)); + for(uint32_t i = 0; i < mParam->channelCount; i++) { + chnls[i]->PeriodicUpdatePower(memCycle, lastMemCycle); + + accPower.actPre += chnls[i]->GetActPreEnergy(); + accPower.burst += chnls[i]->GetBurstEnergy(); + accPower.refresh += chnls[i]->GetRefreshEnergy(); + accPower.background += chnls[i]->GetBackGroundEnergy(memCycle, lastMemCycle, false); + if (mParam->curAvgPowerReport == true) + curPower.background += chnls[i]->GetBackGroundEnergy(memCycle, lastMemCycle, true); + } + + uint64_t avgRdActivity = profReads.count() * mParam->tTrans; + uint64_t avgWrActivity = profWrites.count() * mParam->tTrans; + // readDq, writeDq: uW, DQ power in current accessed rank, calculate from Whole Chip full usage 
power + accPower.dq = ((avgRdActivity * mParam->readDqPin) + (avgWrActivity * mParam->writeDqPin)) * mParam->chipCountPerRank; + // readTerm, writeTerm: uW, terminate power in the other ranks, calculate from Whole Chip full usage power + accPower.terminate = ((avgRdActivity * mParam->readTermPin) + (avgWrActivity * mParam->writeTermPin)) * mParam->chipCountPerRank; + accPower.terminate *= (mParam->rankCount - 1); + + if (mParam->curAvgPowerReport == true) { + // compute instant power + // Regarding memory which has VDDQ domain like LPDDRx, VDDQ power is added to dq power. + curPower.actPre = (accPower.actPre - lastPower.actPre) / instCycle; + curPower.burst = (accPower.burst - lastPower.burst) / instCycle; + curPower.refresh = (accPower.refresh - lastPower.refresh) / instCycle; + curPower.dq = CalcDQTermCur(accPower.dq, lastPower.dq, instCycle, memCycle, lastMemCycle); + curPower.terminate = (accPower.terminate - lastPower.terminate) / instCycle / 1000; + curPower.total = curPower.burst + curPower.actPre + curPower.refresh + curPower.background + curPower.dq + curPower.terminate; + + // assertion + assert_msg((accPower.burst >= lastPower.burst), "Burst power calculation problem."); + assert_msg((accPower.actPre >= lastPower.actPre), "ActPre power calculation problem."); + assert_msg((accPower.refresh >= lastPower.refresh), "Refresh power calculation problem."); + assert_msg((accPower.dq >= lastPower.dq), "DQ power calculation problem."); + assert_msg((accPower.terminate >= lastPower.terminate), "Terminate power calculation problem."); + + // profile update + for(uint32_t i =0; i < pwCounterNum; i++) + profCurAvgPower[i].set(*(&curPower.total + i)); + + // backup for next compute + lastPower = accPower; + } + + if (mParam->accAvgPowerReport == true) { + // compute average power + // Regarding memory which has VDDQ domain like LPDDRx, VDDQ power is added to dq power. 
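+        // Added worked example (assumed numbers, not from any real config): the
+        // accumulators above hold energy in power x memCycles units, so dividing by
+        // memCycle (total memory cycles so far) yields the cumulative average power.
+        // E.g. an actPre accumulator of 5,000,000 (mW x cycles) after memCycle =
+        // 1,000,000 gives a 5 mW average. dq and terminate are accumulated in uW,
+        // hence the additional /1000 applied to them below.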
+ accPower.actPre = accPower.actPre / memCycle; + accPower.burst = accPower.burst / memCycle; + accPower.refresh = accPower.refresh / memCycle; + //accPower.background =accPower.background; + accPower.dq = CalcDQTermAcc(accPower.dq, memCycle, lastMemCycle); + accPower.terminate = accPower.terminate / memCycle / 1000; + accPower.total = accPower.burst + accPower.actPre + accPower.refresh + accPower.background + accPower.dq + accPower.terminate; + // profile update + for(uint32_t i =0; i < pwCounterNum; i++) + profAccAvgPower[i].set(*(&accPower.total + i)); + } + + if (mParam->accAvgPowerReport == true && finish == true) { + info("Cummulative Average Power(mW): Total= %6ld, ActPre= %6ld, Burst= %6ld, Refresh= %6ld, BackGrnd= %6ld, ModuleDQ= %6ld, Terminate= %6ld", + accPower.total, accPower.actPre, accPower.burst, accPower.refresh, + accPower.background, accPower.dq, accPower.terminate); + } + //info("Current Average Power(mW): Total= %6ld, ActPre= %6ld, Burst= %6ld, Refresh= %6ld, BackGrnd= %6ld, ModuleDQ= %6ld, Terminate= %6ld", + //curPower.total, curPower.actPre, curPower.burst, curPower.refresh, + //curPower.background, curPower.dq, curPower.terminate); +} + +uint64_t MemControllerBase::CalcDQTermCur(uint64_t acc_dq, uint64_t last_dq, uint64_t instCycle, uint64_t memCycle, uint64_t lastMemCycle) { + // memCycle and lastMemCycle are used in LPDDRx mode + return (acc_dq - last_dq) / instCycle / 1000; +}; + +uint64_t MemControllerBase::CalcDQTermAcc(uint64_t acc_dq, uint64_t memCycle, uint64_t lastMemCycle) { + // memCycle and lastMemCycle are used in LPDDRx mode + return acc_dq / memCycle / 1000; +}; + +void MemControllerBase::EstimateBandwidth(uint64_t realTime, uint64_t lastTime, bool finish) { + // Access Count + assert(realTime > lastTime); + uint64_t totalAccesses = profReads.count() + profWrites.count(); + uint64_t avgBandwidth = (totalAccesses * cacheLineSize) / realTime; + uint64_t curBandwidth = (totalAccesses - lastAccesses) * cacheLineSize / (realTime - lastTime); + maxBandwidth = std::max(maxBandwidth, curBandwidth); + minBandwidth = std::min(minBandwidth, curBandwidth); + + // Profile Update + profBandwidth[0].set(avgBandwidth); + profBandwidth[1].set(curBandwidth); + profBandwidth[2].set(maxBandwidth); + profBandwidth[3].set(minBandwidth); + + lastAccesses = totalAccesses; + + if (mParam->bandwidthReport == true && finish == true) { + info("BandWidth (MB/s): CumulativeAvg= %ld, Current= %ld, Max= %ld, Min= %ld", + avgBandwidth, curBandwidth, maxBandwidth, minBandwidth); + } +} + diff --git a/src/detailed_mem.h b/src/detailed_mem.h new file mode 100644 index 00000000..55f003ae --- /dev/null +++ b/src/detailed_mem.h @@ -0,0 +1,348 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef __DETAILED_MEM_H__ +#define __DETAILED_MEM_H__ + +#include "detailed_mem_params.h" +#include "g_std/g_string.h" +#include "memory_hierarchy.h" +#include "stats.h" +#include "timing_event.h" +#include + +/* Detailed memory model from Makoto/Kenta */ + +// FIXME(dsm): This enum should not be our here, esp with such generic names! +enum MemAccessType { READ, WRITE, NUM_ACCESS_TYPES}; + +// DRAM rank base class +class MemRankBase : public GlobAlloc { + protected: + uint32_t myId; + uint32_t parentId; + uint32_t bankCount; + + uint32_t lastBank; + uint64_t lastAccessCycle; + + uint64_t lastRefreshCycle; + uint32_t refreshNum; + uint32_t accessInRefresh; + uint32_t tFAWIndex; + + bool *bankinfo; + uint32_t *lastRow; + MemAccessType* lastType; + uint64_t *lastActCycle; + uint64_t *lastRdWrCycle; + uint64_t *lastPreCycle; + uint64_t *tFAWCycle; + + uint64_t activateCount; + uint64_t prechargeCount; + uint64_t readBurstCount; + uint64_t writeBurstCount; + + uint64_t idlePowerDownCycle; + uint64_t actvPowerDownCycle; + uint64_t idleStandbyCycle; + + uint64_t prevIdlePowerDownCycle; + uint64_t prevActvPowerDownCycle; + uint64_t prevIdleStandbyCycle; + + public: + MemRankBase(uint32_t _myId, uint32_t _parentId, uint32_t _bankCount); + virtual ~MemRankBase(); + + virtual void access(uint64_t accessCycle, uint64_t issuedCycle, uint32_t row, uint32_t col, uint32_t bank, MemAccessType type); + virtual void refresh(uint64_t lastCycle); + + // FIXME(dsm): This huge amount of getters/setters is a telltale sign + // of bad design (if an external class needs to access all these + // fields, why is not the logic of that class here? and if the logic is + // not here, why is this not a struct internal to that class?) 
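+        // Added note on the accessors below: tFAWCycle[] acts as a four-entry ring
+        // buffer of the most recent ACT command times. SetFAWCycle(cycle) overwrites
+        // the slot at tFAWIndex and advances it (mod 4), so the no-argument
+        // GetFAWCycle() then returns the oldest of the last four activates, i.e. the
+        // cycle that bounds the rolling four-activate window (tFAW).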
+ uint32_t GetBankCount(void) {return bankCount; } + uint32_t GetLastBank(void) { return lastBank; } + uint32_t GetLastRow(uint32_t bank) { return lastRow[bank]; } + MemAccessType GetLastType(uint32_t bank) { return lastType[bank]; } + uint64_t GetLastRdWrCycle(uint32_t bank) { return lastRdWrCycle[bank]; } + uint64_t GetLastRefreshCycle(void) { return lastRefreshCycle; } + + bool GetBankOpen(uint32_t bank) { return bankinfo[bank]; } + void SetBankOpen(uint32_t bank) { bankinfo[bank] = true; } + void SetBankClose(uint32_t bank) { bankinfo[bank] = false; } + uint32_t GetActiveBankCount(void); + uint64_t GetLastAccessCycle(void) { return lastAccessCycle; } + + uint64_t GetActivateCount(void) { return activateCount; } + void IncActivateCount(void) { activateCount++; } + uint32_t GetPrechargeCount(void) { return prechargeCount; } + void IncPrechargeCount(void) { prechargeCount++; } + + uint64_t GetReadBurstCount(void) { return readBurstCount; } + void IncReadBurstCount(void) { readBurstCount++; } + uint64_t GetWriteBurstCount(void) { return writeBurstCount; } + void IncWriteBurstCount(void) { writeBurstCount++; } + + uint64_t GetIdlePowerDownCycle(void) { return idlePowerDownCycle; } + uint64_t GetActvPowerDownCycle(void) { return actvPowerDownCycle; } + uint64_t GetIdleStandbyCycle(void) { return idleStandbyCycle; } + uint64_t GetPrevIdlePowerDownCycle(void) { return prevIdlePowerDownCycle; } + uint64_t GetPrevActvPowerDownCycle(void) { return prevActvPowerDownCycle; } + uint64_t GetPrevIdleStandbyCycle(void) { return prevIdleStandbyCycle; } + + void SetIdlePowerDownCycle(uint64_t cycle) { assert(idlePowerDownCycle <= cycle); idlePowerDownCycle = cycle; } + void SetActvPowerDownCycle(uint64_t cycle) { assert(actvPowerDownCycle <= cycle); actvPowerDownCycle = cycle; } + void SetIdleStandbyCycle(uint64_t cycle) { assert(idleStandbyCycle <= cycle); idleStandbyCycle = cycle; } + void SaveBackgroundCycles(void); + + void SetRefreshNum(uint32_t num) { assert(refreshNum <= num); refreshNum = num;} + uint32_t GetRefreshNum(void) { return refreshNum; } + void SetAccessInRefresh(uint32_t num) { accessInRefresh = num; } + uint32_t GetAccessInRefresh(void) { return accessInRefresh; } + + uint64_t GetLastActCycle(uint32_t bank) { return lastActCycle[bank]; } + void SetLastActCycle(uint32_t bank, uint64_t cycle) { assert(lastActCycle[bank] <= cycle); lastActCycle[bank] = cycle; } + uint64_t GetLastPreCycle(uint32_t bank) { return lastPreCycle[bank]; } + void SetLastPreCycle(uint32_t bank, uint64_t cycle) { assert(lastPreCycle[bank] <= cycle); lastPreCycle[bank] = cycle; } + uint64_t GetFAWCycle(void) { return tFAWCycle[tFAWIndex]; } + void SetFAWCycle(uint32_t bank, uint64_t cycle) { assert(tFAWCycle[bank] <= cycle); tFAWCycle[bank] = cycle; } + uint64_t GetFAWCycle(uint32_t bank) { return tFAWCycle[bank]; } + void SetFAWCycle(uint64_t cycle) { assert(tFAWCycle[tFAWIndex] <= cycle); tFAWCycle[tFAWIndex++] = cycle; tFAWIndex %= 4; } +}; + +// DRAM channel base class +class MemChannelBase : public GlobAlloc { + protected: + uint32_t myId; + MemParam *mParam; + + g_vector ranks; + std::vector > accessLog; + + virtual uint32_t UpdateRefreshNum(uint32_t rank, uint64_t arrivalCycle); + virtual uint64_t UpdateLastRefreshCycle(uint32_t rank, uint64_t arrivalCycle, uint32_t refreshNum); + virtual void UpdatePowerDownCycle(uint32_t rank, uint64_t arrivalCycle, uint64_t lastPhaseCycle, uint32_t refreshNum); + virtual void UpdateDataBusCycle(uint64_t start, uint64_t end); + + virtual void IssueActivate(uint32_t rank, 
uint32_t bank, uint64_t issuedCycle); + virtual void IssuePrecharge(uint32_t rank, uint32_t bank, uint64_t issuedCycle, bool continuous = false); + + virtual uint64_t CalcIntraIssueCycle(bool rowHit, uint32_t rank, MemAccessType type, uint64_t arrivalCycle, uint32_t refreshNum); + virtual uint64_t CalcInterIssueCycle(MemAccessType type, uint64_t arrivalCycle); + virtual uint64_t CalcActConst(uint32_t rank, uint32_t bank, uint64_t issuableCycle); + virtual uint64_t CalcPreConst(uint32_t rank, uint32_t bank, MemAccessType type, uint64_t issuableCycle); + virtual uint64_t CalcRdWrConst(uint32_t rank, MemAccessType type, uint64_t issuableCycle); + + virtual uint32_t GetPowerDownPenalty(uint32_t rank, uint64_t arrivalCycle); + virtual bool CheckContinuousAccess(uint64_t arrivalCycle, uint32_t rank, uint32_t bank, uint32_t row); + + public: + MemChannelBase(uint32_t _myId, MemParam *_mParam); + virtual ~MemChannelBase(); + + virtual uint64_t LatencySimulate(Address lineAddr, uint64_t arrivalCycle, uint64_t lastPhaseCycle, MemAccessType type); + virtual void AddressMap(Address addr, uint32_t& row, uint32_t& col, uint32_t& rank, uint32_t& bank); + bool IsRowBufferHit(uint32_t row, uint32_t rank, uint32_t bank); + + virtual uint64_t GetActivateCount(void); + virtual uint64_t GetPrechargeCount(void); + virtual uint64_t GetRefreshCount(void); + + virtual uint64_t GetBurstEnergy(void); + virtual uint64_t GetActPreEnergy(void); + virtual uint64_t GetRefreshEnergy(void); + virtual uint64_t GetBackGroundEnergy(uint64_t memCycle, uint64_t lastMemCycle, bool bInstant = false); + + virtual void PeriodicUpdatePower(uint64_t phaseCycle, uint64_t lastPhaseCycle); +}; + +class MemAccessEventBase; + +// DRAM scheduler base class +class MemSchedulerBase : public GlobAlloc { + protected: + // HK: Umm...MemAccessEventBase already has a member named addr. How is the + // Address in MemSchedQueueElem different from this? + typedef std::pair MemSchedQueueElem; + + uint32_t id; + MemParam* mParam; + MemChannelBase* mChnl; + + public: + + MemSchedulerBase(uint32_t id, MemParam* mParam, MemChannelBase* mChnl) + : id(id), mParam(mParam), mChnl(mChnl) {} + + virtual ~MemSchedulerBase() {} + + virtual bool CheckSetEvent(MemAccessEventBase* ev) = 0; + + // HK: I hope there's a good reason to be using a reference to a pointer here + // Don't know the code enough at the moment to be able to tell. + // + // Hmm...so upon further investigation it looks like all of these arguments are + // written by the function. I am not a big fan of passing WRITE arguments by + // reference. Even more distrubingly, MemSchedQueueElem uses its MemAccessEventBase + // member in a weird way, with the member var being NULL signifying something (I don't + // know what yet). Will look into this further + // + // FIXME(dsm): refpointer? pointeref? Hmmm... 
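+        // Added note: GetEvent() is the scheduler's dequeue hook, called once per
+        // memory tick from MemControllerBase::TickScheduler(). All three arguments
+        // are outputs. For reads the held event is returned so the controller can
+        // release()/done() it once the channel latency is known; writes were already
+        // acknowledged with done() in enqueue()/CheckSetEvent(), which is why
+        // TickScheduler() only completes reads.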
+ virtual bool GetEvent(MemAccessEventBase*& ev, Address& addr, MemAccessType& type) = 0; +}; + +class MemSchedulerDefault : public MemSchedulerBase { + private: + MemAccessType prioritizedAccessType; + uint32_t wrQueueSize; + uint32_t wrQueueHighWatermark; + uint32_t wrQueueLowWatermark; + + g_vector rdQueue; + g_vector wrQueue; + g_vector wrDoneQueue; + + bool FindBestRequest(g_vector *queue, uint32_t& idx); + + public: + MemSchedulerDefault(uint32_t id, MemParam* mParam, MemChannelBase* mChnl); + ~MemSchedulerDefault(); + bool CheckSetEvent(MemAccessEventBase* ev); + bool GetEvent(MemAccessEventBase*& ev, Address& addr, MemAccessType& type); +}; + +// DRAM controller base class +class MemControllerBase : public MemObject { + protected: + g_string name; + uint32_t domain; + uint32_t cacheLineSize; + + MemParam* mParam; + g_vector chnls; + g_vector sches; + lock_t updateLock; + + uint64_t sysFreqKHz; + uint64_t memFreqKHz; + + uint64_t lastPhaseCycle; + uint64_t lastAccessedCycle; + uint64_t nextSysTick; + uint64_t reportPeriodCycle; + + // latency + uint32_t minLatency[NUM_ACCESS_TYPES]; + uint32_t preDelay[NUM_ACCESS_TYPES]; + uint32_t postDelay[NUM_ACCESS_TYPES]; + uint32_t memMinLatency[NUM_ACCESS_TYPES]; + + virtual uint64_t ReturnChannel(Address addr); + virtual uint64_t LatencySimulate(Address lineAddr, uint64_t sysCycle, MemAccessType type); + virtual void UpdateCmdCounters(void); + virtual void EstimatePowers(uint64_t sysCycle, bool finish = false); + virtual void EstimateBandwidth(uint64_t realTime, uint64_t lastTime, bool finish = false); + virtual uint64_t CalcDQTermCur(uint64_t acc_dq, uint64_t last_dq, uint64_t instCycle, uint64_t memCycle, uint64_t lastMemCycle); + virtual uint64_t CalcDQTermAcc(uint64_t acc_dq, uint64_t memCycle, uint64_t lastMemCycle); + virtual void TickScheduler(uint64_t sysCycle); + + inline uint64_t sysToMemCycle(uint64_t sysCycle) { return sysCycle*memFreqKHz/sysFreqKHz; } + inline uint64_t sysToMicroSec(uint64_t sysCycle) { return sysCycle*1000/sysFreqKHz; } + inline uint64_t usecToSysCycle(uint64_t usec) { return usec*sysFreqKHz/1000; } + inline uint64_t memToSysCycle(uint64_t memCycle) { return memCycle*sysFreqKHz/memFreqKHz; } + inline uint64_t memToMicroSec(uint64_t memCycle) { return memCycle*1000/memFreqKHz; } + + // profiles + Counter profReads; + Counter profWrites; + Counter profTotalRdLat; + Counter profTotalWrLat; + VectorCounter latencyHist; + uint32_t lhBinSize; + uint32_t lhNumBins; + + Counter profActivate; + Counter profPrecharge; + Counter profRefresh; + + static const uint32_t pwCounterNum = 7; + Counter profAccAvgPower[pwCounterNum]; + Counter profCurAvgPower[pwCounterNum]; + static const uint32_t bwCounterNum = 4; + Counter profBandwidth[bwCounterNum]; + + uint64_t lastAccesses; + uint64_t maxBandwidth; + uint64_t minBandwidth; + + gzFile addrTraceLog; + + // Power + uint64_t lastMemCycle; + struct powerValue { + uint64_t total; + uint64_t actPre; + uint64_t burst; + uint64_t refresh; + uint64_t background; + uint64_t dq; + uint64_t terminate; + }; + powerValue lastPower; + + + public: + MemControllerBase(g_string _memCfg, uint32_t _cacheLineSize, uint32_t _sysFreqMHz, uint32_t _domain, g_string& _name); + virtual ~MemControllerBase(); + + const char* getName() { return name.c_str(); } + void enqueue(MemAccessEventBase* ev, uint64_t cycle); + uint64_t access(MemReq& req); + uint32_t tick(uint64_t sysCycle); + void initStats(AggregateStat* parentStat); + void updateStats(void); + void finish(void); +}; + +// DRAM access event base 
class +class MemAccessEventBase : public TimingEvent { + private: + MemControllerBase* dram; + MemAccessType type; + Address addr; + + public: + MemAccessEventBase(MemControllerBase* _dram, MemAccessType _type, Address _addr, int32_t domain, uint32_t preDelay, uint32_t postDelay) + : TimingEvent(preDelay, postDelay, domain), dram(_dram), type(_type), addr(_addr) {} + + void simulate(uint64_t startCycle) { dram->enqueue(this, startCycle); } + MemAccessType getType() const { return type; } + Address getAddr() const { return addr; } +}; + +#endif diff --git a/src/detailed_mem_params.cpp b/src/detailed_mem_params.cpp new file mode 100644 index 00000000..c5bdb0ce --- /dev/null +++ b/src/detailed_mem_params.cpp @@ -0,0 +1,273 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "detailed_mem_params.h" +#include +#include +#include "bithacks.h" + +// FIXME(dsm): Here be dragons. I don't know why this uses a separate cfg file to begin with, it makes runs much harder to script. + +MemParam::MemParam() +{ + rowBufferPolicy = RB_CLOSE; + memset(constraints, 0, sizeof(uint32_t) * 32); +} + +MemParam::~MemParam() +{ +} + +bool MemParam::IsOpenRowBufPolicy() { + return (rowBufferPolicy == RB_OPEN); +} + +bool MemParam::IsCloseRowBufPolicy() { + return (rowBufferPolicy == RB_CLOSE); +} + +void MemParam::LoadConfig(g_string _cfgFile, uint32_t _cacheLineSize) +{ + info("Loading Memory Config from %s", _cfgFile.c_str()); + Config cfg(_cfgFile.c_str()); + LoadConfigMain(cfg, _cacheLineSize); + LoadTiming(cfg); + LoadPower(cfg); + // Make Constraints from Timing Paramteres + MakeConstraints(); +} + +void MemParam::LoadConfigMain(Config &cfg, uint32_t _cacheLineSize) +{ + // loading Simulation paramters + reportPhase = cfg.get("sim.reportPhase", 10000); + if(reportPhase == 0) { + warn("!!! 
Please set non-0 value to sim.reportPhase."); + assert(false); + } + reportStart = (uint64_t)cfg.get("sim.reportStart", 0); + reportFinish = (uint64_t)cfg.get("sim.reportFinish", 0); + if(reportFinish == 0) + reportFinish = (uint64_t)-1; + accAvgPowerReport = cfg.get("sim.accAvgPowerReport", false); + curAvgPowerReport = cfg.get("sim.curAvgPowerReport", false); + bandwidthReport = cfg.get("sim.bandwidthReport", false); + anyReport = (accAvgPowerReport | curAvgPowerReport | bandwidthReport); + info("AccAvgPower=%d, CurAvgPower=%d, BandWidth=%d will be reported.", + accAvgPowerReport, curAvgPowerReport, bandwidthReport); + info("Reports are in each %d phase, Start phase cycle=%ld, Finish phase cycle=%ld", + reportPhase, reportStart, reportFinish); + + addrTrace = cfg.get("sim.addressTrace", false); + if(addrTrace == true) { + info("Address Traces are output to file"); + } + + // loading Memory Controller parameters + totalCapacity = cfg.get("mc_spec.capacityMB", 4096); + channelCount = cfg.get("mc_spec.channels", 2); + channelDataWidth = cfg.get("mc_spec.channelDataWidth", 64); + g_string _rowBufferPolicy = cfg.get("mc_spec.rowBufferPolicy", "close"); + rowBufferPolicy = (_rowBufferPolicy == "open") ? RB_OPEN : RB_CLOSE; + interleaveType = cfg.get("mc_spec.interleaveType", 0); + powerDownCycle = cfg.get("mc_spec.powerDownCycle", 50); + controllerLatency = cfg.get("mc_spec.controllerLatency", 0); + schedulerQueueCount = cfg.get("mc_spec.schedulerQueueCount", 0); + accessLogDepth = cfg.get("mc_spec.accessLogDepth", 4); + mergeContinuous = cfg.get("mc_spec.mergeContinuous", false); + cacheLineSize = _cacheLineSize; + + // loading Memory parameters + chipCapacity = cfg.get("mem_spec.capacityMb", 2048); + bankCount = cfg.get("mem_spec.bankCount", 2); + rowAddrWidth = cfg.get("mem_spec.rowAddrWidth", 10); + colAddrWidth = cfg.get("mem_spec.colAddrWidth", 10); + dataBusWidth = cfg.get("mem_spec.dataBusWidth", 8); + + // Calculate paramers + chipCountPerRank = channelDataWidth / dataBusWidth; + rankCount = (totalCapacity * 8) / (chipCapacity * chipCountPerRank * channelCount); + if(rankCount == 0) + panic("Illegal specs!!! Please check mc_spec.capacityMB, mc_spec.channels, mem_spec.cpacityMb and mem_spec.dataBusWidth."); + if((totalCapacity % channelCount) != 0) + panic("Illegal specs!!! 
mc_spec.capacityMB must be multiple of mc_spec.channels."); + + info("totalCapacity = %d MBytes, chipCapacity = %d Mbits", totalCapacity, chipCapacity); + info("channel data width = %d, chips per rank = %d, rank per channel = %d", + channelDataWidth, chipCountPerRank, rankCount); + + rankWidth = ilog2(rankCount); + channelWidth = ilog2(channelCount); + channelDataWidthLog = ilog2(channelDataWidth); + bankWidth = ilog2(bankCount); + byteOffsetWidth = ilog2(cacheLineSize); +} + +void MemParam::LoadTiming(Config &cfg) +{ + info("MemParam: Loading Timing Paramters"); + + tCK = cfg.get("mem_spec.timing.tCK", 1.0); + tCMD = ceil(cfg.get("mem_spec.timing.tCMD", tCK) / tCK); + tRC = ceil(cfg.get("mem_spec.timing.tRC", tCK) / tCK); + tRAS = ceil(cfg.get("mem_spec.timing.tRAS", tCK) / tCK); + tRCD = ceil(cfg.get("mem_spec.timing.tRCD", tCK) / tCK); + tRP = ceil(cfg.get("mem_spec.timing.tRP", tCK) / tCK); + tRPab = ceil(cfg.get("mem_spec.timing.tRPab", tCK) / tCK); + tRTRS = ceil(cfg.get("mem_spec.timing.tRTRS", tCK) / tCK); + tRRD = ceil(cfg.get("mem_spec.timing.tRRD", tCK) / tCK); + tWR = ceil(cfg.get("mem_spec.timing.tWR", tCK) / tCK); + tWTR = ceil(cfg.get("mem_spec.timing.tWTR", tCK) / tCK); + tCAS = ceil(cfg.get("mem_spec.timing.tCAS", tCK) / tCK); + tCWD = ceil(cfg.get("mem_spec.timing.tCWD", tCK) / tCK); + tCCD = ceil(cfg.get("mem_spec.timing.tCCD", tCK) / tCK); + tTrans = ceil(cfg.get("mem_spec.timing.tTrans", tCK*4) / tCK); + tTransCrit = tTrans / 4; + tXP = ceil(cfg.get("mem_spec.timing.tXP", tCK) / tCK); + tREFI = ceil(cfg.get("mem_spec.timing.tREFI", tCK) / tCK); + tRFC = ceil(cfg.get("mem_spec.timing.tRFC", tCK) / tCK); + tFAW = ceil(cfg.get("mem_spec.timing.tFAW", tCK) / tCK); + tRTP = ceil(cfg.get("mem_spec.timing.tRTP", tCK) / tCK); + + info("tCK = %f", tCK); + info("tCMD = %d tCK", tCMD); + info("tRC = %d tCK", tRC); + info("tRAS = %d tCK", tRAS); + info("tRCD = %d tCK", tRCD); + info("tRP = %d tCK", tRP); + info("tRPab = %d tCK", tRPab); + info("tRTRS = %d tCK", tRTRS); + info("tRRD = %d tCK", tRRD); + info("tWR = %d tCK", tWR); + info("tWTR = %d tCK", tWTR); + info("tCAS = %d tCK", tCAS); + info("tCWD = %d tCK", tCWD); + info("tCCD = %d tCK", tCCD); + info("tTrans = %d tCK", tTrans); + info("tTransCrit = %d tCK", tTransCrit); + info("tXP = %d tCK", tXP); + info("tREFI = %d tCK", tREFI); + info("tRFC = %d tCK", tRFC); + info("tFAW = %d tCK", tFAW); + info("tRTP = %d tCK", tRTP); +} + +void MemParam::LoadPower(Config &cfg) +{ + // loading Power Paramters + // V -> 1/10V + VDD1 = cfg.get("mem_spec.power.VDD1.VDD1", 1.5) * 10; + // mA -> 1/100mA + IDD_VDD1.IDD0 = cfg.get("mem_spec.power.VDD1.IDD0", 0.0) * 1e2; + IDD_VDD1.IDD2P = cfg.get("mem_spec.power.VDD1.IDD2P", 0.0) * 1e2; + IDD_VDD1.IDD2N = cfg.get("mem_spec.power.VDD1.IDD2N", 0.0) * 1e2; + IDD_VDD1.IDD3P = cfg.get("mem_spec.power.VDD1.IDD3P", 0.0) * 1e2; + IDD_VDD1.IDD3N = cfg.get("mem_spec.power.VDD1.IDD3N", 0.0) * 1e2; + IDD_VDD1.IDD4R = cfg.get("mem_spec.power.VDD1.IDD4R", 0.0) * 1e2; + IDD_VDD1.IDD4W = cfg.get("mem_spec.power.VDD1.IDD4W", 0.0) * 1e2; + IDD_VDD1.IDD5 = cfg.get("mem_spec.power.VDD1.IDD5", 0.0) * 1e2; + // mW -> uW + readDqPin = cfg.get("mem_spec.power.pins.readDQ", 0.0) * 1e3; + writeDqPin = cfg.get("mem_spec.power.pins.writeDQ", 0.0) * 1e3; + readTermPin = cfg.get("mem_spec.power.pins.readTerm", 0.0) * 1e3; + writeTermPin = cfg.get("mem_spec.power.pins.writeTerm", 0.0) * 1e3; + + info("Loading Memory Power Parameters"); + info("VDD1 (mV) = %d", VDD1 * 100); + info("VDD1.IDD0 (uA) = %d", IDD_VDD1.IDD0 * 10); + 
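+    // Added note on the fixed-point scaling used in this block: voltages are stored
+    // in tenths of a volt and currents in hundredths of a mA (see the *10 and *1e2
+    // factors above), so e.g. the default VDD1 = 1.5 V is stored as 15 and printed as
+    // 1500 mV, and an IDD0 of 55 mA would be stored as 5500 and printed as 55000 uA.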
info("VDD1.IDD2P (uA) = %d", IDD_VDD1.IDD2P * 10); + info("VDD1.IDD2N (uA) = %d", IDD_VDD1.IDD2N * 10); + info("VDD1.IDD3P (uA) = %d", IDD_VDD1.IDD3P * 10); + info("VDD1.IDD3N (uA) = %d", IDD_VDD1.IDD3N * 10); + info("VDD1.IDD4R (uA) = %d", IDD_VDD1.IDD4R * 10); + info("VDD1.IDD4W (uA) = %d", IDD_VDD1.IDD4W * 10); + info("VDD1.IDD5 (uA) = %d", IDD_VDD1.IDD5 * 10); + info("readDq (uW) = %d", readDqPin); + info("writeDq (uW) = %d", writeDqPin); + info("readTerm (uW) = %d", readTermPin); + info("writeTerm (uW) = %d", writeTermPin); +} + +uint32_t MemParam::GetDataLatency(uint32_t type) +{ + // Return read/write to data latency + return (type == 0) ? tCAS : tCWD; +} + +uint32_t MemParam::GetDataDelay(uint32_t type) +{ + // Return read/write to first data + return ((type == 0) ? tCAS : tCWD) + tTransCrit; +} + +uint32_t MemParam::GetDataSlot(uint32_t type) +{ + // Return data length + return tTrans; +} + +uint32_t MemParam::GetPreDelay(uint32_t type) +{ + // Return read/write to precharge + return (type == 0) ? tRTP : (tCWD + tTrans + tWR); +} + +uint32_t MemParam::GetRefreshCycle(void) +{ + // Return required cycle for refresh + if(IsOpenRowBufPolicy()) + return tRFC + tRPab; + else + return tRFC; +} + +uint32_t MemParam::GetRdWrDelay(uint32_t type, uint32_t lastType) +{ + // Return read/write to read/write constraint + uint32_t index = lastType << 1 | type; + return constraints[index]; +} + +void MemParam::MakeConstraints(void) +{ + ////////////////////////////////////////// + // make constraint for read/write to read/write command + // [0bAB] : A=lastType, B=type + ////////////////////////////////////////// + info("Generate DDR3 Timing Constraints for read/write to read/write"); + + //idx=0 R->R + constraints[0b00] = std::max(tTrans, tCCD); + + //idx=1 R->W + constraints[0b01] = tCAS + tCCD/2 + 2 - tCWD; + + //idx=2 W->R + constraints[0b10] = tCWD + tTrans + tWTR; + + //idx=3 W->W + constraints[0b11] = std::max(tCCD, tTrans); +} + diff --git a/src/detailed_mem_params.h b/src/detailed_mem_params.h new file mode 100644 index 00000000..62656aae --- /dev/null +++ b/src/detailed_mem_params.h @@ -0,0 +1,147 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef __DETAILED_MEM_PARAMS_H__ +#define __DETAILED_MEM_PARAMS_H__ + +#include "g_std/g_string.h" +#include "config.h" + +class MemParam : public GlobAlloc{ + protected: + enum eRowBufferPolicy { + RB_CLOSE = 0, + RB_OPEN + }; + uint32_t rowBufferPolicy; + int32_t constraints[4]; + + virtual void LoadConfigMain(Config &cfg, uint32_t _chacheLineSize = 64); + virtual void LoadTiming(Config &cfg); + virtual void LoadPower(Config &cfg); + virtual void MakeConstraints(void); + + public: + MemParam(); + virtual ~MemParam(); + + virtual void LoadConfig(g_string _cfgFile, uint32_t _chacheLineSize = 64); + + bool IsOpenRowBufPolicy(); + bool IsCloseRowBufPolicy(); + virtual uint32_t GetDataLatency(uint32_t type); + virtual uint32_t GetDataDelay(uint32_t type); + virtual uint32_t GetDataSlot(uint32_t type); + virtual uint32_t GetPreDelay(uint32_t type); + virtual uint32_t GetRefreshCycle(void); + virtual uint32_t GetRdWrDelay(uint32_t type, uint32_t lastType); + + // Simulation Parameter + uint32_t reportPhase; + uint64_t reportStart; + uint64_t reportFinish; + + // FIXME(dsm): These violate transparency... use info/warn! + // I'm also not sure why these are here; can we move all the power-related reporting to a separate tool? + bool anyReport; + bool accAvgPowerReport; + bool curAvgPowerReport; + bool bandwidthReport; + bool addrTrace; + + // Memory Controller Parameter + uint32_t totalCapacity; // mega byte + uint32_t channelCount; + uint32_t interleaveType; + uint32_t powerDownCycle; + uint32_t controllerLatency; + uint32_t cacheLineSize; + uint32_t byteOffsetWidth; + uint32_t accessLogDepth; + bool mergeContinuous; + uint32_t schedulerQueueCount; + + // Device Architectural Parameter + uint32_t chipCapacity; // megabits + uint32_t bankCount; + uint32_t rowAddrWidth; + uint32_t colAddrWidth; + uint32_t dataBusWidth; + + uint32_t chipCountPerRank; + uint32_t rankCount; + uint32_t rankWidth; + uint32_t channelWidth; + uint32_t bankWidth; + uint32_t channelDataWidth; // Data bus bits (= JEDEC_BUS_WIDTH) + uint32_t channelDataWidthLog; // ilog2(Datawdith / 8) + + // Timing Parameters + double tCK; + uint32_t tCMD; + uint32_t tRC; + uint32_t tRAS; + uint32_t tRCD; + uint32_t tRP; + uint32_t tRPab; + uint32_t tRTRS; + uint32_t tRRD; + uint32_t tWR; + uint32_t tWTR; + uint32_t tCAS; + uint32_t tCWD; + uint32_t tCCD; + uint32_t tTrans; + uint32_t tTransCrit; + uint32_t tXP; + uint32_t tREFI; + uint32_t tRFC; + uint32_t tFAW; + uint32_t tRTP; + + // Power Parameters + // Voltage + uint32_t VDD1; + + struct IDDs { + uint32_t IDD0; + uint32_t IDD2P; + uint32_t IDD2N; + uint32_t IDD3P; + uint32_t IDD3N; + uint32_t IDD4R; + uint32_t IDD4W; + uint32_t IDD5; + }; + // Statically Allocate + IDDs IDD_VDD1; + + uint32_t readDqPin; + uint32_t writeDqPin; + uint32_t readTermPin; + uint32_t writeTermPin; +}; + +#endif /* __DETAILED_MEM_PARAMS_H__ */ diff --git a/src/dramsim_mem_ctrl.cpp b/src/dramsim_mem_ctrl.cpp new file mode 100644 index 00000000..07e451fd --- /dev/null +++ b/src/dramsim_mem_ctrl.cpp @@ -0,0 +1,181 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "dramsim_mem_ctrl.h" +#include +#include +#include "event_recorder.h" +#include "tick_event.h" +#include "timing_event.h" +#include "zsim.h" + +#ifdef _WITH_DRAMSIM_ //was compiled with dramsim +#include "DRAMSim.h" + +using namespace DRAMSim; // NOLINT(build/namespaces) + +class DRAMSimAccEvent : public TimingEvent { + private: + DRAMSimMemory* dram; + bool write; + Address addr; + + public: + uint64_t sCycle; + + DRAMSimAccEvent(DRAMSimMemory* _dram, bool _write, Address _addr, int32_t domain) : TimingEvent(0, 0, domain), dram(_dram), write(_write), addr(_addr) {} + + bool isWrite() const { + return write; + } + + Address getAddr() const { + return addr; + } + + void simulate(uint64_t startCycle) { + sCycle = startCycle; + dram->enqueue(this, startCycle); + } +}; + + +DRAMSimMemory::DRAMSimMemory(string& dramTechIni, string& dramSystemIni, string& outputDir, string& traceName, + uint32_t capacityMB, uint64_t cpuFreqHz, uint32_t _minLatency, uint32_t _domain, const g_string& _name) +{ + curCycle = 0; + minLatency = _minLatency; + // NOTE: this will alloc DRAM on the heap and not the glob_heap, make sure only one process ever handles this + dramCore = getMemorySystemInstance(dramTechIni, dramSystemIni, outputDir, traceName, capacityMB); + dramCore->setCPUClockSpeed(cpuFreqHz); + + TransactionCompleteCB *read_cb = new Callback(this, &DRAMSimMemory::DRAM_read_return_cb); + TransactionCompleteCB *write_cb = new Callback(this, &DRAMSimMemory::DRAM_write_return_cb); + dramCore->RegisterCallbacks(read_cb, write_cb, NULL); + + domain = _domain; + TickEvent* tickEv = new TickEvent(this, domain); + tickEv->queue(0); // start the sim at time 0 + + name = _name; +} + +void DRAMSimMemory::initStats(AggregateStat* parentStat) { + AggregateStat* memStats = new AggregateStat(); + memStats->init(name.c_str(), "Memory controller stats"); + profReads.init("rd", "Read requests"); memStats->append(&profReads); + profWrites.init("wr", "Write requests"); memStats->append(&profWrites); + profTotalRdLat.init("rdlat", "Total latency experienced by read requests"); memStats->append(&profTotalRdLat); + profTotalWrLat.init("wrlat", "Total latency experienced by write requests"); memStats->append(&profTotalWrLat); + parentStat->append(memStats); +} + +uint64_t DRAMSimMemory::access(MemReq& req) { + switch (req.type) { + case PUTS: + case PUTX: + *req.state = I; + break; + case GETS: + *req.state = req.is(MemReq::NOEXCL)? 
S : E; + break; + case GETX: + *req.state = M; + break; + + default: panic("!?"); + } + + uint64_t respCycle = req.cycle + minLatency; + assert(respCycle > req.cycle); + + if ((req.type != PUTS /*discard clean writebacks*/) && zinfo->eventRecorders[req.srcId]) { + Address addr = req.lineAddr << lineBits; + bool isWrite = (req.type == PUTX); + DRAMSimAccEvent* memEv = new (zinfo->eventRecorders[req.srcId]) DRAMSimAccEvent(this, isWrite, addr, domain); + memEv->setMinStartCycle(req.cycle); + TimingRecord tr = {addr, req.cycle, respCycle, req.type, memEv, memEv}; + zinfo->eventRecorders[req.srcId]->pushRecord(tr); + } + + return respCycle; +} + +uint32_t DRAMSimMemory::tick(uint64_t cycle) { + dramCore->update(); + curCycle++; + return 1; +} + +void DRAMSimMemory::enqueue(DRAMSimAccEvent* ev, uint64_t cycle) { + //info("[%s] %s access to %lx added at %ld, %ld inflight reqs", getName(), ev->isWrite()? "Write" : "Read", ev->getAddr(), cycle, inflightRequests.size()); + dramCore->addTransaction(ev->isWrite(), ev->getAddr()); + inflightRequests.insert(std::pair(ev->getAddr(), ev)); + ev->hold(); +} + +void DRAMSimMemory::DRAM_read_return_cb(uint32_t id, uint64_t addr, uint64_t memCycle) { + std::multimap::iterator it = inflightRequests.find(addr); + assert(it != inflightRequests.end()); + DRAMSimAccEvent* ev = it->second; + + uint32_t lat = curCycle+1 - ev->sCycle; + if (ev->isWrite()) { + profWrites.inc(); + profTotalWrLat.inc(lat); + } else { + profReads.inc(); + profTotalRdLat.inc(lat); + } + + ev->release(); + ev->done(curCycle+1); + inflightRequests.erase(it); + //info("[%s] %s access to %lx DONE at %ld (%ld cycles), %ld inflight reqs", getName(), it->second->isWrite()? "Write" : "Read", it->second->getAddr(), curCycle, curCycle-it->second->sCycle, inflightRequests.size()); +} + +void DRAMSimMemory::DRAM_write_return_cb(uint32_t id, uint64_t addr, uint64_t memCycle) { + //Same as read for now + DRAM_read_return_cb(id, addr, memCycle); +} + +#else //no dramsim, have the class fail when constructed + +using std::string; + +DRAMSimMemory::DRAMSimMemory(string& dramTechIni, string& dramSystemIni, string& outputDir, string& traceName, + uint32_t capacityMB, uint64_t cpuFreqHz, uint32_t _minLatency, uint32_t _domain, const g_string& _name) +{ + panic("Cannot use DRAMSimMemory, zsim was not compiled with DRAMSim"); +} + +uint64_t DRAMSimMemory::access(MemReq& req) { panic("???"); return 0; } +uint32_t DRAMSimMemory::tick(uint64_t cycle) { panic("???"); return 0; } +void DRAMSimMemory::enqueue(DRAMSimAccEvent* ev, uint64_t cycle) { panic("???"); } +void DRAMSimMemory::DRAM_read_return_cb(uint32_t id, uint64_t addr, uint64_t memCycle) { panic("???"); } +void DRAMSimMemory::DRAM_write_return_cb(uint32_t id, uint64_t addr, uint64_t memCycle) { panic("???"); } + +#endif + diff --git a/src/dramsim_mem_ctrl.h b/src/dramsim_mem_ctrl.h new file mode 100644 index 00000000..af4f6525 --- /dev/null +++ b/src/dramsim_mem_ctrl.h @@ -0,0 +1,111 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef DRAMSIM_MEM_CTRL_H_ +#define DRAMSIM_MEM_CTRL_H_ + +#include +#include +#include "g_std/g_string.h" +#include "memory_hierarchy.h" +#include "pad.h" +#include "stats.h" + +namespace DRAMSim { + class MultiChannelMemorySystem; +}; + +class DRAMSimAccEvent; + +class DRAMSimMemory : public MemObject { //one DRAMSim controller + private: + g_string name; + uint32_t minLatency; + uint32_t domain; + + DRAMSim::MultiChannelMemorySystem* dramCore; + + std::multimap inflightRequests; + + uint64_t curCycle; //processor cycle, used in callbacks + + // R/W stats + PAD(); + Counter profReads; + Counter profWrites; + Counter profTotalRdLat; + Counter profTotalWrLat; + PAD(); + + public: + DRAMSimMemory(std::string& dramTechIni, std::string& dramSystemIni, std::string& outputDir, std::string& traceName, uint32_t capacityMB, + uint64_t cpuFreqHz, uint32_t _minLatency, uint32_t _domain, const g_string& _name); + + const char* getName() {return name.c_str();} + + void initStats(AggregateStat* parentStat); + + // Record accesses + uint64_t access(MemReq& req); + + // Event-driven simulation (phase 2) + uint32_t tick(uint64_t cycle); + void enqueue(DRAMSimAccEvent* ev, uint64_t cycle); + + private: + void DRAM_read_return_cb(uint32_t id, uint64_t addr, uint64_t returnCycle); + void DRAM_write_return_cb(uint32_t id, uint64_t addr, uint64_t returnCycle); +}; + +//DRAMSIM does not support non-pow2 channels, so: +// - Encapsulate multiple DRAMSim controllers +// - Fan out addresses interleaved across banks, and change the address to a "memory address" +class SplitAddrMemory : public MemObject { + private: + const g_vector mems; + const g_string name; + public: + SplitAddrMemory(const g_vector& _mems, const char* _name) : mems(_mems), name(_name) {} + + uint64_t access(MemReq& req) { + Address addr = req.lineAddr; + uint32_t mem = addr % mems.size(); + Address ctrlAddr = addr/mems.size(); + req.lineAddr = ctrlAddr; + uint64_t respCycle = mems[mem]->access(req); + req.lineAddr = addr; + return respCycle; + } + + const char* getName() { + return name.c_str(); + } + + void initStats(AggregateStat* parentStat) { + for (auto mem : mems) mem->initStats(parentStat); + } +}; + +#endif // DRAMSIM_MEM_CTRL_H_ diff --git a/src/event_queue.h b/src/event_queue.h new file mode 100644 index 00000000..6a326422 --- /dev/null +++ b/src/event_queue.h @@ -0,0 +1,122 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef EVENT_QUEUE_H_ +#define EVENT_QUEUE_H_ + +#include +#include "g_std/g_multimap.h" +#include "galloc.h" +#include "zsim.h" + +class Event : public GlobAlloc { + protected: + uint64_t period; + + public: + explicit Event(uint64_t _period) : period(_period) {} //period == 0 events are one-shot + uint64_t getPeriod() const {return period;} + virtual void callback()=0; +}; + +/* Adjusts period to fire on the first phase following the target. Sets exponentially decreasing periods, + * so even if maxRate is horribly overestimated, it should have a very small cost (but there's room for + * optimization if this becomes an issue). + */ +template +class AdaptiveEvent : public Event { + private: + G get; + F fire; + uint64_t target; + uint64_t maxRate; + + public: + AdaptiveEvent(G _get, F _fire, uint64_t _start, uint64_t _target, uint64_t _maxRate) : Event(0), get(_get), fire(_fire), target(_target), maxRate(_maxRate) { + assert(target >= _start); + period = (target - _start)/maxRate; + if (!period) period = 1; + } + + // This will fire a bunch of times, we adjust the period to get the exact phase + // Gets called from an arbitrary process, cannot touch any proc-local state (including FFI) + void callback() { + uint64_t cur = get(); + if (cur >= target) { + assert(cur - target <= maxRate); //otherwise, maxRate was wrong... + fire(); + period = 0; //event queue will dispose of us + } else { + period = (target - cur)/maxRate; + if (period == 0) period = 1; + } + } +}; + +template +AdaptiveEvent* makeAdaptiveEvent(G get, F fire, uint64_t start, uint64_t target, uint64_t maxRate) { + return new AdaptiveEvent(get, fire, start, target, maxRate); +} + + +class EventQueue : public GlobAlloc { + private: + g_multimap evMap; + lock_t qLock; + + public: + EventQueue() { futex_init(&qLock); } + + void tick() { + futex_lock(&qLock); + uint64_t curPhase = zinfo->numPhases; + g_multimap::iterator it = evMap.begin(); + while (it != evMap.end() && it->first <= curPhase) { + if (unlikely(it->first != curPhase)) panic("First event should have ticked on phase %ld, this is %ld", it->first, curPhase); + //if (it->first != curPhase) warn("First event should have ticked on phase %ld, this is %ld", it->first, curPhase); + Event* ev = it->second; + evMap.erase(it); + ev->callback(); //NOTE: Callback cannot call insert(), will deadlock (could use recursive locks if needed) + if (ev->getPeriod()) { + evMap.insert(std::pair(curPhase + ev->getPeriod(), ev)); + } else { + delete ev; + } + it = evMap.begin(); + } + futex_unlock(&qLock); + } + + void insert(Event* ev, int64_t startDelay = -1) { + futex_lock(&qLock); + uint64_t curPhase = zinfo->numPhases; + uint64_t eventPhase = (startDelay == -1)? 
(curPhase + ev->getPeriod()) : (curPhase + startDelay); + assert(eventPhase >= curPhase); + evMap.insert(std::pair(eventPhase, ev)); + futex_unlock(&qLock); + } +}; + +#endif // EVENT_QUEUE_H_ diff --git a/src/event_recorder.h b/src/event_recorder.h new file mode 100644 index 00000000..18597635 --- /dev/null +++ b/src/event_recorder.h @@ -0,0 +1,131 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef EVENT_RECORDER_H_ +#define EVENT_RECORDER_H_ + +#include "g_std/g_vector.h" +#include "memory_hierarchy.h" +#include "phase_slab_alloc.h" + +class TimingEvent; + +// Encodes an event that the core should capture for the contention simulation +struct TimingRecord { + Address addr; + uint64_t reqCycle; + uint64_t respCycle; + AccessType type; + TimingEvent* startEvent; + TimingEvent* endEvent; +}; + +//class CoreRecorder; +class CrossingEvent; +typedef g_vector CrossingStack; + +class EventRecorder : public GlobAlloc { + private: + PhaseSlabAlloc slabAlloc; + g_vector trStack; + CrossingStack crossingStack; + //CoreRecorder* coreRec; + uint32_t srcId; + + volatile uint64_t lastGapCycles; + PAD(); + volatile uint64_t lastStartSlack; + PAD(); + + public: + EventRecorder() {} + + //Alloc interface + + template + T* alloc() { + return slabAlloc.alloc(); + } + + void* alloc(size_t sz) { + return slabAlloc.alloc(sz); + } + + void advance(uint64_t prodCycle, uint64_t usedCycle) { + slabAlloc.advance(prodCycle, usedCycle); + } + + //Event recording interface + + void pushRecord(const TimingRecord& tr) { + trStack.push_back(tr); + } + + void popRecord() { + trStack.pop_back(); + } + + inline size_t numRecords() const { + return trStack.size(); + } + + TimingRecord getRecord(size_t num) { + return trStack[num]; + } + + inline void clearRecords() { + trStack.clear(); + } + + //Called by crossing events + inline uint64_t getSlack(uint64_t origStartCycle) const { + return origStartCycle + lastStartSlack; + } + + inline uint64_t getGapCycles() const { + return lastGapCycles; + } + + //Called by the core's recorder + //infrequently + void setGapCycles(uint64_t gapCycles) { + lastGapCycles = gapCycles; + } + + //frequently + inline void setStartSlack(uint64_t startSlack) { + //Avoid a write, it can cost a bunch of coherence misses + if (lastStartSlack != startSlack) lastStartSlack = startSlack; + } + + uint32_t getSourceId() const {return srcId;} + void setSourceId(uint32_t i) {srcId = i;} + + inline CrossingStack& getCrossingStack() { + return crossingStack; + } 
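+    // Added usage sketch (mirrors MemControllerBase::access() and
+    // DRAMSimMemory::access() earlier in this change; the names come from that code,
+    // the snippet itself is only illustrative):
+    //   EventRecorder* evRec = zinfo->eventRecorders[req.srcId];
+    //   if (evRec) {
+    //       // the placement-new form allocates the event from this recorder (see alloc() above)
+    //       MemAccessEventBase* ev = new (evRec) MemAccessEventBase(this, type, addr, domain, pre, post);
+    //       ev->setMinStartCycle(req.cycle);
+    //       TimingRecord tr = {addr, req.cycle, respCycle, req.type, ev, ev};
+    //       evRec->pushRecord(tr);
+    //   }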
+}; + +#endif // EVENT_RECORDER_H_ diff --git a/src/fftoggle.cpp b/src/fftoggle.cpp new file mode 100644 index 00000000..cd168873 --- /dev/null +++ b/src/fftoggle.cpp @@ -0,0 +1,73 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +/* Small utility to control ff toggling. */ + +#include +#include +#include +#include "galloc.h" +#include "locks.h" +#include "log.h" +#include "zsim.h" + +int main(int argc, char *argv[]) { + InitLog("[T] "); + if (argc < 3 || argc > 4) { + info("Usage: %s []", argv[0]); + exit(1); + } + + const char* cmd = argv[1]; + int shmid = atoi(argv[2]); + int procIdx = (argc == 4)? atoi(argv[3]) : -1; + + gm_attach(shmid); + while (!gm_isready()) sched_yield(); //wait till proc idx 0 initializes everything; sched_yield to avoid livelock with lots of processes + GlobSimInfo* zinfo = static_cast(gm_get_glob_ptr()); + + if (strcmp(cmd, "ff") == 0) { + if (procIdx < 0) panic("ff needs procIdx"); + futex_unlock(&zinfo->ffToggleLocks[procIdx]); + info("Toggled fast-forward on process %d", procIdx); + } else if (strcmp(argv[1], "pause") == 0) { + if (procIdx < 0) panic("pause needs procIdx"); + futex_unlock(&zinfo->pauseLocks[procIdx]); + info("Unpaused process %d", procIdx); + } else if (strcmp(argv[1], "globpause") == 0) { + if (procIdx >= 0) warn("globpause pauses the whole simulation, you specified a procIdx"); + zinfo->globalPauseFlag = !zinfo->globalPauseFlag; //you will not be stupid enough to run multiple fftoggles at the same time. + __sync_synchronize(); + } else if (strcmp(argv[1], "term") == 0) { + if (procIdx >= 0) warn("term terminates the whole simulation, you specified a procIdx"); + zinfo->externalTermPending = true; + __sync_synchronize(); + info("Marked simulation for termination"); + } else { + panic("Invalid command: %s", cmd); + } + exit(0); +} + diff --git a/src/filter_cache.h b/src/filter_cache.h new file mode 100644 index 00000000..77becbd9 --- /dev/null +++ b/src/filter_cache.h @@ -0,0 +1,171 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef FILTER_CACHE_H_ +#define FILTER_CACHE_H_ + +#include "bithacks.h" +#include "cache.h" +#include "galloc.h" +#include "zsim.h" + +/* Extends Cache with an L0 direct-mapped cache, optimized to hell for hits + * + * L1 lookups are dominated by several kinds of overhead (grab the cache locks, + * several virtual functions for the replacement policy, etc.). This + * specialization of Cache solves these issues by having a filter array that + * holds the most recently used line in each set. Accesses check the filter array, + * and then go through the normal access path. Because there is one line per set, + * it is fine to do this without grabbing a lock. + */ + +class FilterCache : public Cache { + private: + struct FilterEntry { + volatile Address rdAddr; + volatile Address wrAddr; + volatile uint64_t availCycle; + + void clear() {wrAddr = 0; rdAddr = 0; availCycle = 0;} + }; + + //Replicates the most accessed line of each set in the cache + FilterEntry* filterArray; + Address setMask; + uint32_t numSets; + uint32_t srcId; //should match the core + uint32_t reqFlags; + + lock_t filterLock; + uint64_t fGETSHit, fGETXHit; + + public: + FilterCache(uint32_t _numSets, uint32_t _numLines, CC* _cc, CacheArray* _array, + ReplPolicy* _rp, uint32_t _accLat, uint32_t _invLat, g_string& _name) + : Cache(_numLines, _cc, _array, _rp, _accLat, _invLat, _name) + { + numSets = _numSets; + setMask = numSets - 1; + filterArray = gm_memalign(CACHE_LINE_BYTES, numSets); + for (uint32_t i = 0; i < numSets; i++) filterArray[i].clear(); + futex_init(&filterLock); + fGETSHit = fGETXHit = 0; + srcId = -1; + reqFlags = 0; + } + + void setSourceId(uint32_t id) { + srcId = id; + } + + void setFlags(uint32_t flags) { + reqFlags = flags; + } + + void initStats(AggregateStat* parentStat) { + AggregateStat* cacheStat = new AggregateStat(); + cacheStat->init(name.c_str(), "Filter cache stats"); + + ProxyStat* fgetsStat = new ProxyStat(); + fgetsStat->init("fhGETS", "Filtered GETS hits", &fGETSHit); + ProxyStat* fgetxStat = new ProxyStat(); + fgetxStat->init("fhGETX", "Filtered GETX hits", &fGETXHit); + cacheStat->append(fgetsStat); + cacheStat->append(fgetxStat); + + initCacheStats(cacheStat); + parentStat->append(cacheStat); + } + + inline uint64_t load(Address vAddr, uint64_t curCycle) { + Address vLineAddr = vAddr >> lineBits; + uint32_t idx = vLineAddr & setMask; + uint64_t availCycle = filterArray[idx].availCycle; //read before, careful with ordering to avoid timing races + if (vLineAddr == filterArray[idx].rdAddr) { + fGETSHit++; + return MAX(curCycle, availCycle); + } else { + return replace(vLineAddr, idx, true, curCycle); + } + } + + inline uint64_t store(Address vAddr, uint64_t curCycle) { + Address vLineAddr = vAddr >> lineBits; + uint32_t idx = vLineAddr & setMask; + uint64_t availCycle = 
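/* [Editor's note] A small usage sketch with illustrative names, not part of
 * the original file: the core hands load()/store() a virtual address and its
 * current cycle, and gets back the cycle at which the access' data is
 * available. Back-to-back accesses to the same line are resolved by the
 * lock-free filter array; only filter misses take filterLock and go through
 * the full access() path via replace().
 *
 *   uint64_t issueAccesses(FilterCache* l1d, Address lineAddr, uint64_t curCycle) {
 *       curCycle = l1d->load(lineAddr, curCycle);     //cold: filter miss, replace() + access()
 *       curCycle = l1d->load(lineAddr + 8, curCycle); //same line (line-aligned addr): fhGETS hit
 *       curCycle = l1d->store(lineAddr, curCycle);    //first write: wrAddr filter miss, GETX upgrade
 *       curCycle = l1d->store(lineAddr, curCycle);    //subsequent writes: fhGETX hit
 *       return curCycle;
 *   }
 *
 * Invalidations and contextSwitch() clear the affected filter entries, so
 * the next access to those lines goes through the full access() path again.
 */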
filterArray[idx].availCycle; //read before, careful with ordering to avoid timing races + if (vLineAddr == filterArray[idx].wrAddr) { + fGETXHit++; + //NOTE: Stores don't modify availCycle; we'll catch matches in the core + //filterArray[idx].availCycle = curCycle; //do optimistic store-load forwarding + return MAX(curCycle, availCycle); + } else { + return replace(vLineAddr, idx, false, curCycle); + } + } + + uint64_t replace(Address vLineAddr, uint32_t idx, bool isLoad, uint64_t curCycle) { + Address pLineAddr = procMask | vLineAddr; + MESIState dummyState = MESIState::I; + futex_lock(&filterLock); + MemReq req = {pLineAddr, isLoad? GETS : GETX, 0, &dummyState, curCycle, &filterLock, dummyState, srcId, reqFlags}; + uint64_t respCycle = access(req); + + //Due to the way we do the locking, at this point the old address might be invalidated, but we have the new address guaranteed until we release the lock + + //Careful with this order + Address oldAddr = filterArray[idx].rdAddr; + filterArray[idx].wrAddr = isLoad? -1L : vLineAddr; + filterArray[idx].rdAddr = vLineAddr; + + //For LSU simulation purposes, loads bypass stores even to the same line if there is no conflict, + //(e.g., st to x, ld from x+8) and we implement store-load forwarding at the core. + //So if this is a load, it always sets availCycle; if it is a store hit, it doesn't + if (oldAddr != vLineAddr) filterArray[idx].availCycle = respCycle; + + futex_unlock(&filterLock); + return respCycle; + } + + //NOTE: reqWriteback is pulled up to true, but not pulled down to false. + uint64_t invalidate(Address lineAddr, InvType type, bool* reqWriteback, uint64_t cycle, uint32_t srcId) { + futex_lock(&filterLock); + uint32_t idx = lineAddr & setMask; //works because of how virtual<->physical is done... + if ((filterArray[idx].rdAddr | procMask) == lineAddr) { //FIXME: If another process calls invalidate(), procMask will not match even though we may be doing a capacity-induced invalidation! + filterArray[idx].wrAddr = -1L; + filterArray[idx].rdAddr = -1L; + } + futex_unlock(&filterLock); + uint64_t respCycle = Cache::invalidate(lineAddr, type, reqWriteback, cycle, srcId); + return respCycle; + } + + void contextSwitch() { + futex_lock(&filterLock); + for (uint32_t i = 0; i < numSets; i++) filterArray[i].clear(); + futex_unlock(&filterLock); + } +}; + +#endif // FILTER_CACHE_H_ diff --git a/src/g_heap/dlmalloc.h.c b/src/g_heap/dlmalloc.h.c new file mode 100644 index 00000000..2a1a5f86 --- /dev/null +++ b/src/g_heap/dlmalloc.h.c @@ -0,0 +1,5798 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +//dsm: zsim-specific configuration +//No global space; we will use a single shared mspace +#define ONLY_MSPACES 1 + +//Do not use locks, even though this is multithreaded. Locking is done externally to dlmalloc, +//because dlmalloc locks are hard to make work with Pin (multiprocess, and pthread_self() does not work) +#define USE_LOCKS 0 + + +//All calls to system memory allocators (sbrk/mmap) must fail; We will provide the memory chunk to manage. +#define HAVE_MORECORE 0 +#define HAVE_MMAP 0 +#define HAVE_MREMAP 0 + +/* NOTE: I have made almost no changes beyond adding some #error defines (so + * e.g. this will kick and scream if you try to compile it under windows) and + * add some comments below. + * + * In terms of data structures and synchronization, this works as follows: With + * ONLY_MSPACES, the only global data structures are the global lock and the + * mparams data structure. In zsim, these are *per process*. This is totally + * fine, as the global lock is used to 1) Protect mparams initialization, and + * 2) Synchronize calls to the system allocator (mmap, etc). Since we initialize + * mparams per process and do not ever get memory from the system, this is fine. + * + * The only code change is to set the magic number in init_mparams to be fixed + * instead of randomly derived. The reason is that there is one mparams per + * process, so it is initialized multiple times. + */ + +/* + This is a version (aka dlmalloc) of malloc/free/realloc written by + Doug Lea and released to the public domain, as explained at + http://creativecommons.org/licenses/publicdomain. Send questions, + comments, complaints, performance data, etc to dl@cs.oswego.edu + +* Version 2.8.4 Wed May 27 09:56:23 2009 Doug Lea (dl at gee) + + Note: There may be an updated version of this malloc obtainable at + ftp://gee.cs.oswego.edu/pub/misc/malloc.c + Check before installing! + +* Quickstart + + This library is all in one file to simplify the most common usage: + ftp it, compile it (-O3), and link it into another program. All of + the compile-time options default to reasonable values for use on + most platforms. You might later want to step through various + compile-time and dynamic tuning options. + + For convenience, an include file for code using this malloc is at: + ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.4.h + You don't really need this .h file unless you call functions not + defined in your system include files. The .h file contains only the + excerpts from this file needed for using this malloc on ANSI C/C++ + systems, so long as you haven't changed compile-time options about + naming and tuning parameters. If you do, then you can create your + own malloc.h that does include all settings by cutting at the point + indicated below. Note that you may already by default be using a C + library containing a malloc that is based on some version of this + malloc (for example in linux). You might still want to use the one + in this file to customize settings or to avoid overheads associated + with library versions. + +* Vital statistics: + + Supported pointer/size_t representation: 4 or 8 bytes + size_t MUST be an unsigned type of the same width as + pointers. (If you are using an ancient system that declares + size_t as a signed type, or need it to be a different width + than pointers, you can use a previous release of this malloc + (e.g. 2.7.2) supporting these.) 
+ + Alignment: 8 bytes (default) + This suffices for nearly all current machines and C compilers. + However, you can define MALLOC_ALIGNMENT to be wider than this + if necessary (up to 128bytes), at the expense of using more space. + + Minimum overhead per allocated chunk: 4 or 8 bytes (if 4byte sizes) + 8 or 16 bytes (if 8byte sizes) + Each malloced chunk has a hidden word of overhead holding size + and status information, and additional cross-check word + if FOOTERS is defined. + + Minimum allocated size: 4-byte ptrs: 16 bytes (including overhead) + 8-byte ptrs: 32 bytes (including overhead) + + Even a request for zero bytes (i.e., malloc(0)) returns a + pointer to something of the minimum allocatable size. + The maximum overhead wastage (i.e., number of extra bytes + allocated than were requested in malloc) is less than or equal + to the minimum size, except for requests >= mmap_threshold that + are serviced via mmap(), where the worst case wastage is about + 32 bytes plus the remainder from a system page (the minimal + mmap unit); typically 4096 or 8192 bytes. + + Security: static-safe; optionally more or less + The "security" of malloc refers to the ability of malicious + code to accentuate the effects of errors (for example, freeing + space that is not currently malloc'ed or overwriting past the + ends of chunks) in code that calls malloc. This malloc + guarantees not to modify any memory locations below the base of + heap, i.e., static variables, even in the presence of usage + errors. The routines additionally detect most improper frees + and reallocs. All this holds as long as the static bookkeeping + for malloc itself is not corrupted by some other means. This + is only one aspect of security -- these checks do not, and + cannot, detect all possible programming errors. + + If FOOTERS is defined nonzero, then each allocated chunk + carries an additional check word to verify that it was malloced + from its space. These check words are the same within each + execution of a program using malloc, but differ across + executions, so externally crafted fake chunks cannot be + freed. This improves security by rejecting frees/reallocs that + could corrupt heap memory, in addition to the checks preventing + writes to statics that are always on. This may further improve + security at the expense of time and space overhead. (Note that + FOOTERS may also be worth using with MSPACES.) + + By default detected errors cause the program to abort (calling + "abort()"). You can override this to instead proceed past + errors by defining PROCEED_ON_ERROR. In this case, a bad free + has no effect, and a malloc that encounters a bad address + caused by user overwrites will ignore the bad address by + dropping pointers and indices to all known memory. This may + be appropriate for programs that should continue if at all + possible in the face of programming errors, although they may + run out of memory because dropped memory is never reclaimed. + + If you don't like either of these options, you can define + CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything + else. And if if you are sure that your program using malloc has + no errors or vulnerabilities, you can define INSECURE to 1, + which might (or might not) provide a small performance improvement. + + Thread-safety: NOT thread-safe unless USE_LOCKS defined + When USE_LOCKS is defined, each public call to malloc, free, + etc is surrounded with either a pthread mutex or a win32 + spinlock (depending on WIN32). 
This is not especially fast, and + can be a major bottleneck. It is designed only to provide + minimal protection in concurrent environments, and to provide a + basis for extensions. If you are using malloc in a concurrent + program, consider instead using nedmalloc + (http://www.nedprod.com/programs/portable/nedmalloc/) or + ptmalloc (See http://www.malloc.de), which are derived + from versions of this malloc. + + System requirements: Any combination of MORECORE and/or MMAP/MUNMAP + This malloc can use unix sbrk or any emulation (invoked using + the CALL_MORECORE macro) and/or mmap/munmap or any emulation + (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system + memory. On most unix systems, it tends to work best if both + MORECORE and MMAP are enabled. On Win32, it uses emulations + based on VirtualAlloc. It also uses common C library functions + like memset. + + Compliance: I believe it is compliant with the Single Unix Specification + (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably + others as well. + +* Overview of algorithms + + This is not the fastest, most space-conserving, most portable, or + most tunable malloc ever written. However it is among the fastest + while also being among the most space-conserving, portable and + tunable. Consistent balance across these factors results in a good + general-purpose allocator for malloc-intensive programs. + + In most ways, this malloc is a best-fit allocator. Generally, it + chooses the best-fitting existing chunk for a request, with ties + broken in approximately least-recently-used order. (This strategy + normally maintains low fragmentation.) However, for requests less + than 256bytes, it deviates from best-fit when there is not an + exactly fitting available chunk by preferring to use space adjacent + to that used for the previous small request, as well as by breaking + ties in approximately most-recently-used order. (These enhance + locality of series of small allocations.) And for very large requests + (>= 256Kb by default), it relies on system memory mapping + facilities, if supported. (This helps avoid carrying around and + possibly fragmenting memory used only for large chunks.) + + All operations (except malloc_stats and mallinfo) have execution + times that are bounded by a constant factor of the number of bits in + a size_t, not counting any clearing in calloc or copying in realloc, + or actions surrounding MORECORE and MMAP that have times + proportional to the number of non-contiguous regions returned by + system allocation routines, which is often just 1. In real-time + applications, you can optionally suppress segment traversals using + NO_SEGMENT_TRAVERSAL, which assures bounded execution even when + system allocators return non-contiguous spaces, at the typical + expense of carrying around more memory and increased fragmentation. + + The implementation is not very modular and seriously overuses + macros. Perhaps someday all C compilers will do as good a job + inlining modular code as can now be done by brute-force expansion, + but now, enough of them seem not to. + + Some compilers issue a lot of warnings about code that is + dead/unreachable only on some platforms, and also about intentional + uses of negation on unsigned types. All known cases of each can be + ignored. 
+ + For a longer but out of date high-level description, see + http://gee.cs.oswego.edu/dl/html/malloc.html + +* MSPACES + If MSPACES is defined, then in addition to malloc, free, etc., + this file also defines mspace_malloc, mspace_free, etc. These + are versions of malloc routines that take an "mspace" argument + obtained using create_mspace, to control all internal bookkeeping. + If ONLY_MSPACES is defined, only these versions are compiled. + So if you would like to use this allocator for only some allocations, + and your system malloc for others, you can compile with + ONLY_MSPACES and then do something like... + static mspace mymspace = create_mspace(0,0); // for example + #define mymalloc(bytes) mspace_malloc(mymspace, bytes) + + (Note: If you only need one instance of an mspace, you can instead + use "USE_DL_PREFIX" to relabel the global malloc.) + + You can similarly create thread-local allocators by storing + mspaces as thread-locals. For example: + static __thread mspace tlms = 0; + void* tlmalloc(size_t bytes) { + if (tlms == 0) tlms = create_mspace(0, 0); + return mspace_malloc(tlms, bytes); + } + void tlfree(void* mem) { mspace_free(tlms, mem); } + + Unless FOOTERS is defined, each mspace is completely independent. + You cannot allocate from one and free to another (although + conformance is only weakly checked, so usage errors are not always + caught). If FOOTERS is defined, then each chunk carries around a tag + indicating its originating mspace, and frees are directed to their + originating spaces. + + ------------------------- Compile-time options --------------------------- + +Be careful in setting #define values for numerical constants of type +size_t. On some systems, literal values are not automatically extended +to size_t precision unless they are explicitly casted. You can also +use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below. + +WIN32 default: defined if _WIN32 defined + Defining WIN32 sets up defaults for MS environment and compilers. + Otherwise defaults are for unix. Beware that there seem to be some + cases where this malloc might not be a pure drop-in replacement for + Win32 malloc: Random-looking failures from Win32 GDI API's (eg; + SetDIBits()) may be due to bugs in some video driver implementations + when pixel buffers are malloc()ed, and the region spans more than + one VirtualAlloc()ed region. Because dlmalloc uses a small (64Kb) + default granularity, pixel buffers may straddle virtual allocation + regions more often than when using the Microsoft allocator. You can + avoid this by using VirtualAlloc() and VirtualFree() for all pixel + buffers rather than using malloc(). If this is not possible, + recompile this malloc with a larger DEFAULT_GRANULARITY. + +MALLOC_ALIGNMENT default: (size_t)8 + Controls the minimum alignment for malloc'ed chunks. It must be a + power of two and at least 8, even on machines for which smaller + alignments would suffice. It may be defined as larger than this + though. Note however that code and data structures are optimized for + the case of 8-byte alignment. + +MSPACES default: 0 (false) + If true, compile in support for independent allocation spaces. + This is only supported if HAVE_MMAP is true. + +ONLY_MSPACES default: 0 (false) + If true, only compile in mspace versions, not regular versions. + +USE_LOCKS default: 0 (false) + Causes each call to each public routine to be surrounded with + pthread or WIN32 mutex lock/unlock. (If set true, this can be + overridden on a per-mspace basis for mspace versions.) 
If set to a + non-zero value other than 1, locks are used, but their + implementation is left out, so lock functions must be supplied manually, + as described below. + +USE_SPIN_LOCKS default: 1 iff USE_LOCKS and on x86 using gcc or MSC + If true, uses custom spin locks for locking. This is currently + supported only for x86 platforms using gcc or recent MS compilers. + Otherwise, posix locks or win32 critical sections are used. + +FOOTERS default: 0 + If true, provide extra checking and dispatching by placing + information in the footers of allocated chunks. This adds + space and time overhead. + +INSECURE default: 0 + If true, omit checks for usage errors and heap space overwrites. + +USE_DL_PREFIX default: NOT defined + Causes compiler to prefix all public routines with the string 'dl'. + This can be useful when you only want to use this malloc in one part + of a program, using your regular system malloc elsewhere. + +ABORT default: defined as abort() + Defines how to abort on failed checks. On most systems, a failed + check cannot die with an "assert" or even print an informative + message, because the underlying print routines in turn call malloc, + which will fail again. Generally, the best policy is to simply call + abort(). It's not very useful to do more than this because many + errors due to overwriting will show up as address faults (null, odd + addresses etc) rather than malloc-triggered checks, so will also + abort. Also, most compilers know that abort() does not return, so + can better optimize code conditionally calling it. + +PROCEED_ON_ERROR default: defined as 0 (false) + Controls whether detected bad addresses cause them to bypassed + rather than aborting. If set, detected bad arguments to free and + realloc are ignored. And all bookkeeping information is zeroed out + upon a detected overwrite of freed heap space, thus losing the + ability to ever return it from malloc again, but enabling the + application to proceed. If PROCEED_ON_ERROR is defined, the + static variable malloc_corruption_error_count is compiled in + and can be examined to see if errors have occurred. This option + generates slower code than the default abort policy. + +DEBUG default: NOT defined + The DEBUG setting is mainly intended for people trying to modify + this code or diagnose problems when porting to new platforms. + However, it may also be able to better isolate user errors than just + using runtime checks. The assertions in the check routines spell + out in more detail the assumptions and invariants underlying the + algorithms. The checking is fairly extensive, and will slow down + execution noticeably. Calling malloc_stats or mallinfo with DEBUG + set will attempt to check every non-mmapped allocated and free chunk + in the course of computing the summaries. + +ABORT_ON_ASSERT_FAILURE default: defined as 1 (true) + Debugging assertion failures can be nearly impossible if your + version of the assert macro causes malloc to be called, which will + lead to a cascade of further failures, blowing the runtime stack. + ABORT_ON_ASSERT_FAILURE cause assertions failures to call abort(), + which will usually make debugging easier. + +MALLOC_FAILURE_ACTION default: sets errno to ENOMEM, or no-op on win32 + The action to take before "return 0" when malloc fails to be able to + return memory because there is none available. + +HAVE_MORECORE default: 1 (true) unless win32 or ONLY_MSPACES + True if this system supports sbrk or an emulation of it. 
+ +MORECORE default: sbrk + The name of the sbrk-style system routine to call to obtain more + memory. See below for guidance on writing custom MORECORE + functions. The type of the argument to sbrk/MORECORE varies across + systems. It cannot be size_t, because it supports negative + arguments, so it is normally the signed type of the same width as + size_t (sometimes declared as "intptr_t"). It doesn't much matter + though. Internally, we only call it with arguments less than half + the max value of a size_t, which should work across all reasonable + possibilities, although sometimes generating compiler warnings. + +MORECORE_CONTIGUOUS default: 1 (true) if HAVE_MORECORE + If true, take advantage of fact that consecutive calls to MORECORE + with positive arguments always return contiguous increasing + addresses. This is true of unix sbrk. It does not hurt too much to + set it true anyway, since malloc copes with non-contiguities. + Setting it false when definitely non-contiguous saves time + and possibly wasted space it would take to discover this though. + +MORECORE_CANNOT_TRIM default: NOT defined + True if MORECORE cannot release space back to the system when given + negative arguments. This is generally necessary only if you are + using a hand-crafted MORECORE function that cannot handle negative + arguments. + +NO_SEGMENT_TRAVERSAL default: 0 + If non-zero, suppresses traversals of memory segments + returned by either MORECORE or CALL_MMAP. This disables + merging of segments that are contiguous, and selectively + releasing them to the OS if unused, but bounds execution times. + +HAVE_MMAP default: 1 (true) + True if this system supports mmap or an emulation of it. If so, and + HAVE_MORECORE is not true, MMAP is used for all system + allocation. If set and HAVE_MORECORE is true as well, MMAP is + primarily used to directly allocate very large blocks. It is also + used as a backup strategy in cases where MORECORE fails to provide + space from system. Note: A single call to MUNMAP is assumed to be + able to unmap memory that may have be allocated using multiple calls + to MMAP, so long as they are adjacent. + +HAVE_MREMAP default: 1 on linux, else 0 + If true realloc() uses mremap() to re-allocate large blocks and + extend or shrink allocation spaces. + +MMAP_CLEARS default: 1 except on WINCE. + True if mmap clears memory so calloc doesn't need to. This is true + for standard unix mmap using /dev/zero and on WIN32 except for WINCE. + +USE_BUILTIN_FFS default: 0 (i.e., not used) + Causes malloc to use the builtin ffs() function to compute indices. + Some compilers may recognize and intrinsify ffs to be faster than the + supplied C version. Also, the case of x86 using gcc is special-cased + to an asm instruction, so is already as fast as it can be, and so + this setting has no effect. Similarly for Win32 under recent MS compilers. + (On most x86s, the asm version is only slightly faster than the C version.) + +malloc_getpagesize default: derive from system includes, or 4096. + The system page size. To the extent possible, this malloc manages + memory from the system in page-size units. This may be (and + usually is) a function rather than a constant. This is ignored + if WIN32, where page size is determined using getSystemInfo during + initialization. + +USE_DEV_RANDOM default: 0 (i.e., not used) + Causes malloc to use /dev/random to initialize secure magic seed for + stamping footers. Otherwise, the current time is used. + +NO_MALLINFO default: 0 + If defined, don't compile "mallinfo". 
This can be a simple way + of dealing with mismatches between system declarations and + those in this file. + +MALLINFO_FIELD_TYPE default: size_t + The type of the fields in the mallinfo struct. This was originally + defined as "int" in SVID etc, but is more usefully defined as + size_t. The value is used only if HAVE_USR_INCLUDE_MALLOC_H is not set + +REALLOC_ZERO_BYTES_FREES default: not defined + This should be set if a call to realloc with zero bytes should + be the same as a call to free. Some people think it should. Otherwise, + since this malloc returns a unique pointer for malloc(0), so does + realloc(p, 0). + +LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H +LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H, LACKS_ERRNO_H +LACKS_STDLIB_H default: NOT defined unless on WIN32 + Define these if your system does not have these header files. + You might need to manually insert some of the declarations they provide. + +DEFAULT_GRANULARITY default: page size if MORECORE_CONTIGUOUS, + system_info.dwAllocationGranularity in WIN32, + otherwise 64K. + Also settable using mallopt(M_GRANULARITY, x) + The unit for allocating and deallocating memory from the system. On + most systems with contiguous MORECORE, there is no reason to + make this more than a page. However, systems with MMAP tend to + either require or encourage larger granularities. You can increase + this value to prevent system allocation functions to be called so + often, especially if they are slow. The value must be at least one + page and must be a power of two. Setting to 0 causes initialization + to either page size or win32 region size. (Note: In previous + versions of malloc, the equivalent of this option was called + "TOP_PAD") + +DEFAULT_TRIM_THRESHOLD default: 2MB + Also settable using mallopt(M_TRIM_THRESHOLD, x) + The maximum amount of unused top-most memory to keep before + releasing via malloc_trim in free(). Automatic trimming is mainly + useful in long-lived programs using contiguous MORECORE. Because + trimming via sbrk can be slow on some systems, and can sometimes be + wasteful (in cases where programs immediately afterward allocate + more large chunks) the value should be high enough so that your + overall system performance would improve by releasing this much + memory. As a rough guide, you might set to a value close to the + average size of a process (program) running on your system. + Releasing this much memory would allow such a process to run in + memory. Generally, it is worth tuning trim thresholds when a + program undergoes phases where several large chunks are allocated + and released in ways that can reuse each other's storage, perhaps + mixed with phases where there are no such chunks at all. The trim + value must be greater than page size to have any useful effect. To + disable trimming completely, you can set to MAX_SIZE_T. Note that the trick + some people use of mallocing a huge space and then freeing it at + program startup, in an attempt to reserve system memory, doesn't + have the intended effect under automatic trimming, since that memory + will immediately be returned to the system. + +DEFAULT_MMAP_THRESHOLD default: 256K + Also settable using mallopt(M_MMAP_THRESHOLD, x) + The request size threshold for using MMAP to directly service a + request. Requests of at least this size that cannot be allocated + using already-existing space will be serviced via mmap. (If enough + normal freed space already exists it is used instead.) 
Using mmap + segregates relatively large chunks of memory so that they can be + individually obtained and released from the host system. A request + serviced through mmap is never reused by any other request (at least + not directly; the system may just so happen to remap successive + requests to the same locations). Segregating space in this way has + the benefits that: Mmapped space can always be individually released + back to the system, which helps keep the system level memory demands + of a long-lived program low. Also, mapped memory doesn't become + `locked' between other chunks, as can happen with normally allocated + chunks, which means that even trimming via malloc_trim would not + release them. However, it has the disadvantage that the space + cannot be reclaimed, consolidated, and then used to service later + requests, as happens with normal chunks. The advantages of mmap + nearly always outweigh disadvantages for "large" chunks, but the + value of "large" may vary across systems. The default is an + empirically derived value that works well in most systems. You can + disable mmap by setting to MAX_SIZE_T. + +MAX_RELEASE_CHECK_RATE default: 4095 unless not HAVE_MMAP + The number of consolidated frees between checks to release + unused segments when freeing. When using non-contiguous segments, + especially with multiple mspaces, checking only for topmost space + doesn't always suffice to trigger trimming. To compensate for this, + free() will, with a period of MAX_RELEASE_CHECK_RATE (or the + current number of segments, if greater) try to release unused + segments to the OS when freeing chunks that result in + consolidation. The best value for this parameter is a compromise + between slowing down frees with relatively costly checks that + rarely trigger versus holding on to unused memory. To effectively + disable, set to MAX_SIZE_T. This may lead to a very slight speed + improvement at the expense of carrying around more memory. 
+*/ + +/* Version identifier to allow people to support multiple versions */ +#ifndef DLMALLOC_VERSION +#define DLMALLOC_VERSION 20804 +#endif /* DLMALLOC_VERSION */ + +#ifndef WIN32 +#ifdef _WIN32 +#define WIN32 1 +#endif /* _WIN32 */ +#ifdef _WIN32_WCE +#define LACKS_FCNTL_H +#define WIN32 1 +#endif /* _WIN32_WCE */ +#endif /* WIN32 */ +#ifdef WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#define HAVE_MMAP 1 +#define HAVE_MORECORE 0 +#define LACKS_UNISTD_H +#define LACKS_SYS_PARAM_H +#define LACKS_SYS_MMAN_H +#define LACKS_STRING_H +#define LACKS_STRINGS_H +#define LACKS_SYS_TYPES_H +#define LACKS_ERRNO_H +#ifndef MALLOC_FAILURE_ACTION +#define MALLOC_FAILURE_ACTION +#endif /* MALLOC_FAILURE_ACTION */ +#ifdef _WIN32_WCE /* WINCE reportedly does not clear */ +#define MMAP_CLEARS 0 +#else +#define MMAP_CLEARS 1 +#endif /* _WIN32_WCE */ +#endif /* WIN32 */ + +#if defined(DARWIN) || defined(_DARWIN) +/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */ +#ifndef HAVE_MORECORE +#define HAVE_MORECORE 0 +#define HAVE_MMAP 1 +/* OSX allocators provide 16 byte alignment */ +#ifndef MALLOC_ALIGNMENT +#define MALLOC_ALIGNMENT ((size_t)16U) +#endif +#endif /* HAVE_MORECORE */ +#endif /* DARWIN */ + +#ifndef LACKS_SYS_TYPES_H +#include /* For size_t */ +#endif /* LACKS_SYS_TYPES_H */ + +#if (defined(__GNUC__) && ((defined(__i386__) || defined(__x86_64__)))) || (defined(_MSC_VER) && _MSC_VER>=1310) +#define SPIN_LOCKS_AVAILABLE 1 +#else +#define SPIN_LOCKS_AVAILABLE 0 +#endif + +/* The maximum possible size_t value has all bits set */ +#define MAX_SIZE_T (~(size_t)0) + +#ifndef ONLY_MSPACES +#define ONLY_MSPACES 0 /* define to a value */ +#else +#define ONLY_MSPACES 1 +#endif /* ONLY_MSPACES */ +#ifndef MSPACES +#if ONLY_MSPACES +#define MSPACES 1 +#else /* ONLY_MSPACES */ +#define MSPACES 0 +#endif /* ONLY_MSPACES */ +#endif /* MSPACES */ +#ifndef MALLOC_ALIGNMENT +#define MALLOC_ALIGNMENT ((size_t)8U) +#endif /* MALLOC_ALIGNMENT */ +#ifndef FOOTERS +#define FOOTERS 0 +#endif /* FOOTERS */ +#ifndef ABORT +#define ABORT abort() +#endif /* ABORT */ +#ifndef ABORT_ON_ASSERT_FAILURE +#define ABORT_ON_ASSERT_FAILURE 1 +#endif /* ABORT_ON_ASSERT_FAILURE */ +#ifndef PROCEED_ON_ERROR +#define PROCEED_ON_ERROR 0 +#endif /* PROCEED_ON_ERROR */ +#ifndef USE_LOCKS +#define USE_LOCKS 0 +#endif /* USE_LOCKS */ +#ifndef USE_SPIN_LOCKS +#if USE_LOCKS && SPIN_LOCKS_AVAILABLE +#define USE_SPIN_LOCKS 1 +#else +#define USE_SPIN_LOCKS 0 +#endif /* USE_LOCKS && SPIN_LOCKS_AVAILABLE. 
*/ +#endif /* USE_SPIN_LOCKS */ +#ifndef INSECURE +#define INSECURE 0 +#endif /* INSECURE */ +#ifndef HAVE_MMAP +#define HAVE_MMAP 1 +#endif /* HAVE_MMAP */ +#ifndef MMAP_CLEARS +#define MMAP_CLEARS 1 +#endif /* MMAP_CLEARS */ +#ifndef HAVE_MREMAP +#ifdef linux +#define HAVE_MREMAP 1 +#else /* linux */ +#define HAVE_MREMAP 0 +#endif /* linux */ +#endif /* HAVE_MREMAP */ +#ifndef MALLOC_FAILURE_ACTION +#define MALLOC_FAILURE_ACTION errno = ENOMEM; +#endif /* MALLOC_FAILURE_ACTION */ +#ifndef HAVE_MORECORE +#if ONLY_MSPACES +#define HAVE_MORECORE 0 +#else /* ONLY_MSPACES */ +#define HAVE_MORECORE 1 +#endif /* ONLY_MSPACES */ +#endif /* HAVE_MORECORE */ +#if !HAVE_MORECORE +#define MORECORE_CONTIGUOUS 0 +#else /* !HAVE_MORECORE */ +#define MORECORE_DEFAULT sbrk +#ifndef MORECORE_CONTIGUOUS +#define MORECORE_CONTIGUOUS 1 +#endif /* MORECORE_CONTIGUOUS */ +#endif /* HAVE_MORECORE */ +#ifndef DEFAULT_GRANULARITY +#if (MORECORE_CONTIGUOUS || defined(WIN32)) +#define DEFAULT_GRANULARITY (0) /* 0 means to compute in init_mparams */ +#else /* MORECORE_CONTIGUOUS */ +#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U) +#endif /* MORECORE_CONTIGUOUS */ +#endif /* DEFAULT_GRANULARITY */ +#ifndef DEFAULT_TRIM_THRESHOLD +#ifndef MORECORE_CANNOT_TRIM +#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U) +#else /* MORECORE_CANNOT_TRIM */ +#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T +#endif /* MORECORE_CANNOT_TRIM */ +#endif /* DEFAULT_TRIM_THRESHOLD */ +#ifndef DEFAULT_MMAP_THRESHOLD +#if HAVE_MMAP +#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U) +#else /* HAVE_MMAP */ +#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T +#endif /* HAVE_MMAP */ +#endif /* DEFAULT_MMAP_THRESHOLD */ +#ifndef MAX_RELEASE_CHECK_RATE +#if HAVE_MMAP +#define MAX_RELEASE_CHECK_RATE 4095 +#else +#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T +#endif /* HAVE_MMAP */ +#endif /* MAX_RELEASE_CHECK_RATE */ +#ifndef USE_BUILTIN_FFS +#define USE_BUILTIN_FFS 0 +#endif /* USE_BUILTIN_FFS */ +#ifndef USE_DEV_RANDOM +#define USE_DEV_RANDOM 0 +#endif /* USE_DEV_RANDOM */ +#ifndef NO_MALLINFO +#define NO_MALLINFO 0 +#endif /* NO_MALLINFO */ +#ifndef MALLINFO_FIELD_TYPE +#define MALLINFO_FIELD_TYPE size_t +#endif /* MALLINFO_FIELD_TYPE */ +#ifndef NO_SEGMENT_TRAVERSAL +#define NO_SEGMENT_TRAVERSAL 0 +#endif /* NO_SEGMENT_TRAVERSAL */ + +/* + mallopt tuning options. SVID/XPG defines four standard parameter + numbers for mallopt, normally defined in malloc.h. None of these + are used in this malloc, so setting them has no effect. But this + malloc does support the following options. +*/ + +#define M_TRIM_THRESHOLD (-1) +#define M_GRANULARITY (-2) +#define M_MMAP_THRESHOLD (-3) + +/* ------------------------ Mallinfo declarations ------------------------ */ + +#if !NO_MALLINFO +/* + This version of malloc supports the standard SVID/XPG mallinfo + routine that returns a struct containing usage properties and + statistics. It should work on any system that has a + /usr/include/malloc.h defining struct mallinfo. The main + declaration needed is the mallinfo struct that is returned (by-copy) + by mallinfo(). The malloinfo struct contains a bunch of fields that + are not even meaningful in this version of malloc. These fields are + are instead filled by mallinfo() with other numbers that might be of + interest. + + HAVE_USR_INCLUDE_MALLOC_H should be set if you have a + /usr/include/malloc.h file that includes a declaration of struct + mallinfo. If so, it is included; else a compliant version is + declared below. 
These must be precisely the same for mallinfo() to + work. The original SVID version of this struct, defined on most + systems with mallinfo, declares all fields as ints. But some others + define as unsigned long. If your system defines the fields using a + type of different width than listed here, you MUST #include your + system version and #define HAVE_USR_INCLUDE_MALLOC_H. +*/ + +/* #define HAVE_USR_INCLUDE_MALLOC_H */ + +#ifdef HAVE_USR_INCLUDE_MALLOC_H +#include "/usr/include/malloc.h" +#else /* HAVE_USR_INCLUDE_MALLOC_H */ +#ifndef STRUCT_MALLINFO_DECLARED +#define STRUCT_MALLINFO_DECLARED 1 +struct mallinfo { + MALLINFO_FIELD_TYPE arena; /* non-mmapped space allocated from system */ + MALLINFO_FIELD_TYPE ordblks; /* number of free chunks */ + MALLINFO_FIELD_TYPE smblks; /* always 0 */ + MALLINFO_FIELD_TYPE hblks; /* always 0 */ + MALLINFO_FIELD_TYPE hblkhd; /* space in mmapped regions */ + MALLINFO_FIELD_TYPE usmblks; /* maximum total allocated space */ + MALLINFO_FIELD_TYPE fsmblks; /* always 0 */ + MALLINFO_FIELD_TYPE uordblks; /* total allocated space */ + MALLINFO_FIELD_TYPE fordblks; /* total free space */ + MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */ +}; +#endif /* STRUCT_MALLINFO_DECLARED */ +#endif /* HAVE_USR_INCLUDE_MALLOC_H */ +#endif /* NO_MALLINFO */ + +/* + Try to persuade compilers to inline. The most critical functions for + inlining are defined as macros, so these aren't used for them. +*/ + +#ifndef FORCEINLINE + #if defined(__GNUC__) +#define FORCEINLINE __inline __attribute__ ((always_inline)) + #elif defined(_MSC_VER) + #define FORCEINLINE __forceinline + #endif +#endif +#ifndef NOINLINE + #if defined(__GNUC__) + #define NOINLINE __attribute__ ((noinline)) + #elif defined(_MSC_VER) + #define NOINLINE __declspec(noinline) + #else + #define NOINLINE + #endif +#endif + +#ifdef __cplusplus +extern "C" { +#ifndef FORCEINLINE + #define FORCEINLINE inline +#endif +#endif /* __cplusplus */ +#ifndef FORCEINLINE + #define FORCEINLINE +#endif + +#if !ONLY_MSPACES + +/* ------------------- Declarations of public routines ------------------- */ + +#ifndef USE_DL_PREFIX +#define dlcalloc calloc +#define dlfree free +#define dlmalloc malloc +#define dlmemalign memalign +#define dlrealloc realloc +#define dlvalloc valloc +#define dlpvalloc pvalloc +#define dlmallinfo mallinfo +#define dlmallopt mallopt +#define dlmalloc_trim malloc_trim +#define dlmalloc_stats malloc_stats +#define dlmalloc_usable_size malloc_usable_size +#define dlmalloc_footprint malloc_footprint +#define dlmalloc_max_footprint malloc_max_footprint +#define dlindependent_calloc independent_calloc +#define dlindependent_comalloc independent_comalloc +#endif /* USE_DL_PREFIX */ + + +/* + malloc(size_t n) + Returns a pointer to a newly allocated chunk of at least n bytes, or + null if no space is available, in which case errno is set to ENOMEM + on ANSI C systems. + + If n is zero, malloc returns a minimum-sized chunk. (The minimum + size is 16 bytes on most 32bit systems, and 32 bytes on 64bit + systems.) Note that size_t is an unsigned type, so calls with + arguments that would be negative if signed are interpreted as + requests for huge amounts of space, which will often fail. The + maximum supported value of n differs across systems, but is in all + cases less than the maximum representable value of a size_t. 
+*/ +void* dlmalloc(size_t); + +/* + free(void* p) + Releases the chunk of memory pointed to by p, that had been previously + allocated using malloc or a related routine such as realloc. + It has no effect if p is null. If p was not malloced or already + freed, free(p) will by default cause the current program to abort. +*/ +void dlfree(void*); + +/* + calloc(size_t n_elements, size_t element_size); + Returns a pointer to n_elements * element_size bytes, with all locations + set to zero. +*/ +void* dlcalloc(size_t, size_t); + +/* + realloc(void* p, size_t n) + Returns a pointer to a chunk of size n that contains the same data + as does chunk p up to the minimum of (n, p's size) bytes, or null + if no space is available. + + The returned pointer may or may not be the same as p. The algorithm + prefers extending p in most cases when possible, otherwise it + employs the equivalent of a malloc-copy-free sequence. + + If p is null, realloc is equivalent to malloc. + + If space is not available, realloc returns null, errno is set (if on + ANSI) and p is NOT freed. + + if n is for fewer bytes than already held by p, the newly unused + space is lopped off and freed if possible. realloc with a size + argument of zero (re)allocates a minimum-sized chunk. + + The old unix realloc convention of allowing the last-free'd chunk + to be used as an argument to realloc is not supported. +*/ + +void* dlrealloc(void*, size_t); + +/* + memalign(size_t alignment, size_t n); + Returns a pointer to a newly allocated chunk of n bytes, aligned + in accord with the alignment argument. + + The alignment argument should be a power of two. If the argument is + not a power of two, the nearest greater power is used. + 8-byte alignment is guaranteed by normal malloc calls, so don't + bother calling memalign with an argument of 8 or less. + + Overreliance on memalign is a sure way to fragment space. +*/ +void* dlmemalign(size_t, size_t); + +/* + valloc(size_t n); + Equivalent to memalign(pagesize, n), where pagesize is the page + size of the system. If the pagesize is unknown, 4096 is used. +*/ +void* dlvalloc(size_t); + +/* + mallopt(int parameter_number, int parameter_value) + Sets tunable parameters The format is to provide a + (parameter-number, parameter-value) pair. mallopt then sets the + corresponding parameter to the argument value if it can (i.e., so + long as the value is meaningful), and returns 1 if successful else + 0. To workaround the fact that mallopt is specified to use int, + not size_t parameters, the value -1 is specially treated as the + maximum unsigned size_t value. + + SVID/XPG/ANSI defines four standard param numbers for mallopt, + normally defined in malloc.h. None of these are use in this malloc, + so setting them has no effect. But this malloc also supports other + options in mallopt. See below for details. Briefly, supported + parameters are as follows (listed defaults are for "typical" + configurations). + + Symbol param # default allowed param values + M_TRIM_THRESHOLD -1 2*1024*1024 any (-1 disables) + M_GRANULARITY -2 page size any power of 2 >= page size + M_MMAP_THRESHOLD -3 256*1024 any (or 0 if no MMAP support) +*/ +int dlmallopt(int, int); + +/* + malloc_footprint(); + Returns the number of bytes obtained from the system. The total + number of bytes allocated by malloc, realloc etc., is less than this + value. Unlike mallinfo, this function returns only a precomputed + result, so can be called frequently to monitor memory consumption. 
+ Even if locks are otherwise defined, this function does not use them, + so results might not be up to date. +*/ +size_t dlmalloc_footprint(void); + +/* + malloc_max_footprint(); + Returns the maximum number of bytes obtained from the system. This + value will be greater than current footprint if deallocated space + has been reclaimed by the system. The peak number of bytes allocated + by malloc, realloc etc., is less than this value. Unlike mallinfo, + this function returns only a precomputed result, so can be called + frequently to monitor memory consumption. Even if locks are + otherwise defined, this function does not use them, so results might + not be up to date. +*/ +size_t dlmalloc_max_footprint(void); + +#if !NO_MALLINFO +/* + mallinfo() + Returns (by copy) a struct containing various summary statistics: + + arena: current total non-mmapped bytes allocated from system + ordblks: the number of free chunks + smblks: always zero. + hblks: current number of mmapped regions + hblkhd: total bytes held in mmapped regions + usmblks: the maximum total allocated space. This will be greater + than current total if trimming has occurred. + fsmblks: always zero + uordblks: current total allocated space (normal or mmapped) + fordblks: total free space + keepcost: the maximum number of bytes that could ideally be released + back to system via malloc_trim. ("ideally" means that + it ignores page restrictions etc.) + + Because these fields are ints, but internal bookkeeping may + be kept as longs, the reported values may wrap around zero and + thus be inaccurate. +*/ +struct mallinfo dlmallinfo(void); +#endif /* NO_MALLINFO */ + +/* + independent_calloc(size_t n_elements, size_t element_size, void* chunks[]); + + independent_calloc is similar to calloc, but instead of returning a + single cleared space, it returns an array of pointers to n_elements + independent elements that can hold contents of size elem_size, each + of which starts out cleared, and can be independently freed, + realloc'ed etc. The elements are guaranteed to be adjacently + allocated (this is not guaranteed to occur with multiple callocs or + mallocs), which may also improve cache locality in some + applications. + + The "chunks" argument is optional (i.e., may be null, which is + probably the most typical usage). If it is null, the returned array + is itself dynamically allocated and should also be freed when it is + no longer needed. Otherwise, the chunks array must be of at least + n_elements in length. It is filled in with the pointers to the + chunks. + + In either case, independent_calloc returns this pointer array, or + null if the allocation failed. If n_elements is zero and "chunks" + is null, it returns a chunk representing an array with zero elements + (which should be freed if not wanted). + + Each element must be individually freed when it is no longer + needed. If you'd like to instead be able to free all at once, you + should instead use regular calloc and assign pointers into this + space to represent elements. (In this case though, you cannot + independently free elements.) + + independent_calloc simplifies and speeds up implementations of many + kinds of pools. It may also be useful when constructing large data + structures that initially have a fixed number of fixed-sized nodes, + but the number is not known at compile time, and some of the nodes + may later need to be freed. 
For example: + + struct Node { int item; struct Node* next; }; + + struct Node* build_list() { + struct Node** pool; + int n = read_number_of_nodes_needed(); + if (n <= 0) return 0; + pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0); + if (pool == 0) die(); + // organize into a linked list... + struct Node* first = pool[0]; + for (i = 0; i < n-1; ++i) + pool[i]->next = pool[i+1]; + free(pool); // Can now free the array (or not, if it is needed later) + return first; + } +*/ +void** dlindependent_calloc(size_t, size_t, void**); + +/* + independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]); + + independent_comalloc allocates, all at once, a set of n_elements + chunks with sizes indicated in the "sizes" array. It returns + an array of pointers to these elements, each of which can be + independently freed, realloc'ed etc. The elements are guaranteed to + be adjacently allocated (this is not guaranteed to occur with + multiple callocs or mallocs), which may also improve cache locality + in some applications. + + The "chunks" argument is optional (i.e., may be null). If it is null + the returned array is itself dynamically allocated and should also + be freed when it is no longer needed. Otherwise, the chunks array + must be of at least n_elements in length. It is filled in with the + pointers to the chunks. + + In either case, independent_comalloc returns this pointer array, or + null if the allocation failed. If n_elements is zero and chunks is + null, it returns a chunk representing an array with zero elements + (which should be freed if not wanted). + + Each element must be individually freed when it is no longer + needed. If you'd like to instead be able to free all at once, you + should instead use a single regular malloc, and assign pointers at + particular offsets in the aggregate space. (In this case though, you + cannot independently free elements.) + + independent_comallac differs from independent_calloc in that each + element may have a different size, and also that it does not + automatically clear elements. + + independent_comalloc can be used to speed up allocation in cases + where several structs or objects must always be allocated at the + same time. For example: + + struct Head { ... } + struct Foot { ... } + + void send_message(char* msg) { + int msglen = strlen(msg); + size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) }; + void* chunks[3]; + if (independent_comalloc(3, sizes, chunks) == 0) + die(); + struct Head* head = (struct Head*)(chunks[0]); + char* body = (char*)(chunks[1]); + struct Foot* foot = (struct Foot*)(chunks[2]); + // ... + } + + In general though, independent_comalloc is worth using only for + larger values of n_elements. For small values, you probably won't + detect enough difference from series of malloc calls to bother. + + Overuse of independent_comalloc can increase overall memory usage, + since it cannot reuse existing noncontiguous small chunks that + might be available for some of the elements. +*/ +void** dlindependent_comalloc(size_t, size_t*, void**); + + +/* + pvalloc(size_t n); + Equivalent to valloc(minimum-page-that-holds(n)), that is, + round up n to nearest pagesize. + */ +void* dlpvalloc(size_t); + +/* + malloc_trim(size_t pad); + + If possible, gives memory back to the system (via negative arguments + to sbrk) if there is unused memory at the `high' end of the malloc + pool or in unused MMAP segments. 
You can call this after freeing + large blocks of memory to potentially reduce the system-level memory + requirements of a program. However, it cannot guarantee to reduce + memory. Under some allocation patterns, some large free blocks of + memory will be locked between two used chunks, so they cannot be + given back to the system. + + The `pad' argument to malloc_trim represents the amount of free + trailing space to leave untrimmed. If this argument is zero, only + the minimum amount of memory to maintain internal data structures + will be left. Non-zero arguments can be supplied to maintain enough + trailing space to service future expected allocations without having + to re-obtain memory from the system. + + Malloc_trim returns 1 if it actually released any memory, else 0. +*/ +int dlmalloc_trim(size_t); + +/* + malloc_stats(); + Prints on stderr the amount of space obtained from the system (both + via sbrk and mmap), the maximum amount (which may be more than + current if malloc_trim and/or munmap got called), and the current + number of bytes allocated via malloc (or realloc, etc) but not yet + freed. Note that this is the number of bytes allocated, not the + number requested. It will be larger than the number requested + because of alignment and bookkeeping overhead. Because it includes + alignment wastage as being in use, this figure may be greater than + zero even when no user-level chunks are allocated. + + The reported current and maximum system memory can be inaccurate if + a program makes other calls to system memory allocation functions + (normally sbrk) outside of malloc. + + malloc_stats prints only the most commonly interesting statistics. + More information can be obtained by calling mallinfo. +*/ +void dlmalloc_stats(void); + +#endif /* ONLY_MSPACES */ + +/* + malloc_usable_size(void* p); + + Returns the number of bytes you can actually use in + an allocated chunk, which may be more than you requested (although + often not) due to alignment and minimum size constraints. + You can use this many bytes without worrying about + overwriting other allocated objects. This is not a particularly great + programming practice. malloc_usable_size can be more useful in + debugging and assertions, for example: + + p = malloc(n); + assert(malloc_usable_size(p) >= 256); +*/ +size_t dlmalloc_usable_size(void*); + + +#if MSPACES + +/* + mspace is an opaque type representing an independent + region of space that supports mspace_malloc, etc. +*/ +typedef void* mspace; + +/* + create_mspace creates and returns a new independent space with the + given initial capacity, or, if 0, the default granularity size. It + returns null if there is no system memory available to create the + space. If argument locked is non-zero, the space uses a separate + lock to control access. The capacity of the space will grow + dynamically as needed to service mspace_malloc requests. You can + control the sizes of incremental increases of this space by + compiling with a different DEFAULT_GRANULARITY or dynamically + setting with mallopt(M_GRANULARITY, value). +*/ +mspace create_mspace(size_t capacity, int locked); + +/* + destroy_mspace destroys the given space, and attempts to return all + of its memory back to the system, returning the total number of + bytes freed. After destruction, the results of access to all memory + used by the space become undefined. +*/ +size_t destroy_mspace(mspace msp); + +/* + create_mspace_with_base uses the memory supplied as the initial base + of a new mspace. 
Part (less than 128*sizeof(size_t) bytes) of this + space is used for bookkeeping, so the capacity must be at least this + large. (Otherwise 0 is returned.) When this initial space is + exhausted, additional memory will be obtained from the system. + Destroying this space will deallocate all additionally allocated + space (if possible) but not the initial base. +*/ +mspace create_mspace_with_base(void* base, size_t capacity, int locked); + +/* + mspace_track_large_chunks controls whether requests for large chunks + are allocated in their own untracked mmapped regions, separate from + others in this mspace. By default large chunks are not tracked, + which reduces fragmentation. However, such chunks are not + necessarily released to the system upon destroy_mspace. Enabling + tracking by setting to true may increase fragmentation, but avoids + leakage when relying on destroy_mspace to release all memory + allocated using this space. The function returns the previous + setting. +*/ +int mspace_track_large_chunks(mspace msp, int enable); + + +/* + mspace_malloc behaves as malloc, but operates within + the given space. +*/ +void* mspace_malloc(mspace msp, size_t bytes); + +/* + mspace_free behaves as free, but operates within + the given space. + + If compiled with FOOTERS==1, mspace_free is not actually needed. + free may be called instead of mspace_free because freed chunks from + any space are handled by their originating spaces. +*/ +void mspace_free(mspace msp, void* mem); + +/* + mspace_realloc behaves as realloc, but operates within + the given space. + + If compiled with FOOTERS==1, mspace_realloc is not actually + needed. realloc may be called instead of mspace_realloc because + realloced chunks from any space are handled by their originating + spaces. +*/ +void* mspace_realloc(mspace msp, void* mem, size_t newsize); + +/* + mspace_calloc behaves as calloc, but operates within + the given space. +*/ +void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size); + +/* + mspace_memalign behaves as memalign, but operates within + the given space. +*/ +void* mspace_memalign(mspace msp, size_t alignment, size_t bytes); + +/* + mspace_independent_calloc behaves as independent_calloc, but + operates within the given space. +*/ +void** mspace_independent_calloc(mspace msp, size_t n_elements, + size_t elem_size, void* chunks[]); + +/* + mspace_independent_comalloc behaves as independent_comalloc, but + operates within the given space. +*/ +void** mspace_independent_comalloc(mspace msp, size_t n_elements, + size_t sizes[], void* chunks[]); + +/* + mspace_footprint() returns the number of bytes obtained from the + system for this space. +*/ +size_t mspace_footprint(mspace msp); + +/* + mspace_max_footprint() returns the peak number of bytes obtained from the + system for this space. +*/ +size_t mspace_max_footprint(mspace msp); + + +#if !NO_MALLINFO +/* + mspace_mallinfo behaves as mallinfo, but reports properties of + the given space. +*/ +struct mallinfo mspace_mallinfo(mspace msp); +#endif /* NO_MALLINFO */ + +/* + malloc_usable_size(void* p) behaves the same as malloc_usable_size; +*/ + size_t mspace_usable_size(void* mem); + +/* + mspace_malloc_stats behaves as malloc_stats, but reports + properties of the given space. +*/ +void mspace_malloc_stats(mspace msp); + +/* + mspace_trim behaves as malloc_trim, but + operates within the given space. +*/ +int mspace_trim(mspace msp, size_t pad); + +/* + An alias for mallopt. 
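+
+  As a brief illustration of the mspace API declared above (an added sketch,
+  not part of the original documentation; error handling omitted):
+
+    mspace arena = create_mspace(0, 0);       // default capacity, no internal lock
+    void* p = mspace_malloc(arena, 128);
+    mspace_free(arena, p);
+    size_t released = destroy_mspace(arena);  // total bytes returned to the system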
+*/ +int mspace_mallopt(int, int); + +#endif /* MSPACES */ + +#ifdef __cplusplus +}; /* end of extern "C" */ +#endif /* __cplusplus */ + +/* + ======================================================================== + To make a fully customizable malloc.h header file, cut everything + above this line, put into file malloc.h, edit to suit, and #include it + on the next line, as well as in programs that use this malloc. + ======================================================================== +*/ + +/* #include "malloc.h" */ + +/*------------------------------ internal #includes ---------------------- */ + +#ifdef WIN32 +#pragma warning( disable : 4146 ) /* no "unsigned" warnings */ +#endif /* WIN32 */ + +#include /* for printing in malloc_stats */ + +#ifndef LACKS_ERRNO_H +#include /* for MALLOC_FAILURE_ACTION */ +#endif /* LACKS_ERRNO_H */ +#if FOOTERS || DEBUG +#include /* for magic initialization */ +#endif /* FOOTERS */ +#ifndef LACKS_STDLIB_H +#include /* for abort() */ +#endif /* LACKS_STDLIB_H */ +#ifdef DEBUG +#if ABORT_ON_ASSERT_FAILURE +#undef assert +#define assert(x) if(!(x)) ABORT +#else /* ABORT_ON_ASSERT_FAILURE */ +#include +#endif /* ABORT_ON_ASSERT_FAILURE */ +#else /* DEBUG */ +#ifndef assert +#define assert(x) +#endif +#define DEBUG 0 +#endif /* DEBUG */ +#ifndef LACKS_STRING_H +#include /* for memset etc */ +#endif /* LACKS_STRING_H */ +#if USE_BUILTIN_FFS +#ifndef LACKS_STRINGS_H +#include /* for ffs */ +#endif /* LACKS_STRINGS_H */ +#endif /* USE_BUILTIN_FFS */ +#if HAVE_MMAP +#ifndef LACKS_SYS_MMAN_H +/* On some versions of linux, mremap decl in mman.h needs __USE_GNU set */ +#if (defined(linux) && !defined(__USE_GNU)) +#define __USE_GNU 1 +#include /* for mmap */ +#undef __USE_GNU +#else +#include /* for mmap */ +#endif /* linux */ +#endif /* LACKS_SYS_MMAN_H */ +#ifndef LACKS_FCNTL_H +#include +#endif /* LACKS_FCNTL_H */ +#endif /* HAVE_MMAP */ +#ifndef LACKS_UNISTD_H +#include /* for sbrk, sysconf */ +#else /* LACKS_UNISTD_H */ +#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) +extern void* sbrk(ptrdiff_t); +#endif /* FreeBSD etc */ +#endif /* LACKS_UNISTD_H */ + +/* Declarations for locking */ +#if USE_LOCKS +#ifndef WIN32 +#include +#if defined (__SVR4) && defined (__sun) /* solaris */ +#include +#endif /* solaris */ +#else +#ifndef _M_AMD64 +/* These are already defined on AMD64 builds */ +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ +LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp); +LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value); +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* _M_AMD64 */ +#pragma intrinsic (_InterlockedCompareExchange) +#pragma intrinsic (_InterlockedExchange) +#define interlockedcompareexchange _InterlockedCompareExchange +#define interlockedexchange _InterlockedExchange +#endif /* Win32 */ +#endif /* USE_LOCKS */ + +/* Declarations for bit scanning on win32 */ +#if defined(_MSC_VER) && _MSC_VER>=1300 +#ifndef BitScanForward /* Try to avoid pulling in WinNT.h */ +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ +unsigned char _BitScanForward(unsigned long *index, unsigned long mask); +unsigned char _BitScanReverse(unsigned long *index, unsigned long mask); +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#define BitScanForward _BitScanForward +#define BitScanReverse _BitScanReverse +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse) +#endif /* BitScanForward */ +#endif /* defined(_MSC_VER) && 
_MSC_VER>=1300 */ + +#ifndef WIN32 +#ifndef malloc_getpagesize +# ifdef _SC_PAGESIZE /* some SVR4 systems omit an underscore */ +# ifndef _SC_PAGE_SIZE +# define _SC_PAGE_SIZE _SC_PAGESIZE +# endif +# endif +# ifdef _SC_PAGE_SIZE +# define malloc_getpagesize sysconf(_SC_PAGE_SIZE) +# else +# if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE) + extern size_t getpagesize(); +# define malloc_getpagesize getpagesize() +# else +# ifdef WIN32 /* use supplied emulation of getpagesize */ +# define malloc_getpagesize getpagesize() +# else +# ifndef LACKS_SYS_PARAM_H +# include +# endif +# ifdef EXEC_PAGESIZE +# define malloc_getpagesize EXEC_PAGESIZE +# else +# ifdef NBPG +# ifndef CLSIZE +# define malloc_getpagesize NBPG +# else +# define malloc_getpagesize (NBPG * CLSIZE) +# endif +# else +# ifdef NBPC +# define malloc_getpagesize NBPC +# else +# ifdef PAGESIZE +# define malloc_getpagesize PAGESIZE +# else /* just guess */ +# define malloc_getpagesize ((size_t)4096U) +# endif +# endif +# endif +# endif +# endif +# endif +# endif +#endif +#endif + + + +/* ------------------- size_t and alignment properties -------------------- */ + +/* The byte and bit size of a size_t */ +#define SIZE_T_SIZE (sizeof(size_t)) +#define SIZE_T_BITSIZE (sizeof(size_t) << 3) + +/* Some constants coerced to size_t */ +/* Annoying but necessary to avoid errors on some platforms */ +#define SIZE_T_ZERO ((size_t)0) +#define SIZE_T_ONE ((size_t)1) +#define SIZE_T_TWO ((size_t)2) +#define SIZE_T_FOUR ((size_t)4) +#define TWO_SIZE_T_SIZES (SIZE_T_SIZE<<1) +#define FOUR_SIZE_T_SIZES (SIZE_T_SIZE<<2) +#define SIX_SIZE_T_SIZES (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES) +#define HALF_MAX_SIZE_T (MAX_SIZE_T / 2U) + +/* The bit mask value corresponding to MALLOC_ALIGNMENT */ +#define CHUNK_ALIGN_MASK (MALLOC_ALIGNMENT - SIZE_T_ONE) + +/* True if address a has acceptable alignment */ +#define is_aligned(A) (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0) + +/* the number of bytes to offset an address to align it */ +#define align_offset(A)\ + ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\ + ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK)) + +/* -------------------------- MMAP preliminaries ------------------------- */ + +/* + If HAVE_MORECORE or HAVE_MMAP are false, we just define calls and + checks to fail so compiler optimizer can delete code rather than + using so many "#if"s. +*/ + + +/* MORECORE and MMAP must return MFAIL on failure */ +#define MFAIL ((void*)(MAX_SIZE_T)) +#define CMFAIL ((char*)(MFAIL)) /* defined for convenience */ + +#if HAVE_MMAP + +#ifndef WIN32 +#define MUNMAP_DEFAULT(a, s) munmap((a), (s)) +#define MMAP_PROT (PROT_READ|PROT_WRITE) +#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) +#define MAP_ANONYMOUS MAP_ANON +#endif /* MAP_ANON */ +#ifdef MAP_ANONYMOUS +#define MMAP_FLAGS (MAP_PRIVATE|MAP_ANONYMOUS) +#define MMAP_DEFAULT(s) mmap(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0) +#else /* MAP_ANONYMOUS */ +/* + Nearly all versions of mmap support MAP_ANONYMOUS, so the following + is unlikely to be needed, but is supplied just in case. +*/ +#define MMAP_FLAGS (MAP_PRIVATE) +static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */ +#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? 
\ + (dev_zero_fd = open("/dev/zero", O_RDWR), \ + mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \ + mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) +#endif /* MAP_ANONYMOUS */ + +#define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s) + +#else /* WIN32 */ + +/* Win32 MMAP via VirtualAlloc */ +static FORCEINLINE void* win32mmap(size_t size) { + void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + return (ptr != 0)? ptr: MFAIL; +} + +/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */ +static FORCEINLINE void* win32direct_mmap(size_t size) { + void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN, + PAGE_READWRITE); + return (ptr != 0)? ptr: MFAIL; +} + +/* This function supports releasing coalesed segments */ +static FORCEINLINE int win32munmap(void* ptr, size_t size) { + MEMORY_BASIC_INFORMATION minfo; + char* cptr = (char*)ptr; + while (size) { + if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0) + return -1; + if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr || + minfo.State != MEM_COMMIT || minfo.RegionSize > size) + return -1; + if (VirtualFree(cptr, 0, MEM_RELEASE) == 0) + return -1; + cptr += minfo.RegionSize; + size -= minfo.RegionSize; + } + return 0; +} + +#define MMAP_DEFAULT(s) win32mmap(s) +#define MUNMAP_DEFAULT(a, s) win32munmap((a), (s)) +#define DIRECT_MMAP_DEFAULT(s) win32direct_mmap(s) +#endif /* WIN32 */ +#endif /* HAVE_MMAP */ + +#if HAVE_MREMAP +#ifndef WIN32 +#define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv)) +#endif /* WIN32 */ +#endif /* HAVE_MREMAP */ + + +/** + * Define CALL_MORECORE + */ +#if HAVE_MORECORE + #ifdef MORECORE + #define CALL_MORECORE(S) MORECORE(S) + #else /* MORECORE */ + #define CALL_MORECORE(S) MORECORE_DEFAULT(S) + #endif /* MORECORE */ +#else /* HAVE_MORECORE */ + #define CALL_MORECORE(S) MFAIL +#endif /* HAVE_MORECORE */ + +/** + * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP + */ +#if HAVE_MMAP + #define USE_MMAP_BIT (SIZE_T_ONE) + + #ifdef MMAP + #define CALL_MMAP(s) MMAP(s) + #else /* MMAP */ + #define CALL_MMAP(s) MMAP_DEFAULT(s) + #endif /* MMAP */ + #ifdef MUNMAP + #define CALL_MUNMAP(a, s) MUNMAP((a), (s)) + #else /* MUNMAP */ + #define CALL_MUNMAP(a, s) MUNMAP_DEFAULT((a), (s)) + #endif /* MUNMAP */ + #ifdef DIRECT_MMAP + #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s) + #else /* DIRECT_MMAP */ + #define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s) + #endif /* DIRECT_MMAP */ +#else /* HAVE_MMAP */ + #define USE_MMAP_BIT (SIZE_T_ZERO) + + #define MMAP(s) MFAIL + #define MUNMAP(a, s) (-1) + #define DIRECT_MMAP(s) MFAIL + #define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s) + #define CALL_MMAP(s) MMAP(s) + #define CALL_MUNMAP(a, s) MUNMAP((a), (s)) +#endif /* HAVE_MMAP */ + +/** + * Define CALL_MREMAP + */ +#if HAVE_MMAP && HAVE_MREMAP + #ifdef MREMAP + #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP((addr), (osz), (nsz), (mv)) + #else /* MREMAP */ + #define CALL_MREMAP(addr, osz, nsz, mv) MREMAP_DEFAULT((addr), (osz), (nsz), (mv)) + #endif /* MREMAP */ +#else /* HAVE_MMAP && HAVE_MREMAP */ + #define CALL_MREMAP(addr, osz, nsz, mv) MFAIL +#endif /* HAVE_MMAP && HAVE_MREMAP */ + +/* mstate bit set if continguous morecore disabled or failed */ +#define USE_NONCONTIGUOUS_BIT (4U) + +/* segment bit set in create_mspace_with_base */ +#define EXTERN_BIT (8U) + + +//dsm: After all this mumbo-jumbo, really ensure that we don't have MORECORE, MMAP or MREMAP +#if HAVE_MORECORE +#error "dsm: Somehow, HAVE_MORECORE got enabled, check defines" +#endif +#if HAVE_MMAP 
+#error "dsm: Somehow, HAVE_MMAP got enabled, check defines" +#endif +#if HAVE_MREMAP +#error "dsm: Somehow, HAVE_MREMAP got enabled, check defines" +#endif + + + + +/* --------------------------- Lock preliminaries ------------------------ */ + +/* + When locks are defined, there is one global lock, plus + one per-mspace lock. + + The global lock_ensures that mparams.magic and other unique + mparams values are initialized only once. It also protects + sequences of calls to MORECORE. In many cases sys_alloc requires + two calls, that should not be interleaved with calls by other + threads. This does not protect against direct calls to MORECORE + by other threads not using this lock, so there is still code to + cope the best we can on interference. + + Per-mspace locks surround calls to malloc, free, etc. To enable use + in layered extensions, per-mspace locks are reentrant. + + Because lock-protected regions generally have bounded times, it is + OK to use the supplied simple spinlocks in the custom versions for + x86. Spinlocks are likely to improve performance for lightly + contended applications, but worsen performance under heavy + contention. + + If USE_LOCKS is > 1, the definitions of lock routines here are + bypassed, in which case you will need to define the type MLOCK_T, + and at least INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK and possibly + TRY_LOCK (which is not used in this malloc, but commonly needed in + extensions.) You must also declare a + static MLOCK_T malloc_global_mutex = { initialization values };. + +*/ + +#if USE_LOCKS == 1 + +#if USE_SPIN_LOCKS && SPIN_LOCKS_AVAILABLE +#ifndef WIN32 + +/* Custom pthread-style spin locks on x86 and x64 for gcc */ + +/* dsm: By default, dlmalloc uses pthread_self to identify each thread, and to implement recursive locking. + * This is insufficient across multiple processes, because we can have pthread_self return the same value across them + * (in fact, this happens frequently; I only caught this when we started allocating BBL descriptors, before we were + * doing all allocs sequentially so this did not matter). A proper and compact way would be to call the gettid syscall, + * but that could make things much slower, since it's a syscall; instead, lets have the thread id be the concatenation + * of pthread_self and the pid, which is cached by glibc. + * + * + * NOTE: DEPRECATED + * IMPORTANT: This doesn't work either, because pthread_self() does not work inside Pin. Rather than resorting to Pin-specific + * trickery, I have disabled the locking code inside dlmalloc. All locking is now external. 
+ */ +struct threadid_t { + pid_t pid; + pthread_t tid; + + bool is_current() {return (getpid() == pid) && (pthread_self() == tid);} + bool is_empty() {return (pid == 0) && (tid == 0);} +}; + +struct pthread_mlock_t { + volatile unsigned int l; + unsigned int c; + threadid_t threadid; +}; +#define MLOCK_T struct pthread_mlock_t +#define CURRENT_THREAD {getpid(), pthread_self()} +#define NULL_THREAD {0, 0} +#define INITIAL_LOCK(sl) ((sl)->threadid = NULL_THREAD, (sl)->l = (sl)->c = 0, 0) +#define ACQUIRE_LOCK(sl) pthread_acquire_lock(sl) +#define RELEASE_LOCK(sl) pthread_release_lock(sl) +#define TRY_LOCK(sl) pthread_try_lock(sl) +#define SPINS_PER_YIELD 63 + +//dsm: This is now per-process, but that's fine +static MLOCK_T malloc_global_mutex = { 0, 0, NULL_THREAD}; + +static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) { + int spins = 0; + volatile unsigned int* lp = &sl->l; + for (;;) { + if (*lp != 0) { + if (sl->threadid.is_current()) { + ++sl->c; + return 0; + } + } + else { + /* place args to cmpxchgl in locals to evade oddities in some gccs */ + int cmp = 0; + int val = 1; + int ret; + __asm__ __volatile__ ("lock; cmpxchgl %1, %2" + : "=a" (ret) + : "r" (val), "m" (*(lp)), "0"(cmp) + : "memory", "cc"); + if (!ret) { + assert(sl->threadid.is_empty()); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 0; + } + } + if ((++spins & SPINS_PER_YIELD) == 0) { +#if defined (__SVR4) && defined (__sun) /* solaris */ + thr_yield(); +#else +#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__) + sched_yield(); +#else /* no-op yield on unknown systems */ + ; +#endif /* __linux__ || __FreeBSD__ || __APPLE__ */ +#endif /* solaris */ + } + } +} + +static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) { + volatile unsigned int* lp = &sl->l; + assert(*lp != 0); + assert_msg(sl->threadid.is_current(), "%d|%ld != %d|%ld", sl->threadid.pid, sl->threadid.tid, getpid(), pthread_self()); + if (--sl->c == 0) { + sl->threadid = NULL_THREAD; + int prev = 0; + int ret; + __asm__ __volatile__ ("lock; xchgl %0, %1" + : "=r" (ret) + : "m" (*(lp)), "0"(prev) + : "memory"); + } +} + +static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) { + volatile unsigned int* lp = &sl->l; + if (*lp != 0) { + if (sl->threadid.is_current()) { + ++sl->c; + return 1; + } + } + else { + int cmp = 0; + int val = 1; + int ret; + __asm__ __volatile__ ("lock; cmpxchgl %1, %2" + : "=a" (ret) + : "r" (val), "m" (*(lp)), "0"(cmp) + : "memory", "cc"); + if (!ret) { + assert(sl->threadid.is_empty()); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 1; + } + } + return 0; +} + + +#else /* WIN32 */ +/* Custom win32-style spin locks on x86 and x64 for MSC */ +struct win32_mlock_t { + volatile long l; + unsigned int c; + long threadid; +}; + +#define MLOCK_T struct win32_mlock_t +#define CURRENT_THREAD GetCurrentThreadId() +#define INITIAL_LOCK(sl) ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0) +#define ACQUIRE_LOCK(sl) win32_acquire_lock(sl) +#define RELEASE_LOCK(sl) win32_release_lock(sl) +#define TRY_LOCK(sl) win32_try_lock(sl) +#define SPINS_PER_YIELD 63 + +#error "dsm: This is not supposed to work on WIN32" +static MLOCK_T malloc_global_mutex = { 0, 0, 0}; + +static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) { + int spins = 0; + for (;;) { + if (sl->l != 0) { + if (sl->threadid == CURRENT_THREAD) { + ++sl->c; + return 0; + } + } + else { + if (!interlockedexchange(&sl->l, 1)) { + assert(!sl->threadid); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 0; + } + } + if ((++spins & SPINS_PER_YIELD) 
== 0) + SleepEx(0, FALSE); + } +} + +static FORCEINLINE void win32_release_lock (MLOCK_T *sl) { + assert(sl->threadid == CURRENT_THREAD); + assert(sl->l != 0); + if (--sl->c == 0) { + sl->threadid = 0; + interlockedexchange (&sl->l, 0); + } +} + +static FORCEINLINE int win32_try_lock (MLOCK_T *sl) { + if (sl->l != 0) { + if (sl->threadid == CURRENT_THREAD) { + ++sl->c; + return 1; + } + } + else { + if (!interlockedexchange(&sl->l, 1)){ + assert(!sl->threadid); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 1; + } + } + return 0; +} + +#endif /* WIN32 */ +#else /* USE_SPIN_LOCKS */ + +#ifndef WIN32 +/* pthreads-based locks */ + +#define MLOCK_T pthread_mutex_t +#define CURRENT_THREAD pthread_self() +#define INITIAL_LOCK(sl) pthread_init_lock(sl) +#define ACQUIRE_LOCK(sl) pthread_mutex_lock(sl) +#define RELEASE_LOCK(sl) pthread_mutex_unlock(sl) +#define TRY_LOCK(sl) (!pthread_mutex_trylock(sl)) + +//dsm: Per process, but OK +static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* Cope with old-style linux recursive lock initialization by adding */ +/* skipped internal declaration from pthread.h */ +#ifdef linux +#ifndef PTHREAD_MUTEX_RECURSIVE +extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr, + int __kind)); +#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP +#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y) +#endif +#endif + +static int pthread_init_lock (MLOCK_T *sl) { + pthread_mutexattr_t attr; + if (pthread_mutexattr_init(&attr)) return 1; + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1; + if (pthread_mutex_init(sl, &attr)) return 1; + if (pthread_mutexattr_destroy(&attr)) return 1; + return 0; +} + +#else /* WIN32 */ +/* Win32 critical sections */ +#define MLOCK_T CRITICAL_SECTION +#define CURRENT_THREAD GetCurrentThreadId() +#define INITIAL_LOCK(s) (!InitializeCriticalSectionAndSpinCount((s), 0x80000000|4000)) +#define ACQUIRE_LOCK(s) (EnterCriticalSection(sl), 0) +#define RELEASE_LOCK(s) LeaveCriticalSection(sl) +#define TRY_LOCK(s) TryEnterCriticalSection(sl) +#define NEED_GLOBAL_LOCK_INIT + +#error "dsm: This is not supposed to work in WIN32..." +static MLOCK_T malloc_global_mutex; +static volatile long malloc_global_mutex_status; + +/* Use spin loop to initialize global lock */ +static void init_malloc_global_mutex() { + for (;;) { + long stat = malloc_global_mutex_status; + if (stat > 0) + return; + /* transition to < 0 while initializing, then to > 0) */ + if (stat == 0 && + interlockedcompareexchange(&malloc_global_mutex_status, -1, 0) == 0) { + InitializeCriticalSection(&malloc_global_mutex); + interlockedexchange(&malloc_global_mutex_status,1); + return; + } + SleepEx(0, FALSE); + } +} + +#endif /* WIN32 */ +#endif /* USE_SPIN_LOCKS */ +#endif /* USE_LOCKS == 1 */ + +/* ----------------------- User-defined locks ------------------------ */ + +#if USE_LOCKS > 1 +/* Define your own lock implementation here */ +/* #define INITIAL_LOCK(sl) ... */ +/* #define ACQUIRE_LOCK(sl) ... */ +/* #define RELEASE_LOCK(sl) ... */ +/* #define TRY_LOCK(sl) ... */ +/* static MLOCK_T malloc_global_mutex = ... 
*/ +#endif /* USE_LOCKS > 1 */ + +/* ----------------------- Lock-based state ------------------------ */ + +#if USE_LOCKS +#define USE_LOCK_BIT (2U) +#else /* USE_LOCKS */ +#define USE_LOCK_BIT (0U) +#define INITIAL_LOCK(l) +#endif /* USE_LOCKS */ + +#if USE_LOCKS +#ifndef ACQUIRE_MALLOC_GLOBAL_LOCK +#define ACQUIRE_MALLOC_GLOBAL_LOCK() ACQUIRE_LOCK(&malloc_global_mutex); +#endif +#ifndef RELEASE_MALLOC_GLOBAL_LOCK +#define RELEASE_MALLOC_GLOBAL_LOCK() RELEASE_LOCK(&malloc_global_mutex); +#endif +#else /* USE_LOCKS */ +#define ACQUIRE_MALLOC_GLOBAL_LOCK() +#define RELEASE_MALLOC_GLOBAL_LOCK() +#endif /* USE_LOCKS */ + + +/* ----------------------- Chunk representations ------------------------ */ + +/* + (The following includes lightly edited explanations by Colin Plumb.) + + The malloc_chunk declaration below is misleading (but accurate and + necessary). It declares a "view" into memory allowing access to + necessary fields at known offsets from a given base. + + Chunks of memory are maintained using a `boundary tag' method as + originally described by Knuth. (See the paper by Paul Wilson + ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such + techniques.) Sizes of free chunks are stored both in the front of + each chunk and at the end. This makes consolidating fragmented + chunks into bigger chunks fast. The head fields also hold bits + representing whether chunks are free or in use. + + Here are some pictures to make it clearer. They are "exploded" to + show that the state of a chunk can be thought of as extending from + the high 31 bits of the head field of its header through the + prev_foot and PINUSE_BIT bit of the following chunk header. + + A chunk that's in use looks like: + + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of previous chunk (if P = 0) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| + | Size of this chunk 1| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + +- -+ + | | + +- -+ + | : + +- size - sizeof(size_t) available payload bytes -+ + : | + chunk-> +- -+ + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1| + | Size of next chunk (may or may not be in use) | +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + And if it's free, it looks like this: + + chunk-> +- -+ + | User payload (must be in use, or we would have merged!) 
| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| + | Size of this chunk 0| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next pointer | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prev pointer | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | : + +- size - sizeof(struct chunk) unused bytes -+ + : | + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of this chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0| + | Size of next chunk (must be in use, or we would have merged)| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | : + +- User payload -+ + : | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |0| + +-+ + Note that since we always merge adjacent free chunks, the chunks + adjacent to a free chunk must be in use. + + Given a pointer to a chunk (which can be derived trivially from the + payload pointer) we can, in O(1) time, find out whether the adjacent + chunks are free, and if so, unlink them from the lists that they + are on and merge them with the current chunk. + + Chunks always begin on even word boundaries, so the mem portion + (which is returned to the user) is also on an even word boundary, and + thus at least double-word aligned. + + The P (PINUSE_BIT) bit, stored in the unused low-order bit of the + chunk size (which is always a multiple of two words), is an in-use + bit for the *previous* chunk. If that bit is *clear*, then the + word before the current chunk size contains the previous chunk + size, and can be used to find the front of the previous chunk. + The very first chunk allocated always has this bit set, preventing + access to non-existent (or non-owned) memory. If pinuse is set for + any given chunk, then you CANNOT determine the size of the + previous chunk, and might even get a memory addressing fault when + trying to do so. + + The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of + the chunk size redundantly records whether the current chunk is + inuse (unless the chunk is mmapped). This redundancy enables usage + checks within free and realloc, and reduces indirection when freeing + and consolidating chunks. + + Each freshly allocated chunk must have both cinuse and pinuse set. + That is, each allocated chunk borders either a previously allocated + and still in-use chunk, or the base of its memory arena. This is + ensured by making all allocations from the the `lowest' part of any + found chunk. Further, no free chunk physically borders another one, + so each free chunk is known to be preceded and followed by either + inuse chunks or the ends of memory. + + Note that the `foot' of the current chunk is actually represented + as the prev_foot of the NEXT chunk. This makes it easier to + deal with alignments etc but can be very confusing when trying + to extend or adapt this code. + + The exceptions to all this are + + 1. The special chunk `top' is the top-most available chunk (i.e., + the one bordering the end of available memory). It is treated + specially. Top is never included in any bin, is used only if + no other chunk is available, and is released back to the + system if it is very large (see M_TRIM_THRESHOLD). 
In effect, + the top chunk is treated as larger (and thus less well + fitting) than any other available chunk. The top chunk + doesn't update its trailing size field since there is no next + contiguous chunk that would have to index off it. However, + space is still allocated for it (TOP_FOOT_SIZE) to enable + separation or merging when space is extended. + + 3. Chunks allocated via mmap, have both cinuse and pinuse bits + cleared in their head fields. Because they are allocated + one-by-one, each must carry its own prev_foot field, which is + also used to hold the offset this chunk has within its mmapped + region, which is needed to preserve alignment. Each mmapped + chunk is trailed by the first two fields of a fake next-chunk + for sake of usage checks. + +*/ + +struct malloc_chunk { + size_t prev_foot; /* Size of previous chunk (if free). */ + size_t head; /* Size and inuse bits. */ + struct malloc_chunk* fd; /* double links -- used only if free. */ + struct malloc_chunk* bk; +}; + +typedef struct malloc_chunk mchunk; +typedef struct malloc_chunk* mchunkptr; +typedef struct malloc_chunk* sbinptr; /* The type of bins of chunks */ +typedef unsigned int bindex_t; /* Described below */ +typedef unsigned int binmap_t; /* Described below */ +typedef unsigned int flag_t; /* The type of various bit flag sets */ + +/* ------------------- Chunks sizes and alignments ----------------------- */ + +#define MCHUNK_SIZE (sizeof(mchunk)) + +#if FOOTERS +#define CHUNK_OVERHEAD (TWO_SIZE_T_SIZES) +#else /* FOOTERS */ +#define CHUNK_OVERHEAD (SIZE_T_SIZE) +#endif /* FOOTERS */ + +/* MMapped chunks need a second word of overhead ... */ +#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES) +/* ... and additional padding for fake next-chunk at foot */ +#define MMAP_FOOT_PAD (FOUR_SIZE_T_SIZES) + +/* The smallest size we can malloc is an aligned minimal chunk */ +#define MIN_CHUNK_SIZE\ + ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK) + +/* conversion from malloc headers to user pointers, and back */ +#define chunk2mem(p) ((void*)((char*)(p) + TWO_SIZE_T_SIZES)) +#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES)) +/* chunk associated with aligned address A */ +#define align_as_chunk(A) (mchunkptr)((A) + align_offset(chunk2mem(A))) + +/* Bounds on request (not chunk) sizes. */ +#define MAX_REQUEST ((-MIN_CHUNK_SIZE) << 2) +#define MIN_REQUEST (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE) + +/* pad request bytes into a usable size */ +#define pad_request(req) \ + (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK) + +/* pad request, checking for minimum (but not maximum) */ +#define request2size(req) \ + (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req)) + + +/* ------------------ Operations on head and foot fields ----------------- */ + +/* + The head field of a chunk is or'ed with PINUSE_BIT when previous + adjacent chunk in use, and or'ed with CINUSE_BIT if this chunk is in + use, unless mmapped, in which case both bits are cleared. + + FLAG4_BIT is not used by this malloc, but might be useful in extensions. 
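+
+  A small worked example (added for illustration): a 48-byte in-use chunk
+  whose previous neighbor is also in use has
+      head == 48 | CINUSE_BIT | PINUSE_BIT == 0x33,
+  and chunksize() below recovers the size by masking off FLAG_BITS
+  (0x33 & ~0x7 == 48).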
+*/ + +#define PINUSE_BIT (SIZE_T_ONE) +#define CINUSE_BIT (SIZE_T_TWO) +#define FLAG4_BIT (SIZE_T_FOUR) +#define INUSE_BITS (PINUSE_BIT|CINUSE_BIT) +#define FLAG_BITS (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT) + +/* Head value for fenceposts */ +#define FENCEPOST_HEAD (INUSE_BITS|SIZE_T_SIZE) + +/* extraction of fields from head words */ +#define cinuse(p) ((p)->head & CINUSE_BIT) +#define pinuse(p) ((p)->head & PINUSE_BIT) +#define is_inuse(p) (((p)->head & INUSE_BITS) != PINUSE_BIT) +#define is_mmapped(p) (((p)->head & INUSE_BITS) == 0) + +#define chunksize(p) ((p)->head & ~(FLAG_BITS)) + +#define clear_pinuse(p) ((p)->head &= ~PINUSE_BIT) + +/* Treat space at ptr +/- offset as a chunk */ +#define chunk_plus_offset(p, s) ((mchunkptr)(((char*)(p)) + (s))) +#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s))) + +/* Ptr to next or previous physical malloc_chunk. */ +#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS))) +#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) )) + +/* extract next chunk's pinuse bit */ +#define next_pinuse(p) ((next_chunk(p)->head) & PINUSE_BIT) + +/* Get/set size at footer */ +#define get_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot) +#define set_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s)) + +/* Set size, pinuse bit, and foot */ +#define set_size_and_pinuse_of_free_chunk(p, s)\ + ((p)->head = (s|PINUSE_BIT), set_foot(p, s)) + +/* Set size, pinuse bit, foot, and clear next pinuse */ +#define set_free_with_pinuse(p, s, n)\ + (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s)) + +/* Get the internal overhead associated with chunk p */ +#define overhead_for(p)\ + (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD) + +/* Return true if malloced space is not necessarily cleared */ +#if MMAP_CLEARS +#define calloc_must_clear(p) (!is_mmapped(p)) +#else /* MMAP_CLEARS */ +#define calloc_must_clear(p) (1) +#endif /* MMAP_CLEARS */ + +/* ---------------------- Overlaid data structures ----------------------- */ + +/* + When chunks are not in use, they are treated as nodes of either + lists or trees. + + "Small" chunks are stored in circular doubly-linked lists, and look + like this: + + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of previous chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `head:' | Size of chunk, in bytes |P| + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Forward pointer to next chunk in list | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Back pointer to previous chunk in list | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Unused space (may be 0 bytes long) . + . . + . | +nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `foot:' | Size of chunk, in bytes | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Larger chunks are kept in a form of bitwise digital trees (aka + tries) keyed on chunksizes. Because malloc_tree_chunks are only for + free chunks greater than 256 bytes, their size doesn't impose any + constraints on user chunk sizes. 
Each node looks like: + + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of previous chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `head:' | Size of chunk, in bytes |P| + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Forward pointer to next chunk of same size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Back pointer to previous chunk of same size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Pointer to left child (child[0]) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Pointer to right child (child[1]) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Pointer to parent | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | bin index of this chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Unused space . + . | +nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `foot:' | Size of chunk, in bytes | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Each tree holding treenodes is a tree of unique chunk sizes. Chunks + of the same size are arranged in a circularly-linked list, with only + the oldest chunk (the next to be used, in our FIFO ordering) + actually in the tree. (Tree members are distinguished by a non-null + parent pointer.) If a chunk with the same size an an existing node + is inserted, it is linked off the existing node using pointers that + work in the same way as fd/bk pointers of small chunks. + + Each tree contains a power of 2 sized range of chunk sizes (the + smallest is 0x100 <= x < 0x180), which is is divided in half at each + tree level, with the chunks in the smaller half of the range (0x100 + <= x < 0x140 for the top nose) in the left subtree and the larger + half (0x140 <= x < 0x180) in the right subtree. This is, of course, + done by inspecting individual bits. + + Using these rules, each node's left subtree contains all smaller + sizes than its right subtree. However, the node at the root of each + subtree has no particular ordering relationship to either. (The + dividing line between the subtree sizes is based on trie relation.) + If we remove the last chunk of a given size from the interior of the + tree, we need to replace it with a leaf node. The tree ordering + rules permit a node to be replaced by any leaf below it. + + The smallest chunk in a tree (a common operation in a best-fit + allocator) can be found by walking a path to the leftmost leaf in + the tree. Unlike a usual binary tree, where we follow left child + pointers until we reach a null, here we follow the right child + pointer any time the left one is null, until we reach a leaf with + both child pointers null. The smallest chunk in the tree will be + somewhere along that path. + + The worst case number of steps to add, find, or remove a node is + bounded by the number of bits differentiating chunks within + bins. Under current bin calculations, this ranges from 6 up to 21 + (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case + is of course much better. 
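+
+  To make the range-splitting rule above concrete (an added example): a free
+  chunk of size 0x120 belongs to the smallest treebin range (0x100 <= x <
+  0x180); because 0x120 < 0x140 it sits under the left subtree of that bin's
+  root, while a chunk of size 0x150 would sit under the right subtree.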
+*/ + +struct malloc_tree_chunk { + /* The first four fields must be compatible with malloc_chunk */ + size_t prev_foot; + size_t head; + struct malloc_tree_chunk* fd; + struct malloc_tree_chunk* bk; + + struct malloc_tree_chunk* child[2]; + struct malloc_tree_chunk* parent; + bindex_t index; +}; + +typedef struct malloc_tree_chunk tchunk; +typedef struct malloc_tree_chunk* tchunkptr; +typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */ + +/* A little helper macro for trees */ +#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1]) + +/* ----------------------------- Segments -------------------------------- */ + +/* + Each malloc space may include non-contiguous segments, held in a + list headed by an embedded malloc_segment record representing the + top-most space. Segments also include flags holding properties of + the space. Large chunks that are directly allocated by mmap are not + included in this list. They are instead independently created and + destroyed without otherwise keeping track of them. + + Segment management mainly comes into play for spaces allocated by + MMAP. Any call to MMAP might or might not return memory that is + adjacent to an existing segment. MORECORE normally contiguously + extends the current space, so this space is almost always adjacent, + which is simpler and faster to deal with. (This is why MORECORE is + used preferentially to MMAP when both are available -- see + sys_alloc.) When allocating using MMAP, we don't use any of the + hinting mechanisms (inconsistently) supported in various + implementations of unix mmap, or distinguish reserving from + committing memory. Instead, we just ask for space, and exploit + contiguity when we get it. It is probably possible to do + better than this on some systems, but no general scheme seems + to be significantly better. + + Management entails a simpler variant of the consolidation scheme + used for chunks to reduce fragmentation -- new adjacent memory is + normally prepended or appended to an existing segment. However, + there are limitations compared to chunk consolidation that mostly + reflect the fact that segment processing is relatively infrequent + (occurring only when getting memory from system) and that we + don't expect to have huge numbers of segments: + + * Segments are not indexed, so traversal requires linear scans. (It + would be possible to index these, but is not worth the extra + overhead and complexity for most programs on most platforms.) + * New segments are only appended to old ones when holding top-most + memory; if they cannot be prepended to others, they are held in + different segments. + + Except for the top-most segment of an mstate, each segment record + is kept at the tail of its segment. Segments are added by pushing + segment records onto the list headed by &mstate.seg for the + containing mstate. + + Segment flags control allocation/merge/deallocation policies: + * If EXTERN_BIT set, then we did not allocate this segment, + and so should not try to deallocate or merge with others. + (This currently holds only for the initial segment passed + into create_mspace_with_base.) + * If USE_MMAP_BIT set, the segment may be merged with + other surrounding mmapped segments and trimmed/de-allocated + using munmap. + * If neither bit is set, then the segment was obtained using + MORECORE so can be merged with surrounding MORECORE'd segments + and deallocated/trimmed using MORECORE with negative arguments. 
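+
+  As a quick illustration of these flags (an added note, not from the original
+  text): a segment supplied through create_mspace_with_base has EXTERN_BIT set,
+  so is_extern_segment() below is true and destroy_mspace leaves its base
+  memory alone, whereas a segment obtained through CALL_MMAP has USE_MMAP_BIT
+  set and may be munmap'ed when it is trimmed or destroyed.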
+*/ + +struct malloc_segment { + char* base; /* base address */ + size_t size; /* allocated size */ + struct malloc_segment* next; /* ptr to next segment */ + flag_t sflags; /* mmap and extern flag */ +}; + +#define is_mmapped_segment(S) ((S)->sflags & USE_MMAP_BIT) +#define is_extern_segment(S) ((S)->sflags & EXTERN_BIT) + +typedef struct malloc_segment msegment; +typedef struct malloc_segment* msegmentptr; + +/* ---------------------------- malloc_state ----------------------------- */ + +/* + A malloc_state holds all of the bookkeeping for a space. + The main fields are: + + Top + The topmost chunk of the currently active segment. Its size is + cached in topsize. The actual size of topmost space is + topsize+TOP_FOOT_SIZE, which includes space reserved for adding + fenceposts and segment records if necessary when getting more + space from the system. The size at which to autotrim top is + cached from mparams in trim_check, except that it is disabled if + an autotrim fails. + + Designated victim (dv) + This is the preferred chunk for servicing small requests that + don't have exact fits. It is normally the chunk split off most + recently to service another small request. Its size is cached in + dvsize. The link fields of this chunk are not maintained since it + is not kept in a bin. + + SmallBins + An array of bin headers for free chunks. These bins hold chunks + with sizes less than MIN_LARGE_SIZE bytes. Each bin contains + chunks of all the same size, spaced 8 bytes apart. To simplify + use in double-linked lists, each bin header acts as a malloc_chunk + pointing to the real first node, if it exists (else pointing to + itself). This avoids special-casing for headers. But to avoid + waste, we allocate only the fd/bk pointers of bins, and then use + repositioning tricks to treat these as the fields of a chunk. + + TreeBins + Treebins are pointers to the roots of trees holding a range of + sizes. There are 2 equally spaced treebins for each power of two + from TREE_SHIFT to TREE_SHIFT+16. The last bin holds anything + larger. + + Bin maps + There is one bit map for small bins ("smallmap") and one for + treebins ("treemap). Each bin sets its bit when non-empty, and + clears the bit when empty. Bit operations are then used to avoid + bin-by-bin searching -- nearly all "search" is done without ever + looking at bins that won't be selected. The bit maps + conservatively use 32 bits per map word, even if on 64bit system. + For a good description of some of the bit-based techniques used + here, see Henry S. Warren Jr's book "Hacker's Delight" (and + supplement at http://hackersdelight.org/). Many of these are + intended to reduce the branchiness of paths through malloc etc, as + well as to reduce the number of memory locations read or written. + + Segments + A list of segments headed by an embedded malloc_segment record + representing the initial space. + + Address check support + The least_addr field is the least address ever obtained from + MORECORE or MMAP. Attempted frees and reallocs of any address less + than this are trapped (unless INSECURE is defined). + + Magic tag + A cross-check field that should always hold same value as mparams.magic. + + Flags + Bits recording whether to use MMAP, locks, or contiguous MORECORE + + Statistics + Each space keeps track of current and maximum system memory + obtained via MORECORE or MMAP. 
+ + Trim support + Fields holding the amount of unused topmost memory that should trigger + timming, and a counter to force periodic scanning to release unused + non-topmost segments. + + Locking + If USE_LOCKS is defined, the "mutex" lock is acquired and released + around every public call using this mspace. + + Extension support + A void* pointer and a size_t field that can be used to help implement + extensions to this malloc. +*/ + +/* Bin types, widths and sizes */ +#define NSMALLBINS (32U) +#define NTREEBINS (32U) +#define SMALLBIN_SHIFT (3U) +#define SMALLBIN_WIDTH (SIZE_T_ONE << SMALLBIN_SHIFT) +#define TREEBIN_SHIFT (8U) +#define MIN_LARGE_SIZE (SIZE_T_ONE << TREEBIN_SHIFT) +#define MAX_SMALL_SIZE (MIN_LARGE_SIZE - SIZE_T_ONE) +#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD) + +struct malloc_state { + binmap_t smallmap; + binmap_t treemap; + size_t dvsize; + size_t topsize; + char* least_addr; + mchunkptr dv; + mchunkptr top; + size_t trim_check; + size_t release_checks; + size_t magic; + mchunkptr smallbins[(NSMALLBINS+1)*2]; + tbinptr treebins[NTREEBINS]; + size_t footprint; + size_t max_footprint; + flag_t mflags; +#if USE_LOCKS + MLOCK_T mutex; /* locate lock among fields that rarely change */ +#endif /* USE_LOCKS */ + msegment seg; + void* extp; /* Unused but available for extensions */ + size_t exts; +}; + +typedef struct malloc_state* mstate; + +/* ------------- Global malloc_state and malloc_params ------------------- */ + +/* + malloc_params holds global properties, including those that can be + dynamically set using mallopt. There is a single instance, mparams, + initialized in init_mparams. Note that the non-zeroness of "magic" + also serves as an initialization flag. +*/ + +struct malloc_params { + volatile size_t magic; //dsm: Note that this is a fixed number now + size_t page_size; + size_t granularity; + size_t mmap_threshold; + size_t trim_threshold; + flag_t default_mflags; +}; + +//dsm: This can be per-process, no need to make it a pointer +static struct malloc_params mparams; + +/* Ensure mparams initialized */ +#define ensure_initialization() (void)(mparams.magic != 0 || init_mparams()) + +#if !ONLY_MSPACES +#error "dsm: No _gm_, this should be compiled with ONLY_MSPACES" +/* The global malloc_state used for all non-"mspace" calls */ +static struct malloc_state _gm_; +#define gm (&_gm_) +#define is_global(M) ((M) == &_gm_) + +#endif /* !ONLY_MSPACES */ + +#define is_initialized(M) ((M)->top != 0) + +/* -------------------------- system alloc setup ------------------------- */ + +/* Operations on mflags */ + +#define use_lock(M) ((M)->mflags & USE_LOCK_BIT) +#define enable_lock(M) ((M)->mflags |= USE_LOCK_BIT) +#define disable_lock(M) ((M)->mflags &= ~USE_LOCK_BIT) + +#define use_mmap(M) ((M)->mflags & USE_MMAP_BIT) +#define enable_mmap(M) ((M)->mflags |= USE_MMAP_BIT) +#define disable_mmap(M) ((M)->mflags &= ~USE_MMAP_BIT) + +#define use_noncontiguous(M) ((M)->mflags & USE_NONCONTIGUOUS_BIT) +#define disable_contiguous(M) ((M)->mflags |= USE_NONCONTIGUOUS_BIT) + +#define set_lock(M,L)\ + ((M)->mflags = (L)?\ + ((M)->mflags | USE_LOCK_BIT) :\ + ((M)->mflags & ~USE_LOCK_BIT)) + +/* page-align a size */ +#define page_align(S)\ + (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE)) + +/* granularity-align a size */ +#define granularity_align(S)\ + (((S) + (mparams.granularity - SIZE_T_ONE))\ + & ~(mparams.granularity - SIZE_T_ONE)) + + +/* For mmap, use granularity alignment on windows, else page-align */ 
+#ifdef WIN32 +#define mmap_align(S) granularity_align(S) +#else +#define mmap_align(S) page_align(S) +#endif + +/* For sys_alloc, enough padding to ensure can malloc request on success */ +#define SYS_ALLOC_PADDING (TOP_FOOT_SIZE + MALLOC_ALIGNMENT) + +#define is_page_aligned(S)\ + (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0) +#define is_granularity_aligned(S)\ + (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0) + +/* True if segment S holds address A */ +#define segment_holds(S, A)\ + ((char*)(A) >= S->base && (char*)(A) < S->base + S->size) + +/* Return segment holding given address */ +static msegmentptr segment_holding(mstate m, char* addr) { + msegmentptr sp = &m->seg; + for (;;) { + if (addr >= sp->base && addr < sp->base + sp->size) + return sp; + if ((sp = sp->next) == 0) + return 0; + } +} + +/* Return true if segment contains a segment link */ +static int has_segment_link(mstate m, msegmentptr ss) { + msegmentptr sp = &m->seg; + for (;;) { + if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size) + return 1; + if ((sp = sp->next) == 0) + return 0; + } +} + +#ifndef MORECORE_CANNOT_TRIM +#define should_trim(M,s) ((s) > (M)->trim_check) +#else /* MORECORE_CANNOT_TRIM */ +#define should_trim(M,s) (0) +#endif /* MORECORE_CANNOT_TRIM */ + +/* + TOP_FOOT_SIZE is padding at the end of a segment, including space + that may be needed to place segment records and fenceposts when new + noncontiguous segments are added. +*/ +#define TOP_FOOT_SIZE\ + (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE) + + +/* ------------------------------- Hooks -------------------------------- */ + +/* + PREACTION should be defined to return 0 on success, and nonzero on + failure. If you are not using locking, you can redefine these to do + anything you like. +*/ + +#if USE_LOCKS + +#define PREACTION(M) ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0) +#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); } +#else /* USE_LOCKS */ + +#ifndef PREACTION +#define PREACTION(M) (0) +#endif /* PREACTION */ + +#ifndef POSTACTION +#define POSTACTION(M) +#endif /* POSTACTION */ + +#endif /* USE_LOCKS */ + +/* + CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses. + USAGE_ERROR_ACTION is triggered on detected bad frees and + reallocs. The argument p is an address that might have triggered the + fault. It is ignored by the two predefined actions, but might be + useful in custom actions that try to help diagnose errors. +*/ + +#if PROCEED_ON_ERROR + +/* A count of the number of corruption errors causing resets */ +int malloc_corruption_error_count; + +/* default corruption action */ +static void reset_on_error(mstate m); + +#define CORRUPTION_ERROR_ACTION(m) reset_on_error(m) +#define USAGE_ERROR_ACTION(m, p) + +#else /* PROCEED_ON_ERROR */ + +#ifndef CORRUPTION_ERROR_ACTION +#define CORRUPTION_ERROR_ACTION(m) ABORT +#endif /* CORRUPTION_ERROR_ACTION */ + +#ifndef USAGE_ERROR_ACTION +#define USAGE_ERROR_ACTION(m,p) ABORT +#endif /* USAGE_ERROR_ACTION */ + +#endif /* PROCEED_ON_ERROR */ + +/* -------------------------- Debugging setup ---------------------------- */ + +#if ! 
DEBUG + +#define check_free_chunk(M,P) +#define check_inuse_chunk(M,P) +#define check_malloced_chunk(M,P,N) +#define check_mmapped_chunk(M,P) +#define check_malloc_state(M) +#define check_top_chunk(M,P) + +#else /* DEBUG */ +#define check_free_chunk(M,P) do_check_free_chunk(M,P) +#define check_inuse_chunk(M,P) do_check_inuse_chunk(M,P) +#define check_top_chunk(M,P) do_check_top_chunk(M,P) +#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N) +#define check_mmapped_chunk(M,P) do_check_mmapped_chunk(M,P) +#define check_malloc_state(M) do_check_malloc_state(M) + +static void do_check_any_chunk(mstate m, mchunkptr p); +static void do_check_top_chunk(mstate m, mchunkptr p); +static void do_check_mmapped_chunk(mstate m, mchunkptr p); +static void do_check_inuse_chunk(mstate m, mchunkptr p); +static void do_check_free_chunk(mstate m, mchunkptr p); +static void do_check_malloced_chunk(mstate m, void* mem, size_t s); +static void do_check_tree(mstate m, tchunkptr t); +static void do_check_treebin(mstate m, bindex_t i); +static void do_check_smallbin(mstate m, bindex_t i); +static void do_check_malloc_state(mstate m); +static int bin_find(mstate m, mchunkptr x); +static size_t traverse_and_check(mstate m); +#endif /* DEBUG */ + +/* ---------------------------- Indexing Bins ---------------------------- */ + +#define is_small(s) (((s) >> SMALLBIN_SHIFT) < NSMALLBINS) +#define small_index(s) ((s) >> SMALLBIN_SHIFT) +#define small_index2size(i) ((i) << SMALLBIN_SHIFT) +#define MIN_SMALL_INDEX (small_index(MIN_CHUNK_SIZE)) + +/* addressing by index. See above about smallbin repositioning */ +#define smallbin_at(M, i) ((sbinptr)((char*)&((M)->smallbins[(i)<<1]))) +#define treebin_at(M,i) (&((M)->treebins[i])) + +/* assign tree index for size S to variable I. Use x86 asm if possible */ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define compute_tree_index(S, I)\ +{\ + unsigned int X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int K;\ + __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "g" (X));\ + I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ + }\ +} + +#elif defined (__INTEL_COMPILER) +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int K = _bit_scan_reverse (X); \ + I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ + }\ +} + +#elif defined(_MSC_VER) && _MSC_VER>=1300 +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int K;\ + _BitScanReverse((DWORD *) &K, X);\ + I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ + }\ +} + +#else /* GNUC */ +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int Y = (unsigned int)X;\ + unsigned int N = ((Y - 0x100) >> 16) & 8;\ + unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\ + N += K;\ + N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\ + K = 14 - N + ((Y <<= K) >> 15);\ + I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\ + }\ +} +#endif /* GNUC */ + +/* Bit representing maximum resolved size in a treebin at i */ +#define bit_for_tree_index(i) \ + (i == NTREEBINS-1)? 
(SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2) + +/* Shift placing maximum resolved bit in a treebin at i as sign bit */ +#define leftshift_for_tree_index(i) \ + ((i == NTREEBINS-1)? 0 : \ + ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2))) + +/* The size of the smallest chunk held in bin with index i */ +#define minsize_for_tree_index(i) \ + ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) | \ + (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1))) + + +/* ------------------------ Operations on bin maps ----------------------- */ + +/* bit corresponding to given index */ +#define idx2bit(i) ((binmap_t)(1) << (i)) + +/* Mark/Clear bits with given index */ +#define mark_smallmap(M,i) ((M)->smallmap |= idx2bit(i)) +#define clear_smallmap(M,i) ((M)->smallmap &= ~idx2bit(i)) +#define smallmap_is_marked(M,i) ((M)->smallmap & idx2bit(i)) + +#define mark_treemap(M,i) ((M)->treemap |= idx2bit(i)) +#define clear_treemap(M,i) ((M)->treemap &= ~idx2bit(i)) +#define treemap_is_marked(M,i) ((M)->treemap & idx2bit(i)) + +/* isolate the least set bit of a bitmap */ +#define least_bit(x) ((x) & -(x)) + +/* mask with all bits to left of least bit of x on */ +#define left_bits(x) ((x<<1) | -(x<<1)) + +/* mask with all bits to left of or equal to least bit of x on */ +#define same_or_left_bits(x) ((x) | -(x)) + +/* index corresponding to given bit. Use x86 asm if possible */ + +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define compute_bit2idx(X, I)\ +{\ + unsigned int J;\ + __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "g" (X));\ + I = (bindex_t)J;\ +} + +#elif defined (__INTEL_COMPILER) +#define compute_bit2idx(X, I)\ +{\ + unsigned int J;\ + J = _bit_scan_forward (X); \ + I = (bindex_t)J;\ +} + +#elif defined(_MSC_VER) && _MSC_VER>=1300 +#define compute_bit2idx(X, I)\ +{\ + unsigned int J;\ + _BitScanForward((DWORD *) &J, X);\ + I = (bindex_t)J;\ +} + +#elif USE_BUILTIN_FFS +#define compute_bit2idx(X, I) I = ffs(X)-1 + +#else +#define compute_bit2idx(X, I)\ +{\ + unsigned int Y = X - 1;\ + unsigned int K = Y >> (16-4) & 16;\ + unsigned int N = K; Y >>= K;\ + N += K = Y >> (8-3) & 8; Y >>= K;\ + N += K = Y >> (4-2) & 4; Y >>= K;\ + N += K = Y >> (2-1) & 2; Y >>= K;\ + N += K = Y >> (1-0) & 1; Y >>= K;\ + I = (bindex_t)(N + Y);\ +} +#endif /* GNUC */ + + +/* ----------------------- Runtime Check Support ------------------------- */ + +/* + For security, the main invariant is that malloc/free/etc never + writes to a static address other than malloc_state, unless static + malloc_state itself has been corrupted, which cannot occur via + malloc (because of these checks). In essence this means that we + believe all pointers, sizes, maps etc held in malloc_state, but + check all of those linked or offsetted from other embedded data + structures. These checks are interspersed with main code in a way + that tends to minimize their run-time cost. + + When FOOTERS is defined, in addition to range checking, we also + verify footer fields of inuse chunks, which can be used guarantee + that the mstate controlling malloc/free is intact. This is a + streamlined version of the approach described by William Robertson + et al in "Run-time Detection of Heap-based Overflows" LISA'03 + http://www.usenix.org/events/lisa03/tech/robertson.html The footer + of an inuse chunk holds the xor of its mstate and a random seed, + that is checked upon calls to free() and realloc(). 
This is + (probablistically) unguessable from outside the program, but can be + computed by any code successfully malloc'ing any chunk, so does not + itself provide protection against code that has already broken + security through some other means. Unlike Robertson et al, we + always dynamically check addresses of all offset chunks (previous, + next, etc). This turns out to be cheaper than relying on hashes. +*/ + +#if !INSECURE +/* Check if address a is at least as high as any from MORECORE or MMAP */ +#define ok_address(M, a) ((char*)(a) >= (M)->least_addr) +/* Check if address of next chunk n is higher than base chunk p */ +#define ok_next(p, n) ((char*)(p) < (char*)(n)) +/* Check if p has inuse status */ +#define ok_inuse(p) is_inuse(p) +/* Check if p has its pinuse bit on */ +#define ok_pinuse(p) pinuse(p) + +#else /* !INSECURE */ +#define ok_address(M, a) (1) +#define ok_next(b, n) (1) +#define ok_inuse(p) (1) +#define ok_pinuse(p) (1) +#endif /* !INSECURE */ + +#if (FOOTERS && !INSECURE) +/* Check if (alleged) mstate m has expected magic field */ +#define ok_magic(M) ((M)->magic == mparams.magic) +#else /* (FOOTERS && !INSECURE) */ +#define ok_magic(M) (1) +#endif /* (FOOTERS && !INSECURE) */ + + +/* In gcc, use __builtin_expect to minimize impact of checks */ +#if !INSECURE +#if defined(__GNUC__) && __GNUC__ >= 3 +#define RTCHECK(e) __builtin_expect(e, 1) +#else /* GNUC */ +#define RTCHECK(e) (e) +#endif /* GNUC */ +#else /* !INSECURE */ +#define RTCHECK(e) (1) +#endif /* !INSECURE */ + +/* macros to set up inuse chunks with or without footers */ + +#if !FOOTERS + +#define mark_inuse_foot(M,p,s) + +/* Macros for setting head/foot of non-mmapped chunks */ + +/* Set cinuse bit and pinuse bit of next chunk */ +#define set_inuse(M,p,s)\ + ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ + ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) + +/* Set cinuse and pinuse of this chunk and pinuse of next chunk */ +#define set_inuse_and_pinuse(M,p,s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) + +/* Set size, cinuse and pinuse bit of this chunk */ +#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT)) + +#else /* FOOTERS */ + +/* Set foot of inuse chunk to be xor of mstate and seed */ +#define mark_inuse_foot(M,p,s)\ + (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic)) + +#define get_mstate_for(p)\ + ((mstate)(((mchunkptr)((char*)(p) +\ + (chunksize(p))))->prev_foot ^ mparams.magic)) + +#define set_inuse(M,p,s)\ + ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ + (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \ + mark_inuse_foot(M,p,s)) + +#define set_inuse_and_pinuse(M,p,s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\ + mark_inuse_foot(M,p,s)) + +#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + mark_inuse_foot(M, p, s)) + +#endif /* !FOOTERS */ + +/* ---------------------------- setting mparams -------------------------- */ + +/* Initialize mparams */ +static int init_mparams(void) { +#ifdef NEED_GLOBAL_LOCK_INIT + if (malloc_global_mutex_status <= 0) + init_malloc_global_mutex(); +#endif + + ACQUIRE_MALLOC_GLOBAL_LOCK(); + if (mparams.magic == 0) { + size_t magic; + size_t psize; + size_t gsize; + +#ifndef WIN32 + psize = malloc_getpagesize; + gsize = ((DEFAULT_GRANULARITY != 0)? 
DEFAULT_GRANULARITY : psize); +#else /* WIN32 */ + { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + psize = system_info.dwPageSize; + gsize = ((DEFAULT_GRANULARITY != 0)? + DEFAULT_GRANULARITY : system_info.dwAllocationGranularity); + } +#endif /* WIN32 */ + + /* Sanity-check configuration: + size_t must be unsigned and as wide as pointer type. + ints must be at least 4 bytes. + alignment must be at least 8. + Alignment, min chunk size, and page size must all be powers of 2. + */ + if ((sizeof(size_t) != sizeof(char*)) || + (MAX_SIZE_T < MIN_CHUNK_SIZE) || + (sizeof(int) < 4) || + (MALLOC_ALIGNMENT < (size_t)8U) || + ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) || + ((MCHUNK_SIZE & (MCHUNK_SIZE-SIZE_T_ONE)) != 0) || + ((gsize & (gsize-SIZE_T_ONE)) != 0) || + ((psize & (psize-SIZE_T_ONE)) != 0)) + ABORT; + + mparams.granularity = gsize; + mparams.page_size = psize; + mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD; + mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD; +#if MORECORE_CONTIGUOUS + mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT; +#else /* MORECORE_CONTIGUOUS */ + mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT; +#endif /* MORECORE_CONTIGUOUS */ + +#if !ONLY_MSPACES + /* Set up lock for main malloc area */ + gm->mflags = mparams.default_mflags; + INITIAL_LOCK(&gm->mutex); +#endif + + { +#if USE_DEV_RANDOM + int fd; + unsigned char buf[sizeof(size_t)]; + /* Try to use /dev/urandom, else fall back on using time */ + if ((fd = open("/dev/urandom", O_RDONLY)) >= 0 && + read(fd, buf, sizeof(buf)) == sizeof(buf)) { + magic = *((size_t *) buf); + close(fd); + } + else +#endif /* USE_DEV_RANDOM */ +#if 0 //dsm +#ifdef WIN32 + magic = (size_t)(GetTickCount() ^ (size_t)0x55555555U); +#else + magic = (size_t)(time(0) ^ (size_t)0x55555555U); +#endif +#endif + magic = 0xDEADBEEF; //dsm: Fixed, so that multiple processes can have the same value. + magic |= (size_t)8U; /* ensure nonzero */ + magic &= ~(size_t)7U; /* improve chances of fault for bad values */ + mparams.magic = magic; + } + } + + RELEASE_MALLOC_GLOBAL_LOCK(); + return 1; +} + +/* support for mallopt */ +static int change_mparam(int param_number, int value) { + size_t val; + ensure_initialization(); + val = (value == -1)? MAX_SIZE_T : (size_t)value; + switch(param_number) { + case M_TRIM_THRESHOLD: + mparams.trim_threshold = val; + return 1; + case M_GRANULARITY: + if (val >= mparams.page_size && ((val & (val-1)) == 0)) { + mparams.granularity = val; + return 1; + } + else + return 0; + case M_MMAP_THRESHOLD: + mparams.mmap_threshold = val; + return 1; + default: + return 0; + } +} + +#if DEBUG +/* ------------------------- Debugging Support --------------------------- */ + +/* Check properties of any chunk, whether free, inuse, mmapped etc */ +static void do_check_any_chunk(mstate m, mchunkptr p) { + assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + assert(ok_address(m, p)); +} + +/* Check properties of top chunk */ +static void do_check_top_chunk(mstate m, mchunkptr p) { + msegmentptr sp = segment_holding(m, (char*)p); + size_t sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! 
*/ + assert(sp != 0); + assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + assert(ok_address(m, p)); + assert(sz == m->topsize); + assert(sz > 0); + assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE); + assert(pinuse(p)); + assert(!pinuse(chunk_plus_offset(p, sz))); +} + +/* Check properties of (inuse) mmapped chunks */ +static void do_check_mmapped_chunk(mstate m, mchunkptr p) { + size_t sz = chunksize(p); + size_t len = (sz + (p->prev_foot) + MMAP_FOOT_PAD); + assert(is_mmapped(p)); + assert(use_mmap(m)); + assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + assert(ok_address(m, p)); + assert(!is_small(sz)); + assert((len & (mparams.page_size-SIZE_T_ONE)) == 0); + assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD); + assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0); +} + +/* Check properties of inuse chunks */ +static void do_check_inuse_chunk(mstate m, mchunkptr p) { + do_check_any_chunk(m, p); + assert(is_inuse(p)); + assert(next_pinuse(p)); + /* If not pinuse and not mmapped, previous chunk has OK offset */ + assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p); + if (is_mmapped(p)) + do_check_mmapped_chunk(m, p); +} + +/* Check properties of free chunks */ +static void do_check_free_chunk(mstate m, mchunkptr p) { + size_t sz = chunksize(p); + mchunkptr next = chunk_plus_offset(p, sz); + do_check_any_chunk(m, p); + assert(!is_inuse(p)); + assert(!next_pinuse(p)); + assert (!is_mmapped(p)); + if (p != m->dv && p != m->top) { + if (sz >= MIN_CHUNK_SIZE) { + assert((sz & CHUNK_ALIGN_MASK) == 0); + assert(is_aligned(chunk2mem(p))); + assert(next->prev_foot == sz); + assert(pinuse(p)); + assert (next == m->top || is_inuse(next)); + assert(p->fd->bk == p); + assert(p->bk->fd == p); + } + else /* markers are always of size SIZE_T_SIZE */ + assert(sz == SIZE_T_SIZE); + } +} + +/* Check properties of malloced chunks at the point they are malloced */ +static void do_check_malloced_chunk(mstate m, void* mem, size_t s) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + size_t sz = p->head & ~INUSE_BITS; + do_check_inuse_chunk(m, p); + assert((sz & CHUNK_ALIGN_MASK) == 0); + assert(sz >= MIN_CHUNK_SIZE); + assert(sz >= s); + /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */ + assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE)); + } +} + +/* Check a tree and its subtrees. 
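+   Each treebin holds a bitwise trie keyed on chunk size; same-sized
+   chunks hang off one tree node in a circular fd/bk list, and only
+   the list head carries parent/child links, which is what the checks
+   on 'head' below verify.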
*/ +static void do_check_tree(mstate m, tchunkptr t) { + tchunkptr head = 0; + tchunkptr u = t; + bindex_t tindex = t->index; + size_t tsize = chunksize(t); + bindex_t idx; + compute_tree_index(tsize, idx); + assert(tindex == idx); + assert(tsize >= MIN_LARGE_SIZE); + assert(tsize >= minsize_for_tree_index(idx)); + assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1)))); + + do { /* traverse through chain of same-sized nodes */ + do_check_any_chunk(m, ((mchunkptr)u)); + assert(u->index == tindex); + assert(chunksize(u) == tsize); + assert(!is_inuse(u)); + assert(!next_pinuse(u)); + assert(u->fd->bk == u); + assert(u->bk->fd == u); + if (u->parent == 0) { + assert(u->child[0] == 0); + assert(u->child[1] == 0); + } + else { + assert(head == 0); /* only one node on chain has parent */ + head = u; + assert(u->parent != u); + assert (u->parent->child[0] == u || + u->parent->child[1] == u || + *((tbinptr*)(u->parent)) == u); + if (u->child[0] != 0) { + assert(u->child[0]->parent == u); + assert(u->child[0] != u); + do_check_tree(m, u->child[0]); + } + if (u->child[1] != 0) { + assert(u->child[1]->parent == u); + assert(u->child[1] != u); + do_check_tree(m, u->child[1]); + } + if (u->child[0] != 0 && u->child[1] != 0) { + assert(chunksize(u->child[0]) < chunksize(u->child[1])); + } + } + u = u->fd; + } while (u != t); + assert(head != 0); +} + +/* Check all the chunks in a treebin. */ +static void do_check_treebin(mstate m, bindex_t i) { + tbinptr* tb = treebin_at(m, i); + tchunkptr t = *tb; + int empty = (m->treemap & (1U << i)) == 0; + if (t == 0) + assert(empty); + if (!empty) + do_check_tree(m, t); +} + +/* Check all the chunks in a smallbin. */ +static void do_check_smallbin(mstate m, bindex_t i) { + sbinptr b = smallbin_at(m, i); + mchunkptr p = b->bk; + unsigned int empty = (m->smallmap & (1U << i)) == 0; + if (p == b) + assert(empty); + if (!empty) { + for (; p != b; p = p->bk) { + size_t size = chunksize(p); + mchunkptr q; + /* each chunk claims to be free */ + do_check_free_chunk(m, p); + /* chunk belongs in bin */ + assert(small_index(size) == i); + assert(p->bk == b || chunksize(p->bk) == chunksize(p)); + /* chunk is followed by an inuse chunk */ + q = next_chunk(p); + if (q->head != FENCEPOST_HEAD) + do_check_inuse_chunk(m, q); + } + } +} + +/* Find x in a bin. Used in other check functions. 
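+   Returns nonzero iff chunk x is currently linked into the smallbin
+   or treebin its size maps to; traverse_and_check below uses this to
+   assert that free chunks are binned and in-use chunks are not.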
*/ +static int bin_find(mstate m, mchunkptr x) { + size_t size = chunksize(x); + if (is_small(size)) { + bindex_t sidx = small_index(size); + sbinptr b = smallbin_at(m, sidx); + if (smallmap_is_marked(m, sidx)) { + mchunkptr p = b; + do { + if (p == x) + return 1; + } while ((p = p->fd) != b); + } + } + else { + bindex_t tidx; + compute_tree_index(size, tidx); + if (treemap_is_marked(m, tidx)) { + tchunkptr t = *treebin_at(m, tidx); + size_t sizebits = size << leftshift_for_tree_index(tidx); + while (t != 0 && chunksize(t) != size) { + t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]; + sizebits <<= 1; + } + if (t != 0) { + tchunkptr u = t; + do { + if (u == (tchunkptr)x) + return 1; + } while ((u = u->fd) != t); + } + } + } + return 0; +} + +/* Traverse each chunk and check it; return total */ +static size_t traverse_and_check(mstate m) { + size_t sum = 0; + if (is_initialized(m)) { + msegmentptr s = &m->seg; + sum += m->topsize + TOP_FOOT_SIZE; + while (s != 0) { + mchunkptr q = align_as_chunk(s->base); + mchunkptr lastq = 0; + assert(pinuse(q)); + while (segment_holds(s, q) && + q != m->top && q->head != FENCEPOST_HEAD) { + sum += chunksize(q); + if (is_inuse(q)) { + assert(!bin_find(m, q)); + do_check_inuse_chunk(m, q); + } + else { + assert(q == m->dv || bin_find(m, q)); + assert(lastq == 0 || is_inuse(lastq)); /* Not 2 consecutive free */ + do_check_free_chunk(m, q); + } + lastq = q; + q = next_chunk(q); + } + s = s->next; + } + } + return sum; +} + +/* Check all properties of malloc_state. */ +static void do_check_malloc_state(mstate m) { + bindex_t i; + size_t total; + /* check bins */ + for (i = 0; i < NSMALLBINS; ++i) + do_check_smallbin(m, i); + for (i = 0; i < NTREEBINS; ++i) + do_check_treebin(m, i); + + if (m->dvsize != 0) { /* check dv chunk */ + do_check_any_chunk(m, m->dv); + assert(m->dvsize == chunksize(m->dv)); + assert(m->dvsize >= MIN_CHUNK_SIZE); + assert(bin_find(m, m->dv) == 0); + } + + if (m->top != 0) { /* check top chunk */ + do_check_top_chunk(m, m->top); + /*assert(m->topsize == chunksize(m->top)); redundant */ + assert(m->topsize > 0); + assert(bin_find(m, m->top) == 0); + } + + total = traverse_and_check(m); + assert(total <= m->footprint); + assert(m->footprint <= m->max_footprint); +} +#endif /* DEBUG */ + +/* ----------------------------- statistics ------------------------------ */ + +#if !NO_MALLINFO +static struct mallinfo internal_mallinfo(mstate m) { + struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + ensure_initialization(); + if (!PREACTION(m)) { + check_malloc_state(m); + if (is_initialized(m)) { + size_t nfree = SIZE_T_ONE; /* top always free */ + size_t mfree = m->topsize + TOP_FOOT_SIZE; + size_t sum = mfree; + msegmentptr s = &m->seg; + while (s != 0) { + mchunkptr q = align_as_chunk(s->base); + while (segment_holds(s, q) && + q != m->top && q->head != FENCEPOST_HEAD) { + size_t sz = chunksize(q); + sum += sz; + if (!is_inuse(q)) { + mfree += sz; + ++nfree; + } + q = next_chunk(q); + } + s = s->next; + } + + nm.arena = sum; + nm.ordblks = nfree; + nm.hblkhd = m->footprint - sum; + nm.usmblks = m->max_footprint; + nm.uordblks = m->footprint - mfree; + nm.fordblks = mfree; + nm.keepcost = m->topsize; + } + + POSTACTION(m); + } + return nm; +} +#endif /* !NO_MALLINFO */ + +static void internal_malloc_stats(mstate m) { + ensure_initialization(); + if (!PREACTION(m)) { + size_t maxfp = 0; + size_t fp = 0; + size_t used = 0; + check_malloc_state(m); + if (is_initialized(m)) { + msegmentptr s = &m->seg; + maxfp = m->max_footprint; + 
fp = m->footprint; + used = fp - (m->topsize + TOP_FOOT_SIZE); + + while (s != 0) { + mchunkptr q = align_as_chunk(s->base); + while (segment_holds(s, q) && + q != m->top && q->head != FENCEPOST_HEAD) { + if (!is_inuse(q)) + used -= chunksize(q); + q = next_chunk(q); + } + s = s->next; + } + } + + fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp)); + fprintf(stderr, "system bytes = %10lu\n", (unsigned long)(fp)); + fprintf(stderr, "in use bytes = %10lu\n", (unsigned long)(used)); + + POSTACTION(m); + } +} + +/* ----------------------- Operations on smallbins ----------------------- */ + +/* + Various forms of linking and unlinking are defined as macros. Even + the ones for trees, which are very long but have very short typical + paths. This is ugly but reduces reliance on inlining support of + compilers. +*/ + +/* Link a free chunk into a smallbin */ +#define insert_small_chunk(M, P, S) {\ + bindex_t I = small_index(S);\ + mchunkptr B = smallbin_at(M, I);\ + mchunkptr F = B;\ + assert(S >= MIN_CHUNK_SIZE);\ + if (!smallmap_is_marked(M, I))\ + mark_smallmap(M, I);\ + else if (RTCHECK(ok_address(M, B->fd)))\ + F = B->fd;\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + B->fd = P;\ + F->bk = P;\ + P->fd = F;\ + P->bk = B;\ +} + +/* Unlink a chunk from a smallbin */ +#define unlink_small_chunk(M, P, S) {\ + mchunkptr F = P->fd;\ + mchunkptr B = P->bk;\ + bindex_t I = small_index(S);\ + assert(P != B);\ + assert(P != F);\ + assert(chunksize(P) == small_index2size(I));\ + if (F == B)\ + clear_smallmap(M, I);\ + else if (RTCHECK((F == smallbin_at(M,I) || ok_address(M, F)) &&\ + (B == smallbin_at(M,I) || ok_address(M, B)))) {\ + F->bk = B;\ + B->fd = F;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ +} + +/* Unlink the first chunk from a smallbin */ +#define unlink_first_small_chunk(M, B, P, I) {\ + mchunkptr F = P->fd;\ + assert(P != B);\ + assert(P != F);\ + assert(chunksize(P) == small_index2size(I));\ + if (B == F)\ + clear_smallmap(M, I);\ + else if (RTCHECK(ok_address(M, F))) {\ + B->fd = F;\ + F->bk = B;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ +} + + + +/* Replace dv node, binning the old one */ +/* Used only when dvsize known to be small */ +#define replace_dv(M, P, S) {\ + size_t DVS = M->dvsize;\ + if (DVS != 0) {\ + mchunkptr DV = M->dv;\ + assert(is_small(DVS));\ + insert_small_chunk(M, DV, DVS);\ + }\ + M->dvsize = S;\ + M->dv = P;\ +} + +/* ------------------------- Operations on trees ------------------------- */ + +/* Insert chunk into tree */ +#define insert_large_chunk(M, X, S) {\ + tbinptr* H;\ + bindex_t I;\ + compute_tree_index(S, I);\ + H = treebin_at(M, I);\ + X->index = I;\ + X->child[0] = X->child[1] = 0;\ + if (!treemap_is_marked(M, I)) {\ + mark_treemap(M, I);\ + *H = X;\ + X->parent = (tchunkptr)H;\ + X->fd = X->bk = X;\ + }\ + else {\ + tchunkptr T = *H;\ + size_t K = S << leftshift_for_tree_index(I);\ + for (;;) {\ + if (chunksize(T) != S) {\ + tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\ + K <<= 1;\ + if (*C != 0)\ + T = *C;\ + else if (RTCHECK(ok_address(M, C))) {\ + *C = X;\ + X->parent = T;\ + X->fd = X->bk = X;\ + break;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + break;\ + }\ + }\ + else {\ + tchunkptr F = T->fd;\ + if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\ + T->fd = F->bk = X;\ + X->fd = F;\ + X->bk = T;\ + X->parent = 0;\ + break;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + break;\ + }\ + }\ + }\ + }\ +} + +/* + Unlink steps: + + 1. 
If x is a chained node, unlink it from its same-sized fd/bk links + and choose its bk node as its replacement. + 2. If x was the last node of its size, but not a leaf node, it must + be replaced with a leaf node (not merely one with an open left or + right), to make sure that lefts and rights of descendents + correspond properly to bit masks. We use the rightmost descendent + of x. We could use any other leaf, but this is easy to locate and + tends to counteract removal of leftmosts elsewhere, and so keeps + paths shorter than minimally guaranteed. This doesn't loop much + because on average a node in a tree is near the bottom. + 3. If x is the base of a chain (i.e., has parent links) relink + x's parent and children to x's replacement (or null if none). +*/ + +#define unlink_large_chunk(M, X) {\ + tchunkptr XP = X->parent;\ + tchunkptr R;\ + if (X->bk != X) {\ + tchunkptr F = X->fd;\ + R = X->bk;\ + if (RTCHECK(ok_address(M, F))) {\ + F->bk = R;\ + R->fd = F;\ + }\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ + else {\ + tchunkptr* RP;\ + if (((R = *(RP = &(X->child[1]))) != 0) ||\ + ((R = *(RP = &(X->child[0]))) != 0)) {\ + tchunkptr* CP;\ + while ((*(CP = &(R->child[1])) != 0) ||\ + (*(CP = &(R->child[0])) != 0)) {\ + R = *(RP = CP);\ + }\ + if (RTCHECK(ok_address(M, RP)))\ + *RP = 0;\ + else {\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ + }\ + if (XP != 0) {\ + tbinptr* H = treebin_at(M, X->index);\ + if (X == *H) {\ + if ((*H = R) == 0) \ + clear_treemap(M, X->index);\ + }\ + else if (RTCHECK(ok_address(M, XP))) {\ + if (XP->child[0] == X) \ + XP->child[0] = R;\ + else \ + XP->child[1] = R;\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + if (R != 0) {\ + if (RTCHECK(ok_address(M, R))) {\ + tchunkptr C0, C1;\ + R->parent = XP;\ + if ((C0 = X->child[0]) != 0) {\ + if (RTCHECK(ok_address(M, C0))) {\ + R->child[0] = C0;\ + C0->parent = R;\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + if ((C1 = X->child[1]) != 0) {\ + if (RTCHECK(ok_address(M, C1))) {\ + R->child[1] = C1;\ + C1->parent = R;\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ + else\ + CORRUPTION_ERROR_ACTION(M);\ + }\ + }\ +} + +/* Relays to large vs small bin operations */ + +#define insert_chunk(M, P, S)\ + if (is_small(S)) insert_small_chunk(M, P, S)\ + else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); } + +#define unlink_chunk(M, P, S)\ + if (is_small(S)) unlink_small_chunk(M, P, S)\ + else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); } + + +/* Relays to internal calls to malloc/free from realloc, memalign etc */ + +#if ONLY_MSPACES +#define internal_malloc(m, b) mspace_malloc(m, b) +#define internal_free(m, mem) mspace_free(m,mem); +#else /* ONLY_MSPACES */ +#if MSPACES +#define internal_malloc(m, b)\ + (m == gm)? dlmalloc(b) : mspace_malloc(m, b) +#define internal_free(m, mem)\ + if (m == gm) dlfree(mem); else mspace_free(m,mem); +#else /* MSPACES */ +#define internal_malloc(m, b) dlmalloc(b) +#define internal_free(m, mem) dlfree(mem) +#endif /* MSPACES */ +#endif /* ONLY_MSPACES */ + +/* ----------------------- Direct-mmapping chunks ----------------------- */ + +/* + Directly mmapped chunks are set up with an offset to the start of + the mmapped region stored in the prev_foot field of the chunk. This + allows reconstruction of the required argument to MUNMAP when freed, + and also allows adjustment of the returned chunk to meet alignment + requirements (especially in memalign). 
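+  For example, mmap_alloc below records the alignment offset in
+  p->prev_foot, so a later free can rebuild the munmap arguments as
+  roughly CALL_MUNMAP((char*)p - p->prev_foot,
+  chunksize(p) + p->prev_foot + MMAP_FOOT_PAD).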
+*/ + +/* Malloc using mmap */ +static void* mmap_alloc(mstate m, size_t nb) { + size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + if (mmsize > nb) { /* Check for wrap around 0 */ + char* mm = (char*)(CALL_DIRECT_MMAP(mmsize)); + if (mm != CMFAIL) { + size_t offset = align_offset(chunk2mem(mm)); + size_t psize = mmsize - offset - MMAP_FOOT_PAD; + mchunkptr p = (mchunkptr)(mm + offset); + p->prev_foot = offset; + p->head = psize; + mark_inuse_foot(m, p, psize); + chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD; + chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0; + + if (m->least_addr == 0 || mm < m->least_addr) + m->least_addr = mm; + if ((m->footprint += mmsize) > m->max_footprint) + m->max_footprint = m->footprint; + assert(is_aligned(chunk2mem(p))); + check_mmapped_chunk(m, p); + return chunk2mem(p); + } + } + return 0; +} + +/* Realloc using mmap */ +static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) { + size_t oldsize = chunksize(oldp); + if (is_small(nb)) /* Can't shrink mmap regions below small size */ + return 0; + /* Keep old chunk if big enough but not too big */ + if (oldsize >= nb + SIZE_T_SIZE && + (oldsize - nb) <= (mparams.granularity << 1)) + return oldp; + else { + size_t offset = oldp->prev_foot; + size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD; + size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + char* cp = (char*)CALL_MREMAP((char*)oldp - offset, + oldmmsize, newmmsize, 1); + if (cp != CMFAIL) { + mchunkptr newp = (mchunkptr)(cp + offset); + size_t psize = newmmsize - offset - MMAP_FOOT_PAD; + newp->head = psize; + mark_inuse_foot(m, newp, psize); + chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD; + chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0; + + if (cp < m->least_addr) + m->least_addr = cp; + if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint) + m->max_footprint = m->footprint; + check_mmapped_chunk(m, newp); + return newp; + } + } + return 0; +} + +/* -------------------------- mspace management -------------------------- */ + +/* Initialize top chunk and its size */ +static void init_top(mstate m, mchunkptr p, size_t psize) { + /* Ensure alignment */ + size_t offset = align_offset(chunk2mem(p)); + p = (mchunkptr)((char*)p + offset); + psize -= offset; + + m->top = p; + m->topsize = psize; + p->head = psize | PINUSE_BIT; + /* set size of fake trailing chunk holding overhead space only once */ + chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE; + m->trim_check = mparams.trim_threshold; /* reset on each update */ +} + +/* Initialize bins for a new mstate that is otherwise zeroed out */ +static void init_bins(mstate m) { + /* Establish circular links for smallbins */ + bindex_t i; + for (i = 0; i < NSMALLBINS; ++i) { + sbinptr bin = smallbin_at(m,i); + bin->fd = bin->bk = bin; + } +} + +#if PROCEED_ON_ERROR + +/* default corruption action */ +static void reset_on_error(mstate m) { + int i; + ++malloc_corruption_error_count; + /* Reinitialize fields to forget about all memory */ + m->smallbins = m->treebins = 0; + m->dvsize = m->topsize = 0; + m->seg.base = 0; + m->seg.size = 0; + m->seg.next = 0; + m->top = m->dv = 0; + for (i = 0; i < NTREEBINS; ++i) + *treebin_at(m, i) = 0; + init_bins(m); +} +#endif /* PROCEED_ON_ERROR */ + +/* Allocate chunk and prepend remainder with chunk in successor base. 
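+   Called from sys_alloc when newly obtained memory ends exactly where
+   an existing segment begins: nb bytes are carved out at the new
+   base, and the leftover space is merged into the old segment's first
+   chunk (top, dv, or an ordinary free chunk).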
*/ +static void* prepend_alloc(mstate m, char* newbase, char* oldbase, + size_t nb) { + mchunkptr p = align_as_chunk(newbase); + mchunkptr oldfirst = align_as_chunk(oldbase); + size_t psize = (char*)oldfirst - (char*)p; + mchunkptr q = chunk_plus_offset(p, nb); + size_t qsize = psize - nb; + set_size_and_pinuse_of_inuse_chunk(m, p, nb); + + assert((char*)oldfirst > (char*)q); + assert(pinuse(oldfirst)); + assert(qsize >= MIN_CHUNK_SIZE); + + /* consolidate remainder with first chunk of old base */ + if (oldfirst == m->top) { + size_t tsize = m->topsize += qsize; + m->top = q; + q->head = tsize | PINUSE_BIT; + check_top_chunk(m, q); + } + else if (oldfirst == m->dv) { + size_t dsize = m->dvsize += qsize; + m->dv = q; + set_size_and_pinuse_of_free_chunk(q, dsize); + } + else { + if (!is_inuse(oldfirst)) { + size_t nsize = chunksize(oldfirst); + unlink_chunk(m, oldfirst, nsize); + oldfirst = chunk_plus_offset(oldfirst, nsize); + qsize += nsize; + } + set_free_with_pinuse(q, qsize, oldfirst); + insert_chunk(m, q, qsize); + check_free_chunk(m, q); + } + + check_malloced_chunk(m, chunk2mem(p), nb); + return chunk2mem(p); +} + +/* Add a segment to hold a new noncontiguous region */ +static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) { + /* Determine locations and sizes of segment, fenceposts, old top */ + char* old_top = (char*)m->top; + msegmentptr oldsp = segment_holding(m, old_top); + char* old_end = oldsp->base + oldsp->size; + size_t ssize = pad_request(sizeof(struct malloc_segment)); + char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + size_t offset = align_offset(chunk2mem(rawsp)); + char* asp = rawsp + offset; + char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp; + mchunkptr sp = (mchunkptr)csp; + msegmentptr ss = (msegmentptr)(chunk2mem(sp)); + mchunkptr tnext = chunk_plus_offset(sp, ssize); + mchunkptr p = tnext; + int nfences = 0; + + /* reset top to new space */ + init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE); + + /* Set up segment record */ + assert(is_aligned(ss)); + set_size_and_pinuse_of_inuse_chunk(m, sp, ssize); + *ss = m->seg; /* Push current record */ + m->seg.base = tbase; + m->seg.size = tsize; + m->seg.sflags = mmapped; + m->seg.next = ss; + + /* Insert trailing fenceposts */ + for (;;) { + mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE); + p->head = FENCEPOST_HEAD; + ++nfences; + if ((char*)(&(nextp->head)) < old_end) + p = nextp; + else + break; + } + assert(nfences >= 2); + + /* Insert the rest of old top into a bin as an ordinary free chunk */ + if (csp != old_top) { + mchunkptr q = (mchunkptr)old_top; + size_t psize = csp - old_top; + mchunkptr tn = chunk_plus_offset(q, psize); + set_free_with_pinuse(q, psize, tn); + insert_chunk(m, q, psize); + } + + check_top_chunk(m, m->top); +} + +/* -------------------------- System allocation -------------------------- */ + +/* Get memory from system using MORECORE or MMAP */ +static void* sys_alloc(mstate m, size_t nb) { + char* tbase = CMFAIL; + size_t tsize = 0; + flag_t mmap_flag = 0; + + ensure_initialization(); + + /* Directly map large chunks, but only if already initialized */ + if (use_mmap(m) && nb >= mparams.mmap_threshold && m->topsize != 0) { + void* mem = mmap_alloc(m, nb); + if (mem != 0) + return mem; + } + + /* + Try getting memory in any of three ways (in most-preferred to + least-preferred order): + 1. A call to MORECORE that can normally contiguously extend memory. 
+ (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE or + or main space is mmapped or a previous contiguous call failed) + 2. A call to MMAP new space (disabled if not HAVE_MMAP). + Note that under the default settings, if MORECORE is unable to + fulfill a request, and HAVE_MMAP is true, then mmap is + used as a noncontiguous system allocator. This is a useful backup + strategy for systems with holes in address spaces -- in this case + sbrk cannot contiguously expand the heap, but mmap may be able to + find space. + 3. A call to MORECORE that cannot usually contiguously extend memory. + (disabled if not HAVE_MORECORE) + + In all cases, we need to request enough bytes from system to ensure + we can malloc nb bytes upon success, so pad with enough space for + top_foot, plus alignment-pad to make sure we don't lose bytes if + not on boundary, and round this up to a granularity unit. + */ + + if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) { + char* br = CMFAIL; + msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top); + size_t asize = 0; + ACQUIRE_MALLOC_GLOBAL_LOCK(); + + if (ss == 0) { /* First time through or recovery */ + char* base = (char*)CALL_MORECORE(0); + if (base != CMFAIL) { + asize = granularity_align(nb + SYS_ALLOC_PADDING); + /* Adjust to end on a page boundary */ + if (!is_page_aligned(base)) + asize += (page_align((size_t)base) - (size_t)base); + /* Can't call MORECORE if size is negative when treated as signed */ + if (asize < HALF_MAX_SIZE_T && + (br = (char*)(CALL_MORECORE(asize))) == base) { + tbase = base; + tsize = asize; + } + } + } + else { + /* Subtract out existing available top space from MORECORE request. */ + asize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING); + /* Use mem here only if it did continuously extend old space */ + if (asize < HALF_MAX_SIZE_T && + (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) { + tbase = br; + tsize = asize; + } + } + + if (tbase == CMFAIL) { /* Cope with partial failure */ + if (br != CMFAIL) { /* Try to use/extend the space we did get */ + if (asize < HALF_MAX_SIZE_T && + asize < nb + SYS_ALLOC_PADDING) { + size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - asize); + if (esize < HALF_MAX_SIZE_T) { + char* end = (char*)CALL_MORECORE(esize); + if (end != CMFAIL) + asize += esize; + else { /* Can't use; try to release */ + (void) CALL_MORECORE(-asize); + br = CMFAIL; + } + } + } + } + if (br != CMFAIL) { /* Use the space we did get */ + tbase = br; + tsize = asize; + } + else + disable_contiguous(m); /* Don't try contiguous path in the future */ + } + + RELEASE_MALLOC_GLOBAL_LOCK(); + } + + if (HAVE_MMAP && tbase == CMFAIL) { /* Try MMAP */ + size_t rsize = granularity_align(nb + SYS_ALLOC_PADDING); + if (rsize > nb) { /* Fail if wraps around zero */ + char* mp = (char*)(CALL_MMAP(rsize)); + if (mp != CMFAIL) { + tbase = mp; + tsize = rsize; + mmap_flag = USE_MMAP_BIT; + } + } + } + + if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */ + size_t asize = granularity_align(nb + SYS_ALLOC_PADDING); + if (asize < HALF_MAX_SIZE_T) { + char* br = CMFAIL; + char* end = CMFAIL; + ACQUIRE_MALLOC_GLOBAL_LOCK(); + br = (char*)(CALL_MORECORE(asize)); + end = (char*)(CALL_MORECORE(0)); + RELEASE_MALLOC_GLOBAL_LOCK(); + if (br != CMFAIL && end != CMFAIL && br < end) { + size_t ssize = end - br; + if (ssize > nb + TOP_FOOT_SIZE) { + tbase = br; + tsize = ssize; + } + } + } + } + + if (tbase != CMFAIL) { + + if ((m->footprint += tsize) > m->max_footprint) + m->max_footprint = 
m->footprint; + + if (!is_initialized(m)) { /* first-time initialization */ + if (m->least_addr == 0 || tbase < m->least_addr) + m->least_addr = tbase; + m->seg.base = tbase; + m->seg.size = tsize; + m->seg.sflags = mmap_flag; + m->magic = mparams.magic; + m->release_checks = MAX_RELEASE_CHECK_RATE; + init_bins(m); +#if !ONLY_MSPACES + if (is_global(m)) + init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE); + else +#endif + { + /* Offset top by embedded malloc_state */ + mchunkptr mn = next_chunk(mem2chunk(m)); + init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE); + } + } + + else { + /* Try to merge with an existing segment */ + msegmentptr sp = &m->seg; + /* Only consider most recent segment if traversal suppressed */ + while (sp != 0 && tbase != sp->base + sp->size) + sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next; + if (sp != 0 && + !is_extern_segment(sp) && + (sp->sflags & USE_MMAP_BIT) == mmap_flag && + segment_holds(sp, m->top)) { /* append */ + sp->size += tsize; + init_top(m, m->top, m->topsize + tsize); + } + else { + if (tbase < m->least_addr) + m->least_addr = tbase; + sp = &m->seg; + while (sp != 0 && sp->base != tbase + tsize) + sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next; + if (sp != 0 && + !is_extern_segment(sp) && + (sp->sflags & USE_MMAP_BIT) == mmap_flag) { + char* oldbase = sp->base; + sp->base = tbase; + sp->size += tsize; + return prepend_alloc(m, tbase, oldbase, nb); + } + else + add_segment(m, tbase, tsize, mmap_flag); + } + } + + if (nb < m->topsize) { /* Allocate from new or extended top space */ + size_t rsize = m->topsize -= nb; + mchunkptr p = m->top; + mchunkptr r = m->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(m, p, nb); + check_top_chunk(m, m->top); + check_malloced_chunk(m, chunk2mem(p), nb); + return chunk2mem(p); + } + } + + MALLOC_FAILURE_ACTION; + return 0; +} + +/* ----------------------- system deallocation -------------------------- */ + +/* Unmap and unlink any mmapped segments that don't contain used chunks */ +static size_t release_unused_segments(mstate m) { + size_t released = 0; + unsigned int nsegs = 0; //dsm: Was signed, and gcc was complaining about signed vs unsigned comparisons + msegmentptr pred = &m->seg; + msegmentptr sp = pred->next; + while (sp != 0) { + char* base = sp->base; + size_t size = sp->size; + msegmentptr next = sp->next; + ++nsegs; + if (is_mmapped_segment(sp) && !is_extern_segment(sp)) { + mchunkptr p = align_as_chunk(base); + size_t psize = chunksize(p); + /* Can unmap if first chunk holds entire segment and not pinned */ + if (!is_inuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) { + tchunkptr tp = (tchunkptr)p; + assert(segment_holds(sp, (char*)sp)); + if (p == m->dv) { + m->dv = 0; + m->dvsize = 0; + } + else { + unlink_large_chunk(m, tp); + } + if (CALL_MUNMAP(base, size) == 0) { + released += size; + m->footprint -= size; + /* unlink obsoleted record */ + sp = pred; + sp->next = next; + } + else { /* back out if cannot unmap */ + insert_large_chunk(m, tp, psize); + } + } + } + if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */ + break; + pred = sp; + sp = next; + } + /* Reset check counter */ + m->release_checks = ((nsegs > MAX_RELEASE_CHECK_RATE)? 
+ nsegs : MAX_RELEASE_CHECK_RATE); + return released; +} + +static int sys_trim(mstate m, size_t pad) { + size_t released = 0; + ensure_initialization(); + if (pad < MAX_REQUEST && is_initialized(m)) { + pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */ + + if (m->topsize > pad) { + /* Shrink top space in granularity-size units, keeping at least one */ + size_t unit = mparams.granularity; + size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit - + SIZE_T_ONE) * unit; + msegmentptr sp = segment_holding(m, (char*)m->top); + + if (!is_extern_segment(sp)) { + if (is_mmapped_segment(sp)) { + if (HAVE_MMAP && + sp->size >= extra && + !has_segment_link(m, sp)) { /* can't shrink if pinned */ + size_t newsize = sp->size - extra; + (void)newsize; //dsm: Kill unused warning. Without MMAP, newsize is not used, so the compiler will complain (even though HAVE_MMAP == 0 and this code will be eliminated) + /* Prefer mremap, fall back to munmap */ + if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) || + (CALL_MUNMAP(sp->base + newsize, extra) == 0)) { + released = extra; + } + } + } + else if (HAVE_MORECORE) { + if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */ + extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit; + ACQUIRE_MALLOC_GLOBAL_LOCK(); + { + /* Make sure end of memory is where we last set it. */ + char* old_br = (char*)(CALL_MORECORE(0)); + if (old_br == sp->base + sp->size) { + char* rel_br = (char*)(CALL_MORECORE(-extra)); + char* new_br = (char*)(CALL_MORECORE(0)); + if (rel_br != CMFAIL && new_br < old_br) + released = old_br - new_br; + } + } + RELEASE_MALLOC_GLOBAL_LOCK(); + } + } + + if (released != 0) { + sp->size -= released; + m->footprint -= released; + init_top(m, m->top, m->topsize - released); + check_top_chunk(m, m->top); + } + } + + /* Unmap any unused mmapped segments */ + if (HAVE_MMAP) + released += release_unused_segments(m); + + /* On failure, disable autotrim to avoid repeated failed future calls */ + if (released == 0 && m->topsize > m->trim_check) + m->trim_check = MAX_SIZE_T; + } + + return (released != 0)? 
1 : 0; +} + + +/* ---------------------------- malloc support --------------------------- */ + +/* allocate a large request from the best fitting chunk in a treebin */ +static void* tmalloc_large(mstate m, size_t nb) { + tchunkptr v = 0; + size_t rsize = -nb; /* Unsigned negation */ + tchunkptr t; + bindex_t idx; + compute_tree_index(nb, idx); + if ((t = *treebin_at(m, idx)) != 0) { + /* Traverse tree for this bin looking for node with size == nb */ + size_t sizebits = nb << leftshift_for_tree_index(idx); + tchunkptr rst = 0; /* The deepest untaken right subtree */ + for (;;) { + tchunkptr rt; + size_t trem = chunksize(t) - nb; + if (trem < rsize) { + v = t; + if ((rsize = trem) == 0) + break; + } + rt = t->child[1]; + t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]; + if (rt != 0 && rt != t) + rst = rt; + if (t == 0) { + t = rst; /* set t to least subtree holding sizes > nb */ + break; + } + sizebits <<= 1; + } + } + if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */ + binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap; + if (leftbits != 0) { + bindex_t i; + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + t = *treebin_at(m, i); + } + } + + while (t != 0) { /* find smallest of tree or subtree */ + size_t trem = chunksize(t) - nb; + if (trem < rsize) { + rsize = trem; + v = t; + } + t = leftmost_child(t); + } + + /* If dv is a better fit, return 0 so malloc will use it */ + if (v != 0 && rsize < (size_t)(m->dvsize - nb)) { + if (RTCHECK(ok_address(m, v))) { /* split */ + mchunkptr r = chunk_plus_offset(v, nb); + assert(chunksize(v) == rsize + nb); + if (RTCHECK(ok_next(v, r))) { + unlink_large_chunk(m, v); + if (rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(m, v, (rsize + nb)); + else { + set_size_and_pinuse_of_inuse_chunk(m, v, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + insert_chunk(m, r, rsize); + } + return chunk2mem(v); + } + } + CORRUPTION_ERROR_ACTION(m); + } + return 0; +} + +/* allocate a small request from the best fitting chunk in a treebin */ +static void* tmalloc_small(mstate m, size_t nb) { + tchunkptr t, v; + size_t rsize; + bindex_t i; + binmap_t leastbit = least_bit(m->treemap); + compute_bit2idx(leastbit, i); + v = t = *treebin_at(m, i); + rsize = chunksize(t) - nb; + + while ((t = leftmost_child(t)) != 0) { + size_t trem = chunksize(t) - nb; + if (trem < rsize) { + rsize = trem; + v = t; + } + } + + if (RTCHECK(ok_address(m, v))) { + mchunkptr r = chunk_plus_offset(v, nb); + assert(chunksize(v) == rsize + nb); + if (RTCHECK(ok_next(v, r))) { + unlink_large_chunk(m, v); + if (rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(m, v, (rsize + nb)); + else { + set_size_and_pinuse_of_inuse_chunk(m, v, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(m, r, rsize); + } + return chunk2mem(v); + } + } + + CORRUPTION_ERROR_ACTION(m); + return 0; +} + +/* --------------------------- realloc support --------------------------- */ + +static void* internal_realloc(mstate m, void* oldmem, size_t bytes) { + if (bytes >= MAX_REQUEST) { + MALLOC_FAILURE_ACTION; + return 0; + } + if (!PREACTION(m)) { + mchunkptr oldp = mem2chunk(oldmem); + size_t oldsize = chunksize(oldp); + mchunkptr next = chunk_plus_offset(oldp, oldsize); + mchunkptr newp = 0; + void* extra = 0; + + /* Try to either shrink or extend into top. 
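+       (Shrinking hands the trailing remainder to internal_free
+       afterwards; growing in place is only tried for mmapped chunks
+       or when the chunk borders top.)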
Else malloc-copy-free */ + + if (RTCHECK(ok_address(m, oldp) && ok_inuse(oldp) && + ok_next(oldp, next) && ok_pinuse(next))) { + size_t nb = request2size(bytes); + if (is_mmapped(oldp)) + newp = mmap_resize(m, oldp, nb); + else if (oldsize >= nb) { /* already big enough */ + size_t rsize = oldsize - nb; + newp = oldp; + if (rsize >= MIN_CHUNK_SIZE) { + mchunkptr remainder = chunk_plus_offset(newp, nb); + set_inuse(m, newp, nb); + set_inuse_and_pinuse(m, remainder, rsize); + extra = chunk2mem(remainder); + } + } + else if (next == m->top && oldsize + m->topsize > nb) { + /* Expand into top */ + size_t newsize = oldsize + m->topsize; + size_t newtopsize = newsize - nb; + mchunkptr newtop = chunk_plus_offset(oldp, nb); + set_inuse(m, oldp, nb); + newtop->head = newtopsize |PINUSE_BIT; + m->top = newtop; + m->topsize = newtopsize; + newp = oldp; + } + } + else { + USAGE_ERROR_ACTION(m, oldmem); + POSTACTION(m); + return 0; + } +#if DEBUG + if (newp != 0) { + check_inuse_chunk(m, newp); /* Check requires lock */ + } +#endif + + POSTACTION(m); + + if (newp != 0) { + if (extra != 0) { + internal_free(m, extra); + } + return chunk2mem(newp); + } + else { + void* newmem = internal_malloc(m, bytes); + if (newmem != 0) { + size_t oc = oldsize - overhead_for(oldp); + memcpy(newmem, oldmem, (oc < bytes)? oc : bytes); + internal_free(m, oldmem); + } + return newmem; + } + } + return 0; +} + +/* --------------------------- memalign support -------------------------- */ + +static void* internal_memalign(mstate m, size_t alignment, size_t bytes) { + if (alignment <= MALLOC_ALIGNMENT) /* Can just use malloc */ + return internal_malloc(m, bytes); + if (alignment < MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */ + alignment = MIN_CHUNK_SIZE; + if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */ + size_t a = MALLOC_ALIGNMENT << 1; + while (a < alignment) a <<= 1; + alignment = a; + } + + if (bytes >= MAX_REQUEST - alignment) { + if (m != 0) { /* Test isn't needed but avoids compiler warning */ + MALLOC_FAILURE_ACTION; + } + } + else { + size_t nb = request2size(bytes); + size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD; + char* mem = (char*)internal_malloc(m, req); + if (mem != 0) { + void* leader = 0; + void* trailer = 0; + mchunkptr p = mem2chunk(mem); + + if (PREACTION(m)) return 0; + if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */ + /* + Find an aligned spot inside chunk. Since we need to give + back leading space in a chunk of at least MIN_CHUNK_SIZE, if + the first calculation places us at a spot with less than + MIN_CHUNK_SIZE leader, we can move to the next aligned spot. + We've allocated enough total room so that this is always + possible. + */ + char* br = (char*)mem2chunk((size_t)(((size_t)(mem + + alignment - + SIZE_T_ONE)) & + -alignment)); + char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)? 
+ br : br+alignment; + mchunkptr newp = (mchunkptr)pos; + size_t leadsize = pos - (char*)(p); + size_t newsize = chunksize(p) - leadsize; + + if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */ + newp->prev_foot = p->prev_foot + leadsize; + newp->head = newsize; + } + else { /* Otherwise, give back leader, use the rest */ + set_inuse(m, newp, newsize); + set_inuse(m, p, leadsize); + leader = chunk2mem(p); + } + p = newp; + } + + /* Give back spare room at the end */ + if (!is_mmapped(p)) { + size_t size = chunksize(p); + if (size > nb + MIN_CHUNK_SIZE) { + size_t remainder_size = size - nb; + mchunkptr remainder = chunk_plus_offset(p, nb); + set_inuse(m, p, nb); + set_inuse(m, remainder, remainder_size); + trailer = chunk2mem(remainder); + } + } + + assert (chunksize(p) >= nb); + assert((((size_t)(chunk2mem(p))) % alignment) == 0); + check_inuse_chunk(m, p); + POSTACTION(m); + if (leader != 0) { + internal_free(m, leader); + } + if (trailer != 0) { + internal_free(m, trailer); + } + return chunk2mem(p); + } + } + return 0; +} + +/* ------------------------ comalloc/coalloc support --------------------- */ + +static void** ialloc(mstate m, + size_t n_elements, + size_t* sizes, + int opts, + void* chunks[]) { + /* + This provides common support for independent_X routines, handling + all of the combinations that can result. + + The opts arg has: + bit 0 set if all elements are same size (using sizes[0]) + bit 1 set if elements should be zeroed + */ + + size_t element_size; /* chunksize of each element, if all same */ + size_t contents_size; /* total size of elements */ + size_t array_size; /* request size of pointer array */ + void* mem; /* malloced aggregate space */ + mchunkptr p; /* corresponding chunk */ + size_t remainder_size; /* remaining bytes while splitting */ + void** marray; /* either "chunks" or malloced ptr array */ + mchunkptr array_chunk; /* chunk for malloced ptr array */ + flag_t was_enabled; /* to disable mmap */ + size_t size; + size_t i; + + ensure_initialization(); + /* compute array length, if needed */ + if (chunks != 0) { + if (n_elements == 0) + return chunks; /* nothing to do */ + marray = chunks; + array_size = 0; + } + else { + /* if empty req, must still return chunk representing empty array */ + if (n_elements == 0) + return (void**)internal_malloc(m, 0); + marray = 0; + array_size = request2size(n_elements * (sizeof(void*))); + } + + /* compute total element size */ + if (opts & 0x1) { /* all-same-size */ + element_size = request2size(*sizes); + contents_size = n_elements * element_size; + } + else { /* add up all the sizes */ + element_size = 0; + contents_size = 0; + for (i = 0; i != n_elements; ++i) + contents_size += request2size(sizes[i]); + } + + size = contents_size + array_size; + + /* + Allocate the aggregate chunk. First disable direct-mmapping so + malloc won't use it, since we would not be able to later + free/realloc space internal to a segregated mmap region. 
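+    mmap is re-enabled right afterwards if it was on before, and the
+    single chunk is then split in place into the element chunks, so
+    all elements end up in one contiguous, non-mmapped region.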
+ */ + was_enabled = use_mmap(m); + disable_mmap(m); + mem = internal_malloc(m, size - CHUNK_OVERHEAD); + if (was_enabled) + enable_mmap(m); + if (mem == 0) + return 0; + + if (PREACTION(m)) return 0; + p = mem2chunk(mem); + remainder_size = chunksize(p); + + assert(!is_mmapped(p)); + + if (opts & 0x2) { /* optionally clear the elements */ + memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size); + } + + /* If not provided, allocate the pointer array as final part of chunk */ + if (marray == 0) { + size_t array_chunk_size; + array_chunk = chunk_plus_offset(p, contents_size); + array_chunk_size = remainder_size - contents_size; + marray = (void**) (chunk2mem(array_chunk)); + set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size); + remainder_size = contents_size; + } + + /* split out elements */ + for (i = 0; ; ++i) { + marray[i] = chunk2mem(p); + if (i != n_elements-1) { + if (element_size != 0) + size = element_size; + else + size = request2size(sizes[i]); + remainder_size -= size; + set_size_and_pinuse_of_inuse_chunk(m, p, size); + p = chunk_plus_offset(p, size); + } + else { /* the final element absorbs any overallocation slop */ + set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size); + break; + } + } + +#if DEBUG + if (marray != chunks) { + /* final element must have exactly exhausted chunk */ + if (element_size != 0) { + assert(remainder_size == element_size); + } + else { + assert(remainder_size == request2size(sizes[i])); + } + check_inuse_chunk(m, mem2chunk(marray)); + } + for (i = 0; i != n_elements; ++i) + check_inuse_chunk(m, mem2chunk(marray[i])); + +#endif /* DEBUG */ + + POSTACTION(m); + return marray; +} + + +/* -------------------------- public routines ---------------------------- */ + +#if !ONLY_MSPACES + +void* dlmalloc(size_t bytes) { + /* + Basic algorithm: + If a small request (< 256 bytes minus per-chunk overhead): + 1. If one exists, use a remainderless chunk in associated smallbin. + (Remainderless means that there are too few excess bytes to + represent as a chunk.) + 2. If it is big enough, use the dv chunk, which is normally the + chunk adjacent to the one used for the most recent small request. + 3. If one exists, split the smallest available chunk in a bin, + saving remainder in dv. + 4. If it is big enough, use the top chunk. + 5. If available, get memory from system and use it + Otherwise, for a large request: + 1. Find the smallest available binned chunk that fits, and use it + if it is better fitting than dv chunk, splitting if necessary. + 2. If better fitting than any binned chunk, use the dv chunk. + 3. If it is big enough, use the top chunk. + 4. If request size >= mmap threshold, try to directly mmap this chunk. + 5. If available, get memory from system and use it + + The ugly goto's here ensure that postaction occurs along all paths. + */ + +#if USE_LOCKS + ensure_initialization(); /* initialize in sys_alloc if not using locks */ +#endif + + if (!PREACTION(gm)) { + void* mem; + size_t nb; + if (bytes <= MAX_SMALL_REQUEST) { + bindex_t idx; + binmap_t smallbits; + nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes); + idx = small_index(nb); + smallbits = gm->smallmap >> idx; + + if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. 
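+         Testing two smallmap bits at once covers both the exact bin
+         and the next one; a chunk from the next bin is only one
+         smallbin spacing larger, too little to split off, so the fit
+         is still remainderless.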
*/ + mchunkptr b, p; + idx += ~smallbits & 1; /* Uses next bin if idx empty */ + b = smallbin_at(gm, idx); + p = b->fd; + assert(chunksize(p) == small_index2size(idx)); + unlink_first_small_chunk(gm, b, p, idx); + set_inuse_and_pinuse(gm, p, small_index2size(idx)); + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (nb > gm->dvsize) { + if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ + mchunkptr b, p, r; + size_t rsize; + bindex_t i; + binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + b = smallbin_at(gm, i); + p = b->fd; + assert(chunksize(p) == small_index2size(i)); + unlink_first_small_chunk(gm, b, p, i); + rsize = small_index2size(i) - nb; + /* Fit here cannot be remainderless if 4byte sizes */ + if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(gm, p, small_index2size(i)); + else { + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + r = chunk_plus_offset(p, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(gm, r, rsize); + } + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) { + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + } + } + else if (bytes >= MAX_REQUEST) + nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */ + else { + nb = pad_request(bytes); + if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) { + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + } + + if (nb <= gm->dvsize) { + size_t rsize = gm->dvsize - nb; + mchunkptr p = gm->dv; + if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ + mchunkptr r = gm->dv = chunk_plus_offset(p, nb); + gm->dvsize = rsize; + set_size_and_pinuse_of_free_chunk(r, rsize); + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + } + else { /* exhaust dv */ + size_t dvs = gm->dvsize; + gm->dvsize = 0; + gm->dv = 0; + set_inuse_and_pinuse(gm, p, dvs); + } + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (nb < gm->topsize) { /* Split top */ + size_t rsize = gm->topsize -= nb; + mchunkptr p = gm->top; + mchunkptr r = gm->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + mem = chunk2mem(p); + check_top_chunk(gm, gm->top); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + mem = sys_alloc(gm, nb); + + postaction: + POSTACTION(gm); + return mem; + } + + return 0; +} + +void dlfree(void* mem) { + /* + Consolidate freed chunks with preceeding or succeeding bordering + free chunks, if they exist, and then place in a bin. Intermixed + with special cases for top, dv, mmapped chunks, and usage errors. 
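+     Backward merging uses prev_foot to reach the previous chunk;
+     forward merging absorbs the next chunk when it is not in use.
+     Merging into top may trigger sys_trim, and directly mmapped
+     chunks are munmapped outright.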
+ */ + + if (mem != 0) { + mchunkptr p = mem2chunk(mem); +#if FOOTERS + mstate fm = get_mstate_for(p); + if (!ok_magic(fm)) { + USAGE_ERROR_ACTION(fm, p); + return; + } +#else /* FOOTERS */ +#define fm gm +#endif /* FOOTERS */ + if (!PREACTION(fm)) { + check_inuse_chunk(fm, p); + if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) { + size_t psize = chunksize(p); + mchunkptr next = chunk_plus_offset(p, psize); + if (!pinuse(p)) { + size_t prevsize = p->prev_foot; + if (is_mmapped(p)) { + psize += prevsize + MMAP_FOOT_PAD; + if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) + fm->footprint -= psize; + goto postaction; + } + else { + mchunkptr prev = chunk_minus_offset(p, prevsize); + psize += prevsize; + p = prev; + if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */ + if (p != fm->dv) { + unlink_chunk(fm, p, prevsize); + } + else if ((next->head & INUSE_BITS) == INUSE_BITS) { + fm->dvsize = psize; + set_free_with_pinuse(p, psize, next); + goto postaction; + } + } + else + goto erroraction; + } + } + + if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) { + if (!cinuse(next)) { /* consolidate forward */ + if (next == fm->top) { + size_t tsize = fm->topsize += psize; + fm->top = p; + p->head = tsize | PINUSE_BIT; + if (p == fm->dv) { + fm->dv = 0; + fm->dvsize = 0; + } + if (should_trim(fm, tsize)) + sys_trim(fm, 0); + goto postaction; + } + else if (next == fm->dv) { + size_t dsize = fm->dvsize += psize; + fm->dv = p; + set_size_and_pinuse_of_free_chunk(p, dsize); + goto postaction; + } + else { + size_t nsize = chunksize(next); + psize += nsize; + unlink_chunk(fm, next, nsize); + set_size_and_pinuse_of_free_chunk(p, psize); + if (p == fm->dv) { + fm->dvsize = psize; + goto postaction; + } + } + } + else + set_free_with_pinuse(p, psize, next); + + if (is_small(psize)) { + insert_small_chunk(fm, p, psize); + check_free_chunk(fm, p); + } + else { + tchunkptr tp = (tchunkptr)p; + insert_large_chunk(fm, tp, psize); + check_free_chunk(fm, p); + if (--fm->release_checks == 0) + release_unused_segments(fm); + } + goto postaction; + } + } + erroraction: + USAGE_ERROR_ACTION(fm, p); + postaction: + POSTACTION(fm); + } + } +#if !FOOTERS +#undef fm +#endif /* FOOTERS */ +} + +void* dlcalloc(size_t n_elements, size_t elem_size) { + void* mem; + size_t req = 0; + if (n_elements != 0) { + req = n_elements * elem_size; + if (((n_elements | elem_size) & ~(size_t)0xffff) && + (req / n_elements != elem_size)) + req = MAX_SIZE_T; /* force downstream failure on overflow */ + } + mem = dlmalloc(req); + if (mem != 0 && calloc_must_clear(mem2chunk(mem))) + memset(mem, 0, req); + return mem; +} + +void* dlrealloc(void* oldmem, size_t bytes) { + if (oldmem == 0) + return dlmalloc(bytes); +#ifdef REALLOC_ZERO_BYTES_FREES + if (bytes == 0) { + dlfree(oldmem); + return 0; + } +#endif /* REALLOC_ZERO_BYTES_FREES */ + else { +#if ! 
FOOTERS + mstate m = gm; +#else /* FOOTERS */ + mstate m = get_mstate_for(mem2chunk(oldmem)); + if (!ok_magic(m)) { + USAGE_ERROR_ACTION(m, oldmem); + return 0; + } +#endif /* FOOTERS */ + return internal_realloc(m, oldmem, bytes); + } +} + +void* dlmemalign(size_t alignment, size_t bytes) { + return internal_memalign(gm, alignment, bytes); +} + +void** dlindependent_calloc(size_t n_elements, size_t elem_size, + void* chunks[]) { + size_t sz = elem_size; /* serves as 1-element array */ + return ialloc(gm, n_elements, &sz, 3, chunks); +} + +void** dlindependent_comalloc(size_t n_elements, size_t sizes[], + void* chunks[]) { + return ialloc(gm, n_elements, sizes, 0, chunks); +} + +void* dlvalloc(size_t bytes) { + size_t pagesz; + ensure_initialization(); + pagesz = mparams.page_size; + return dlmemalign(pagesz, bytes); +} + +void* dlpvalloc(size_t bytes) { + size_t pagesz; + ensure_initialization(); + pagesz = mparams.page_size; + return dlmemalign(pagesz, (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE)); +} + +int dlmalloc_trim(size_t pad) { + int result = 0; + ensure_initialization(); + if (!PREACTION(gm)) { + result = sys_trim(gm, pad); + POSTACTION(gm); + } + return result; +} + +size_t dlmalloc_footprint(void) { + return gm->footprint; +} + +size_t dlmalloc_max_footprint(void) { + return gm->max_footprint; +} + +#if !NO_MALLINFO +struct mallinfo dlmallinfo(void) { + return internal_mallinfo(gm); +} +#endif /* NO_MALLINFO */ + +void dlmalloc_stats() { + internal_malloc_stats(gm); +} + +int dlmallopt(int param_number, int value) { + return change_mparam(param_number, value); +} + +#endif /* !ONLY_MSPACES */ + +size_t dlmalloc_usable_size(void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + if (is_inuse(p)) + return chunksize(p) - overhead_for(p); + } + return 0; +} + +/* ----------------------------- user mspaces ---------------------------- */ + +#if MSPACES + +static mstate init_user_mstate(char* tbase, size_t tsize) { + size_t msize = pad_request(sizeof(struct malloc_state)); + mchunkptr mn; + mchunkptr msp = align_as_chunk(tbase); + mstate m = (mstate)(chunk2mem(msp)); + memset(m, 0, msize); + INITIAL_LOCK(&m->mutex); + msp->head = (msize|INUSE_BITS); + m->seg.base = m->least_addr = tbase; + m->seg.size = m->footprint = m->max_footprint = tsize; + m->magic = mparams.magic; + m->release_checks = MAX_RELEASE_CHECK_RATE; + m->mflags = mparams.default_mflags; + m->extp = 0; + m->exts = 0; + disable_contiguous(m); + init_bins(m); + mn = next_chunk(mem2chunk(m)); + init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE); + check_top_chunk(m, m->top); + return m; +} + +mspace create_mspace(size_t capacity, int locked) { + mstate m = 0; + size_t msize; + ensure_initialization(); + msize = pad_request(sizeof(struct malloc_state)); + if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { + size_t rs = ((capacity == 0)? 
mparams.granularity : + (capacity + TOP_FOOT_SIZE + msize)); + size_t tsize = granularity_align(rs); + char* tbase = (char*)(CALL_MMAP(tsize)); + if (tbase != CMFAIL) { + m = init_user_mstate(tbase, tsize); + m->seg.sflags = USE_MMAP_BIT; + set_lock(m, locked); + } + } + return (mspace)m; +} + +mspace create_mspace_with_base(void* base, size_t capacity, int locked) { + mstate m = 0; + size_t msize; + ensure_initialization(); + msize = pad_request(sizeof(struct malloc_state)); + if (capacity > msize + TOP_FOOT_SIZE && + capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { + m = init_user_mstate((char*)base, capacity); + m->seg.sflags = EXTERN_BIT; + set_lock(m, locked); + } + return (mspace)m; +} + +int mspace_track_large_chunks(mspace msp, int enable) { + int ret = 0; + mstate ms = (mstate)msp; + if (!PREACTION(ms)) { + if (!use_mmap(ms)) + ret = 1; + if (!enable) + enable_mmap(ms); + else + disable_mmap(ms); + POSTACTION(ms); + } + return ret; +} + +size_t destroy_mspace(mspace msp) { + size_t freed = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + msegmentptr sp = &ms->seg; + while (sp != 0) { + char* base = sp->base; + (void)base; //dsm: Kill unused warning when HAVE_MMAP == 0 (CALL_MUNMAP does not use base) + size_t size = sp->size; + flag_t flag = sp->sflags; + sp = sp->next; + if ((flag & USE_MMAP_BIT) && !(flag & EXTERN_BIT) && + CALL_MUNMAP(base, size) == 0) + freed += size; + } + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return freed; +} + +/* + mspace versions of routines are near-clones of the global + versions. This is not so nice but better than the alternatives. +*/ + + +void* mspace_malloc(mspace msp, size_t bytes) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + if (!PREACTION(ms)) { + void* mem; + size_t nb; + if (bytes <= MAX_SMALL_REQUEST) { + bindex_t idx; + binmap_t smallbits; + nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes); + idx = small_index(nb); + smallbits = ms->smallmap >> idx; + + if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */ + mchunkptr b, p; + idx += ~smallbits & 1; /* Uses next bin if idx empty */ + b = smallbin_at(ms, idx); + p = b->fd; + assert(chunksize(p) == small_index2size(idx)); + unlink_first_small_chunk(ms, b, p, idx); + set_inuse_and_pinuse(ms, p, small_index2size(idx)); + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (nb > ms->dvsize) { + if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ + mchunkptr b, p, r; + size_t rsize; + bindex_t i; + binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + b = smallbin_at(ms, i); + p = b->fd; + assert(chunksize(p) == small_index2size(i)); + unlink_first_small_chunk(ms, b, p, i); + rsize = small_index2size(i) - nb; + /* Fit here cannot be remainderless if 4byte sizes */ + if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(ms, p, small_index2size(i)); + else { + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + r = chunk_plus_offset(p, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(ms, r, rsize); + } + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) { + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + } + } + else if (bytes >= MAX_REQUEST) + nb = MAX_SIZE_T; /* Too big to allocate. 
Force failure (in sys alloc) */ + else { + nb = pad_request(bytes); + if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) { + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + } + + if (nb <= ms->dvsize) { + size_t rsize = ms->dvsize - nb; + mchunkptr p = ms->dv; + if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ + mchunkptr r = ms->dv = chunk_plus_offset(p, nb); + ms->dvsize = rsize; + set_size_and_pinuse_of_free_chunk(r, rsize); + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + } + else { /* exhaust dv */ + size_t dvs = ms->dvsize; + ms->dvsize = 0; + ms->dv = 0; + set_inuse_and_pinuse(ms, p, dvs); + } + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (nb < ms->topsize) { /* Split top */ + size_t rsize = ms->topsize -= nb; + mchunkptr p = ms->top; + mchunkptr r = ms->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + mem = chunk2mem(p); + check_top_chunk(ms, ms->top); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + mem = sys_alloc(ms, nb); + + postaction: + POSTACTION(ms); + return mem; + } + + return 0; +} + +void mspace_free(mspace msp, void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); +#if FOOTERS + mstate fm = get_mstate_for(p); + msp = msp; /* placate people compiling -Wunused */ +#else /* FOOTERS */ + mstate fm = (mstate)msp; +#endif /* FOOTERS */ + if (!ok_magic(fm)) { + USAGE_ERROR_ACTION(fm, p); + return; + } + if (!PREACTION(fm)) { + check_inuse_chunk(fm, p); + if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) { + size_t psize = chunksize(p); + mchunkptr next = chunk_plus_offset(p, psize); + if (!pinuse(p)) { + size_t prevsize = p->prev_foot; + if (is_mmapped(p)) { + psize += prevsize + MMAP_FOOT_PAD; + if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) + fm->footprint -= psize; + goto postaction; + } + else { + mchunkptr prev = chunk_minus_offset(p, prevsize); + psize += prevsize; + p = prev; + if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */ + if (p != fm->dv) { + unlink_chunk(fm, p, prevsize); + } + else if ((next->head & INUSE_BITS) == INUSE_BITS) { + fm->dvsize = psize; + set_free_with_pinuse(p, psize, next); + goto postaction; + } + } + else + goto erroraction; + } + } + + if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) { + if (!cinuse(next)) { /* consolidate forward */ + if (next == fm->top) { + size_t tsize = fm->topsize += psize; + fm->top = p; + p->head = tsize | PINUSE_BIT; + if (p == fm->dv) { + fm->dv = 0; + fm->dvsize = 0; + } + if (should_trim(fm, tsize)) + sys_trim(fm, 0); + goto postaction; + } + else if (next == fm->dv) { + size_t dsize = fm->dvsize += psize; + fm->dv = p; + set_size_and_pinuse_of_free_chunk(p, dsize); + goto postaction; + } + else { + size_t nsize = chunksize(next); + psize += nsize; + unlink_chunk(fm, next, nsize); + set_size_and_pinuse_of_free_chunk(p, psize); + if (p == fm->dv) { + fm->dvsize = psize; + goto postaction; + } + } + } + else + set_free_with_pinuse(p, psize, next); + + if (is_small(psize)) { + insert_small_chunk(fm, p, psize); + check_free_chunk(fm, p); + } + else { + tchunkptr tp = (tchunkptr)p; + insert_large_chunk(fm, tp, psize); + check_free_chunk(fm, p); + if (--fm->release_checks == 0) + release_unused_segments(fm); + } + goto postaction; + } + } + erroraction: + USAGE_ERROR_ACTION(fm, p); + postaction: + POSTACTION(fm); + } + } +} + +void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) { + void* mem; + size_t req = 0; + mstate ms = 
(mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + if (n_elements != 0) { + req = n_elements * elem_size; + if (((n_elements | elem_size) & ~(size_t)0xffff) && + (req / n_elements != elem_size)) + req = MAX_SIZE_T; /* force downstream failure on overflow */ + } + mem = internal_malloc(ms, req); + if (mem != 0 && calloc_must_clear(mem2chunk(mem))) + memset(mem, 0, req); + return mem; +} + +void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) { + if (oldmem == 0) + return mspace_malloc(msp, bytes); +#ifdef REALLOC_ZERO_BYTES_FREES + if (bytes == 0) { + mspace_free(msp, oldmem); + return 0; + } +#endif /* REALLOC_ZERO_BYTES_FREES */ + else { +#if FOOTERS + mchunkptr p = mem2chunk(oldmem); + mstate ms = get_mstate_for(p); +#else /* FOOTERS */ + mstate ms = (mstate)msp; +#endif /* FOOTERS */ + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return internal_realloc(ms, oldmem, bytes); + } +} + +void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return internal_memalign(ms, alignment, bytes); +} + +void** mspace_independent_calloc(mspace msp, size_t n_elements, + size_t elem_size, void* chunks[]) { + size_t sz = elem_size; /* serves as 1-element array */ + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return ialloc(ms, n_elements, &sz, 3, chunks); +} + +void** mspace_independent_comalloc(mspace msp, size_t n_elements, + size_t sizes[], void* chunks[]) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return ialloc(ms, n_elements, sizes, 0, chunks); +} + +int mspace_trim(mspace msp, size_t pad) { + int result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + if (!PREACTION(ms)) { + result = sys_trim(ms, pad); + POSTACTION(ms); + } + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + +void mspace_malloc_stats(mspace msp) { + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + internal_malloc_stats(ms); + } + else { + USAGE_ERROR_ACTION(ms,ms); + } +} + +size_t mspace_footprint(mspace msp) { + size_t result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + result = ms->footprint; + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + + +size_t mspace_max_footprint(mspace msp) { + size_t result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + result = ms->max_footprint; + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + + +#if !NO_MALLINFO +struct mallinfo mspace_mallinfo(mspace msp) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + } + return internal_mallinfo(ms); +} +#endif /* NO_MALLINFO */ + +size_t mspace_usable_size(void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + if (is_inuse(p)) + return chunksize(p) - overhead_for(p); + } + return 0; +} + +int mspace_mallopt(int param_number, int value) { + return change_mparam(param_number, value); +} + +#endif /* MSPACES */ + + +/* -------------------- Alternative MORECORE functions ------------------- */ + +/* + Guidelines for creating a custom version of MORECORE: + + * For best performance, MORECORE should allocate in multiples of pagesize. + * MORECORE may allocate more memory than requested. (Or even less, + but this will usually result in a malloc failure.) 
+ * MORECORE must not allocate memory when given argument zero, but + instead return one past the end address of memory from previous + nonzero call. + * For best performance, consecutive calls to MORECORE with positive + arguments should return increasing addresses, indicating that + space has been contiguously extended. + * Even though consecutive calls to MORECORE need not return contiguous + addresses, it must be OK for malloc'ed chunks to span multiple + regions in those cases where they do happen to be contiguous. + * MORECORE need not handle negative arguments -- it may instead + just return MFAIL when given negative arguments. + Negative arguments are always multiples of pagesize. MORECORE + must not misinterpret negative args as large positive unsigned + args. You can suppress all such calls from even occurring by defining + MORECORE_CANNOT_TRIM, + + As an example alternative MORECORE, here is a custom allocator + kindly contributed for pre-OSX macOS. It uses virtually but not + necessarily physically contiguous non-paged memory (locked in, + present and won't get swapped out). You can use it by uncommenting + this section, adding some #includes, and setting up the appropriate + defines above: + + #define MORECORE osMoreCore + + There is also a shutdown routine that should somehow be called for + cleanup upon program exit. + + #define MAX_POOL_ENTRIES 100 + #define MINIMUM_MORECORE_SIZE (64 * 1024U) + static int next_os_pool; + void *our_os_pools[MAX_POOL_ENTRIES]; + + void *osMoreCore(int size) + { + void *ptr = 0; + static void *sbrk_top = 0; + + if (size > 0) + { + if (size < MINIMUM_MORECORE_SIZE) + size = MINIMUM_MORECORE_SIZE; + if (CurrentExecutionLevel() == kTaskLevel) + ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0); + if (ptr == 0) + { + return (void *) MFAIL; + } + // save ptrs so they can be freed during cleanup + our_os_pools[next_os_pool] = ptr; + next_os_pool++; + ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK); + sbrk_top = (char *) ptr + size; + return ptr; + } + else if (size < 0) + { + // we don't currently support shrink behavior + return (void *) MFAIL; + } + else + { + return sbrk_top; + } + } + + // cleanup any allocated memory pools + // called as last thing before shutting down driver + + void osCleanupMem(void) + { + void **ptr; + + for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++) + if (*ptr) + { + PoolDeallocate(*ptr); + *ptr = 0; + } + } + +*/ + + +/* ----------------------------------------------------------------------- +History: + V2.8.4 Wed May 27 09:56:23 2009 Doug Lea (dl at gee) + * Use zeros instead of prev foot for is_mmapped + * Add mspace_track_large_chunks; thanks to Jean Brouwers + * Fix set_inuse in internal_realloc; thanks to Jean Brouwers + * Fix insufficient sys_alloc padding when using 16byte alignment + * Fix bad error check in mspace_footprint + * Adaptations for ptmalloc; thanks to Wolfram Gloger. + * Reentrant spin locks; thanks to Earl Chew and others + * Win32 improvements; thanks to Niall Douglas and Earl Chew + * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options + * Extension hook in malloc_state + * Various small adjustments to reduce warnings on some compilers + * Various configuration extensions/changes for more platforms. Thanks + to all who contributed these. 
+ + V2.8.3 Thu Sep 22 11:16:32 2005 Doug Lea (dl at gee) + * Add max_footprint functions + * Ensure all appropriate literals are size_t + * Fix conditional compilation problem for some #define settings + * Avoid concatenating segments with the one provided + in create_mspace_with_base + * Rename some variables to avoid compiler shadowing warnings + * Use explicit lock initialization. + * Better handling of sbrk interference. + * Simplify and fix segment insertion, trimming and mspace_destroy + * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x + * Thanks especially to Dennis Flanagan for help on these. + + V2.8.2 Sun Jun 12 16:01:10 2005 Doug Lea (dl at gee) + * Fix memalign brace error. + + V2.8.1 Wed Jun 8 16:11:46 2005 Doug Lea (dl at gee) + * Fix improper #endif nesting in C++ + * Add explicit casts needed for C++ + + V2.8.0 Mon May 30 14:09:02 2005 Doug Lea (dl at gee) + * Use trees for large bins + * Support mspaces + * Use segments to unify sbrk-based and mmap-based system allocation, + removing need for emulation on most platforms without sbrk. + * Default safety checks + * Optional footer checks. Thanks to William Robertson for the idea. + * Internal code refactoring + * Incorporate suggestions and platform-specific changes. + Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas, + Aaron Bachmann, Emery Berger, and others. + * Speed up non-fastbin processing enough to remove fastbins. + * Remove useless cfree() to avoid conflicts with other apps. + * Remove internal memcpy, memset. Compilers handle builtins better. + * Remove some options that no one ever used and rename others. + + V2.7.2 Sat Aug 17 09:07:30 2002 Doug Lea (dl at gee) + * Fix malloc_state bitmap array misdeclaration + + V2.7.1 Thu Jul 25 10:58:03 2002 Doug Lea (dl at gee) + * Allow tuning of FIRST_SORTED_BIN_SIZE + * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte. + * Better detection and support for non-contiguousness of MORECORE. + Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger + * Bypass most of malloc if no frees. Thanks To Emery Berger. + * Fix freeing of old top non-contiguous chunk im sysmalloc. + * Raised default trim and map thresholds to 256K. + * Fix mmap-related #defines. Thanks to Lubos Lunak. + * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield. + * Branch-free bin calculation + * Default trim and mmap thresholds now 256K. + + V2.7.0 Sun Mar 11 14:14:06 2001 Doug Lea (dl at gee) + * Introduce independent_comalloc and independent_calloc. + Thanks to Michael Pachos for motivation and help. + * Make optional .h file available + * Allow > 2GB requests on 32bit systems. + * new WIN32 sbrk, mmap, munmap, lock code from . + Thanks also to Andreas Mueller , + and Anonymous. + * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for + helping test this.) + * memalign: check alignment arg + * realloc: don't try to shift chunks backwards, since this + leads to more fragmentation in some programs and doesn't + seem to help in any others. + * Collect all cases in malloc requiring system memory into sysmalloc + * Use mmap as backup to sbrk + * Place all internal state in malloc_state + * Introduce fastbins (although similar to 2.5.1) + * Many minor tunings and cosmetic improvements + * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK + * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS + Thanks to Tony E. Bennett and others. + * Include errno.h to support default failure action. 
+ + V2.6.6 Sun Dec 5 07:42:19 1999 Doug Lea (dl at gee) + * return null for negative arguments + * Added Several WIN32 cleanups from Martin C. Fong + * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h' + (e.g. WIN32 platforms) + * Cleanup header file inclusion for WIN32 platforms + * Cleanup code to avoid Microsoft Visual C++ compiler complaints + * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing + memory allocation routines + * Set 'malloc_getpagesize' for WIN32 platforms (needs more work) + * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to + usage of 'assert' in non-WIN32 code + * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to + avoid infinite loop + * Always call 'fREe()' rather than 'free()' + + V2.6.5 Wed Jun 17 15:57:31 1998 Doug Lea (dl at gee) + * Fixed ordering problem with boundary-stamping + + V2.6.3 Sun May 19 08:17:58 1996 Doug Lea (dl at gee) + * Added pvalloc, as recommended by H.J. Liu + * Added 64bit pointer support mainly from Wolfram Gloger + * Added anonymously donated WIN32 sbrk emulation + * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen + * malloc_extend_top: fix mask error that caused wastage after + foreign sbrks + * Add linux mremap support code from HJ Liu + + V2.6.2 Tue Dec 5 06:52:55 1995 Doug Lea (dl at gee) + * Integrated most documentation with the code. + * Add support for mmap, with help from + Wolfram Gloger (Gloger@lrz.uni-muenchen.de). + * Use last_remainder in more cases. + * Pack bins using idea from colin@nyx10.cs.du.edu + * Use ordered bins instead of best-fit threshhold + * Eliminate block-local decls to simplify tracing and debugging. + * Support another case of realloc via move into top + * Fix error occuring when initial sbrk_base not word-aligned. + * Rely on page size for units instead of SBRK_UNIT to + avoid surprises about sbrk alignment conventions. + * Add mallinfo, mallopt. Thanks to Raymond Nijssen + (raymond@es.ele.tue.nl) for the suggestion. + * Add `pad' argument to malloc_trim and top_pad mallopt parameter. + * More precautions for cases where other routines call sbrk, + courtesy of Wolfram Gloger (Gloger@lrz.uni-muenchen.de). + * Added macros etc., allowing use in linux libc from + H.J. Lu (hjl@gnu.ai.mit.edu) + * Inverted this history list + + V2.6.1 Sat Dec 2 14:10:57 1995 Doug Lea (dl at gee) + * Re-tuned and fixed to behave more nicely with V2.6.0 changes. + * Removed all preallocation code since under current scheme + the work required to undo bad preallocations exceeds + the work saved in good cases for most test programs. + * No longer use return list or unconsolidated bins since + no scheme using them consistently outperforms those that don't + given above changes. + * Use best fit for very large chunks to prevent some worst-cases. + * Added some support for debugging + + V2.6.0 Sat Nov 4 07:05:23 1995 Doug Lea (dl at gee) + * Removed footers when chunks are in use. Thanks to + Paul Wilson (wilson@cs.texas.edu) for the suggestion. + + V2.5.4 Wed Nov 1 07:54:51 1995 Doug Lea (dl at gee) + * Added malloc_trim, with help from Wolfram Gloger + (wmglo@Dent.MED.Uni-Muenchen.DE). 
+ + V2.5.3 Tue Apr 26 10:16:01 1994 Doug Lea (dl at g) + + V2.5.2 Tue Apr 5 16:20:40 1994 Doug Lea (dl at g) + * realloc: try to expand in both directions + * malloc: swap order of clean-bin strategy; + * realloc: only conditionally expand backwards + * Try not to scavenge used bins + * Use bin counts as a guide to preallocation + * Occasionally bin return list chunks in first scan + * Add a few optimizations from colin@nyx10.cs.du.edu + + V2.5.1 Sat Aug 14 15:40:43 1993 Doug Lea (dl at g) + * faster bin computation & slightly different binning + * merged all consolidations to one part of malloc proper + (eliminating old malloc_find_space & malloc_clean_bin) + * Scan 2 returns chunks (not just 1) + * Propagate failure in realloc if malloc returns 0 + * Add stuff to allow compilation on non-ANSI compilers + from kpv@research.att.com + + V2.5 Sat Aug 7 07:41:59 1993 Doug Lea (dl at g.oswego.edu) + * removed potential for odd address access in prev_chunk + * removed dependency on getpagesize.h + * misc cosmetics and a bit more internal documentation + * anticosmetics: mangled names in macros to evade debugger strangeness + * tested on sparc, hp-700, dec-mips, rs6000 + with gcc & native cc (hp, dec only) allowing + Detlefs & Zorn comparison study (in SIGPLAN Notices.) + + Trial version Fri Aug 28 13:14:29 1992 Doug Lea (dl at g.oswego.edu) + * Based loosely on libg++-1.2X malloc. (It retains some of the overall + structure of old version, but most details differ.) + +*/ + diff --git a/src/g_std/README.txt b/src/g_std/README.txt new file mode 100644 index 00000000..e0576735 --- /dev/null +++ b/src/g_std/README.txt @@ -0,0 +1,5 @@ +This folder simply has template spacializations of STL classes that use the +global heap allocator. Feel free to define new specializations as needed. + +See g_vector.h for a brief discussion/pointers on how to do this. This should +be more elegant when/if template typedefs get in C++2011. diff --git a/src/g_std/g_list.h b/src/g_std/g_list.h new file mode 100644 index 00000000..66f23a2b --- /dev/null +++ b/src/g_std/g_list.h @@ -0,0 +1,44 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef G_LIST_H_ +#define G_LIST_H_ + +#include +#include "g_std/stl_galloc.h" + +// Supposedly, this will work in C++2011 +//template typedef std::vector > g_vector; + +// Until GCC is compliant with this, just inherit: +template class g_list : public std::list > {}; + +/* Some pointers on template typedefs: + * http://www.gotw.ca/gotw/079.htm + * http://drdobbs.com/cpp/184403850 + * http://gcc.gnu.org/ml/gcc-help/2007-04/msg00338.html + */ + +#endif // G_LIST_H_ diff --git a/src/g_std/g_multimap.h b/src/g_std/g_multimap.h new file mode 100644 index 00000000..b0273f19 --- /dev/null +++ b/src/g_std/g_multimap.h @@ -0,0 +1,36 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef G_MULTIMAP_H_ +#define G_MULTIMAP_H_ + +#include +#include +#include "g_std/stl_galloc.h" + +template class g_map : public std::map, StlGlobAlloc > > {}; +template class g_multimap : public std::multimap, StlGlobAlloc > > {}; + +#endif // G_MULTIMAP_H_ diff --git a/src/g_std/g_string.h b/src/g_std/g_string.h new file mode 100644 index 00000000..a097752b --- /dev/null +++ b/src/g_std/g_string.h @@ -0,0 +1,39 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef G_STRING_H_ +#define G_STRING_H_ +// What? I'm just using consistent naming rules :) +// Just be thankful that std::spot does not exist + +#include +#include "g_std/stl_galloc.h" + +/* basic_string is actually an STL-mandated class, not an internal GCC thing. 
+ * This should work with other compilers + */ +typedef std::basic_string, StlGlobAlloc > g_string; + +#endif // G_STRING_H_ diff --git a/src/g_std/g_unordered_map.h b/src/g_std/g_unordered_map.h new file mode 100644 index 00000000..833d3993 --- /dev/null +++ b/src/g_std/g_unordered_map.h @@ -0,0 +1,36 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef G_UNORDERED_MAP_H_ +#define G_UNORDERED_MAP_H_ + +#include +#include +#include "g_std/stl_galloc.h" + +//template class g_unordered_map : public std::unordered_map > > {}; //this seems to work for TR1, not for final +template class g_unordered_map : public std::unordered_map, std::equal_to, StlGlobAlloc > > {}; + +#endif // G_UNORDERED_MAP_H_ diff --git a/src/g_std/g_unordered_set.h b/src/g_std/g_unordered_set.h new file mode 100644 index 00000000..186e9be2 --- /dev/null +++ b/src/g_std/g_unordered_set.h @@ -0,0 +1,35 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
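All of the g_std headers above follow the same recipe described in src/g_std/README.txt: inherit from the STL container, passing StlGlobAlloc as the allocator argument, since alias templates were not yet usable. As a hypothetical illustration of adding a new specialization (g_deque is not part of this patch; the name and file are made up), the same pattern applied to std::deque would look roughly like this:

```cpp
// Hypothetical g_deque.h, following the g_list/g_vector pattern -- illustrative only.
#ifndef G_DEQUE_H_
#define G_DEQUE_H_

#include <deque>
#include "g_std/stl_galloc.h"

// With C++11 alias templates this could simply be:
//   template <typename T> using g_deque = std::deque<T, StlGlobAlloc<T>>;
// Until then, inherit like the other g_std headers do:
template <typename T> class g_deque : public std::deque<T, StlGlobAlloc<T> > {};

#endif  // G_DEQUE_H_
```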
+ */ + +#ifndef G_UNORDERED_SET_H_ +#define G_UNORDERED_SET_H_ + +#include +#include +#include "g_std/stl_galloc.h" + +template class g_unordered_set : public std::unordered_set, std::equal_to, StlGlobAlloc > {}; + +#endif // G_UNORDERED_SET_H_ diff --git a/src/g_std/g_vector.h b/src/g_std/g_vector.h new file mode 100644 index 00000000..903e5839 --- /dev/null +++ b/src/g_std/g_vector.h @@ -0,0 +1,57 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef G_VECTOR_H_ +#define G_VECTOR_H_ + +#include +#include "g_std/stl_galloc.h" + +// Supposedly, this will work in C++2011 +//template typedef std::vector > g_vector; + +// Until GCC is compliant with this, just inherit: +template class g_vector : public std::vector >, public GlobAlloc { + public: + g_vector() = default; + + g_vector(const std::vector& v) { + this->resize(v.size()); + for (size_t i = 0; i < v.size(); i++) { + (*this)[i] = v[i]; + } + } + + g_vector(std::initializer_list list) : std::vector>(list) {} + g_vector(size_t n, const T& t = T()) : std::vector>(n, t) {} +}; + +/* Some pointers on template typedefs: + * http://www.gotw.ca/gotw/079.htm + * http://drdobbs.com/cpp/184403850 + * http://gcc.gnu.org/ml/gcc-help/2007-04/msg00338.html + */ + +#endif // G_VECTOR_H_ diff --git a/src/g_std/stl_galloc.h b/src/g_std/stl_galloc.h new file mode 100644 index 00000000..b49ae764 --- /dev/null +++ b/src/g_std/stl_galloc.h @@ -0,0 +1,98 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef STL_GALLOC_H_ +#define STL_GALLOC_H_ + +#include +#include "galloc.h" + +/* Follows interface of STL allocator, allocates and frees from the global heap */ + +template +class StlGlobAlloc { + public: + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; + + StlGlobAlloc() {} + StlGlobAlloc(const StlGlobAlloc&) {} + + pointer allocate(size_type n, const void * = 0) { + T* t = gm_calloc(n); + return t; + } + + void deallocate(void* p, size_type) { + if (p) gm_free(p); + } + + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + StlGlobAlloc& operator=(const StlGlobAlloc&) { return *this; } + + + // Construct/destroy + // gcc keeps changing these interfaces. See /usr/include/c++/4.8/ext/new_allocator.h +#if __cplusplus >= 201103L // >= 4.8 + template + void construct(_Up* __p, _Args&&... __args) { ::new((void *)__p) _Up(std::forward<_Args>(__args)...); } + + template void destroy(_Up* __p) { __p->~_Up(); } +#else // < 4.8 + void construct(pointer p, const T& val) { new (static_cast(p)) T(val); } + void construct(pointer p) { construct(p, value_type()); } //required by gcc 4.6 + void destroy(pointer p) { p->~T(); } +#endif + + size_type max_size() const { return size_t(-1); } + + template struct rebind { typedef StlGlobAlloc other; }; + + template StlGlobAlloc(const StlGlobAlloc&) {} + + template StlGlobAlloc& operator=(const StlGlobAlloc&) { return *this; } + + + /* dsm: The == (and !=) operator in an allocator must be defined and, + * per http://download.oracle.com/docs/cd/E19422-01/819-3703/15_3.htm : + * + * Returns true if allocators b and a can be safely interchanged. Safely + * interchanged means that b could be used to deallocate storage obtained + * through a, and vice versa. + * + * We can ALWAYS do this, as deallocate just calls gm_free() + */ + template bool operator==(const StlGlobAlloc&) const { return true; } + + template bool operator!=(const StlGlobAlloc&) const { return false; } +}; + +#endif // STL_GALLOC_H_ diff --git a/src/galloc.cpp b/src/galloc.cpp new file mode 100644 index 00000000..8bada099 --- /dev/null +++ b/src/galloc.cpp @@ -0,0 +1,213 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
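Since StlGlobAlloc implements the standard allocator interface, it can also be plugged into STL containers directly rather than through the g_std wrappers. A minimal sketch (the container and element types are arbitrary, and it assumes the global heap has already been set up with gm_init()/gm_attach()):

```cpp
#include <map>
#include <vector>
#include "g_std/stl_galloc.h"

// A map whose nodes live on the global (cross-process) heap; this is essentially
// what the g_map wrapper in g_multimap.h expands to.
typedef std::map<int, int, std::less<int>,
                 StlGlobAlloc<std::pair<const int, int> > > shared_map;

void stl_galloc_example() {
    shared_map m;        // node allocations go through gm_calloc()/gm_free()
    m[1] = 42;

    std::vector<double, StlGlobAlloc<double> > v;  // same idea for any container
    v.push_back(3.14);
}
```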
+ */ + +#include "galloc.h" +#include +#include +#include +#include +#include +#include + +#include "log.h" // NOLINT must precede dlmalloc, which defines assert if undefined +#include "g_heap/dlmalloc.h.c" +#include "locks.h" +#include "pad.h" + +/* Base heap address. Has to be available cross-process. With 64-bit virtual + * addresses, the address space is so sparse that it's quite easy to find + * some random base that always works in practice. If for some weird reason + * you want to compile this on a 32-bit address space, there are fancier, + * more structured ways to get a common range (e.g. launch all the processes + * before allocating the global heap segment, and find a common range either + * by brute-force scanning and communicating through pipes, or by parsing + * /proc/{pid}/maps). + * + * But, since I'm using a 64-bit address space, I don't really care to make + * it fancy. + */ +#define GM_BASE_ADDR ((const void*)0x00ABBA000000) + +struct gm_segment { + volatile void* base_regp; //common data structure, accessible with glob_ptr; threads poll on gm_isready to determine when everything has been initialized + volatile void* secondary_regp; //secondary data structure, used to exchange information between harness and initializing process + mspace mspace_ptr; + + PAD(); + lock_t lock; + PAD(); +}; + +static gm_segment* GM = NULL; +static int gm_shmid = 0; + +/* Heap segment size, in bytes. Can't grow for now, so choose something sensible, and within the machine's limits (see sysctl vars kernel.shmmax and kernel.shmall) */ +int gm_init(size_t segmentSize) { + /* Create a SysV IPC shared memory segment, attach to it, and mark the segment to + * auto-destroy when the number of attached processes becomes 0. + * + * IMPORTANT: There is a small window of vulnerability between shmget and shmctl that + * can lead to major issues: between these calls, we have a segment of persistent + * memory that will survive the program if it dies (e.g. someone just happens to send us + * a SIGKILL) + */ + + assert(GM == NULL); + assert(gm_shmid == 0); + gm_shmid = shmget(IPC_PRIVATE, segmentSize, 0644 | IPC_CREAT /*| SHM_HUGETLB*/); + if (gm_shmid == -1) { + perror("gm_create failed shmget"); + exit(1); + } + GM = static_cast(shmat(gm_shmid, GM_BASE_ADDR, 0)); + if (GM != GM_BASE_ADDR) { + perror("gm_create failed shmat"); + warn("shmat failed, shmid %d. Trying not to leave garbage behind before dying...", gm_shmid); + int ret = shmctl(gm_shmid, IPC_RMID, NULL); + if (ret) { + perror("shmctl failed, we're leaving garbage behind!"); + panic("Check /proc/sysvipc/shm and manually delete segment with shmid %d", gm_shmid); + } else { + panic("shmctl succeeded, we're dying in peace"); + } + } + + //Mark the segment to auto-destroy when the number of attached processes becomes 0. 
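+    // On Linux, IPC_RMID on a segment that still has attachments only schedules it for
+    // destruction: the kernel frees it once the last process detaches, and processes can
+    // still shmat() to it by id until then (which is what gm_attach() relies on).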
+ int ret = shmctl(gm_shmid, IPC_RMID, NULL); + assert(!ret); + + char* alloc_start = reinterpret_cast(GM) + 1024; + size_t alloc_size = segmentSize - 1 - 1024; + GM->base_regp = NULL; + + GM->mspace_ptr = create_mspace_with_base(alloc_start, alloc_size, 1 /*locked*/); + futex_init(&GM->lock); + assert(GM->mspace_ptr); + + return gm_shmid; +} + +void gm_attach(int shmid) { + assert(GM == NULL); + assert(gm_shmid == 0); + gm_shmid = shmid; + GM = static_cast(shmat(gm_shmid, GM_BASE_ADDR, 0)); + if (GM != GM_BASE_ADDR) { + warn("shmid %d \n", shmid); + panic("gm_attach failed allocation"); + } +} + + +void* gm_malloc(size_t size) { + assert(GM); + assert(GM->mspace_ptr); + futex_lock(&GM->lock); + void* ptr = mspace_malloc(GM->mspace_ptr, size); + futex_unlock(&GM->lock); + if (!ptr) panic("gm_malloc(): Out of global heap memory, use a larger GM segment"); + return ptr; +} + +void* __gm_calloc(size_t num, size_t size) { + assert(GM); + assert(GM->mspace_ptr); + futex_lock(&GM->lock); + void* ptr = mspace_calloc(GM->mspace_ptr, num, size); + futex_unlock(&GM->lock); + if (!ptr) panic("gm_calloc(): Out of global heap memory, use a larger GM segment"); + return ptr; +} + +void* __gm_memalign(size_t blocksize, size_t bytes) { + assert(GM); + assert(GM->mspace_ptr); + futex_lock(&GM->lock); + void* ptr = mspace_memalign(GM->mspace_ptr, blocksize, bytes); + futex_unlock(&GM->lock); + if (!ptr) panic("gm_memalign(): Out of global heap memory, use a larger GM segment"); + return ptr; +} + + +void gm_free(void* ptr) { + assert(GM); + assert(GM->mspace_ptr); + futex_lock(&GM->lock); + mspace_free(GM->mspace_ptr, ptr); + futex_unlock(&GM->lock); +} + + +char* gm_strdup(const char* str) { + size_t l = strlen(str); + char* res = static_cast(gm_malloc(l+1)); + memcpy(res, str, l+1); + return res; +} + + +void gm_set_glob_ptr(void* ptr) { + assert(GM); + assert(GM->base_regp == NULL); + GM->base_regp = ptr; +} + +void* gm_get_glob_ptr() { + assert(GM); + assert(GM->base_regp); + return const_cast(GM->base_regp); // devolatilize +} + +void gm_set_secondary_ptr(void* ptr) { + assert(GM); + assert(GM->secondary_regp == NULL); + GM->secondary_regp = ptr; +} + +void* gm_get_secondary_ptr() { + assert(GM); + assert(GM->secondary_regp != NULL); + return const_cast(GM->secondary_regp); // devolatilize +} + +void gm_stats() { + assert(GM); + mspace_malloc_stats(GM->mspace_ptr); +} + +bool gm_isready() { + assert(GM); + return (GM->base_regp != NULL); +} + +void gm_detach() { + assert(GM); + shmdt(GM); + GM = NULL; + gm_shmid = 0; +} + + diff --git a/src/galloc.h b/src/galloc.h new file mode 100644 index 00000000..446ebfb7 --- /dev/null +++ b/src/galloc.h @@ -0,0 +1,89 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
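To make the intended cross-process flow of the gm_* implementation above concrete, here is a hedged sketch of how a creating process and an attaching process would use it. The segment size, the way the shmid reaches the second process, and the function names harness_side/simulated_side are illustrative, not zsim's actual plumbing:

```cpp
#include <cstdio>
#include "galloc.h"

// Process A (e.g., the harness): creates the segment and publishes a root object.
void harness_side() {
    int shmid = gm_init(1ul << 30);          // 1 GB global heap (size is arbitrary)
    int* counters = gm_calloc<int>(64);      // allocated inside the shared segment
    gm_set_glob_ptr(counters);               // after this, gm_isready() returns true
    printf("pass shmid=%d to the other processes\n", shmid);
}

// Process B (e.g., an instrumented process): attaches and uses the same objects.
void simulated_side(int shmid) {
    gm_attach(shmid);                        // maps the segment at GM_BASE_ADDR
    while (!gm_isready()) { /* spin until the harness publishes the root pointer */ }
    int* counters = static_cast<int*>(gm_get_glob_ptr());
    counters[0]++;                           // raw pointers are valid in every process
}
```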
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef GALLOC_H_ +#define GALLOC_H_ + +#include +#include + +int gm_init(size_t segmentSize); + +void gm_attach(int shmid); + +// C-style interface +void* gm_malloc(size_t size); +void* __gm_calloc(size_t num, size_t size); //deprecated, only used internally +void* __gm_memalign(size_t blocksize, size_t bytes); // deprecated, only used internally +char* gm_strdup(const char* str); +void gm_free(void* ptr); + +// C++-style alloc interface (preferred) +template T* gm_malloc() {return static_cast(gm_malloc(sizeof(T)));} +template T* gm_malloc(size_t objs) {return static_cast(gm_malloc(sizeof(T)*objs));} +template T* gm_calloc() {return static_cast(__gm_calloc(1, sizeof(T)));} +template T* gm_calloc(size_t objs) {return static_cast(__gm_calloc(objs, sizeof(T)));} +template T* gm_memalign(size_t blocksize) {return static_cast(__gm_memalign(blocksize, sizeof(T)));} +template T* gm_memalign(size_t blocksize, size_t objs) {return static_cast(__gm_memalign(blocksize, sizeof(T)*objs));} +template T* gm_dup(T* src, size_t objs) { + T* dst = gm_malloc(objs); + memcpy(dst, src, sizeof(T)*objs); + return dst; +} + +void gm_set_glob_ptr(void* ptr); +void* gm_get_glob_ptr(); + +void gm_set_secondary_ptr(void* ptr); +void* gm_get_secondary_ptr(); + +void gm_stats(); + +bool gm_isready(); +void gm_detach(); + + +class GlobAlloc { + public: + virtual ~GlobAlloc() {} + + inline void* operator new (size_t sz) { + return gm_malloc(sz); + } + + //Placement new + inline void* operator new (size_t sz, void* ptr) { + return ptr; + } + + inline void operator delete(void *p, size_t sz) { + gm_free(p); + } + + //Placement delete... make ICC happy. This would only fire on an exception + void operator delete (void* p, void* ptr) {} +}; + +#endif // GALLOC_H_ diff --git a/src/hash.cpp b/src/hash.cpp new file mode 100644 index 00000000..ede2141c --- /dev/null +++ b/src/hash.cpp @@ -0,0 +1,173 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
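The GlobAlloc base class above is what lets simulator objects be created with plain new/delete while still living on the shared heap: its class-level operator new and operator delete route through gm_malloc() and gm_free(). A small sketch of the idiom (the class is made up for illustration, and it again assumes the global heap is initialized):

```cpp
#include <stdint.h>
#include "galloc.h"

// Objects that must be visible to all zsim processes inherit GlobAlloc.
class LatencyCounter : public GlobAlloc {      // hypothetical class
  public:
    uint64_t hits, misses;
    LatencyCounter() : hits(0), misses(0) {}
};

void globalloc_example() {
    LatencyCounter* c = new LatencyCounter();  // GlobAlloc::operator new -> gm_malloc()
    c->hits++;
    delete c;                                  // GlobAlloc::operator delete -> gm_free()

    // Plain arrays can use the typed helpers from galloc.h directly:
    uint64_t* arr = gm_calloc<uint64_t>(128);  // zero-initialized, on the shared heap
    gm_free(arr);
}
```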
+ */ + +#include "hash.h" +#include +#include +#include "log.h" +#include "mtrand.h" + +H3HashFamily::H3HashFamily(uint32_t numFunctions, uint32_t outputBits, uint64_t randSeed) : numFuncs(numFunctions) { + MTRand rnd(randSeed); + + if (outputBits <= 8) { + resShift = 3; + } else if (outputBits <= 16) { + resShift = 2; + } else if (outputBits <= 32) { + resShift = 1; + } else if (outputBits <= 64) { + resShift = 0; + } else { + panic("Hash function can't produce more than 64 bits of output!!"); + } + + uint32_t words = 64 >> resShift; + hMatrix = gm_calloc(words*numFuncs); + for (uint32_t ii = 0; ii < numFuncs; ii++) { + for (uint32_t jj = 0; jj < words; jj++) { + uint64_t val = 0; + for (int kk = 0; kk < 64; kk++) { + val = val << 1; + if (rnd.randInt() % 2 == 0) val++; + } + //Indeed, they are distributed around 32, but we might get better mileage by forcing 32b... + //info("H3: Function %d Matrix 64-bit word %d has %d 1s", ii, jj, __builtin_popcountll(val)); + //if (__builtin_popcountll(val) != 32) {jj--; continue;} // no difference + hMatrix[ii*words + jj] = val; + } + } +} + +/* NOTE: This is fairly well hand-optimized. Go to the commit logs to see the speedup of this function. Main things: + * 1. resShift indicates how many bits of output are computed (64, 32, 16, or 8). With less than 64 bits, several rounds are folded at the end. + * 2. The output folding does not mask, the output is expected to be masked by caller. + * 3. The main loop is hand-unrolled and optimized for ILP. + * 4. Pre-computing shifted versions of the input does not help, as it increases register pressure. + * + * For reference, here is the original, simpler code (computes a 64-bit hash): + * for (uint32_t x = 0; x < 64; x++) { + * res ^= val & hMatrix[id*64 + x]; + * res = (res << 1) | (res >> 63); + * } + */ +uint64_t H3HashFamily::hash(uint32_t id, uint64_t val) { + uint64_t res = 0; + assert(id >= 0 && id < numFuncs); + + // 8-way unrolled loop + uint32_t maxBits = 64 >> resShift; + for (uint32_t x = 0; x < maxBits; x+=8) { + uint32_t base = (id << (6 - resShift)) + x; + uint64_t res0 = val & hMatrix[base]; + uint64_t res1 = val & hMatrix[base+1]; + uint64_t res2 = val & hMatrix[base+2]; + uint64_t res3 = val & hMatrix[base+3]; + + uint64_t res4 = val & hMatrix[base+4]; + uint64_t res5 = val & hMatrix[base+5]; + uint64_t res6 = val & hMatrix[base+6]; + uint64_t res7 = val & hMatrix[base+7]; + + res ^= res0 ^ ((res1 << 1) | (res1 >> 63)) ^ ((res2 << 2) | (res2 >> 62)) ^ ((res3 << 3) | (res3 >> 61)); + res ^= ((res4 << 4) | (res4 >> 60)) ^ ((res5 << 5) | (res5 >> 59)) ^ ((res6 << 6) | (res6 >> 58)) ^ ((res7 << 7) | (res7 >> 57)); + res = (res << 8) | (res >> 56); + } + + // Fold bits to match output + switch (resShift) { + case 0: //64-bit output + break; + case 1: //32-bit output + res = (res >> 32) ^ res; + break; + case 2: //16-bit output + res = (res >> 32) ^ res; + res = (res >> 16) ^ res; + break; + case 3: //8-bit output + res = (res >> 32) ^ res; + res = (res >> 16) ^ res; + res = (res >> 8) ^ res; + break; + } + + //info("0x%lx", res); + + return res; +} + +#if _WITH_POLARSSL_ + +#include "polarssl/sha1.h" + +SHA1HashFamily::SHA1HashFamily(int numFunctions) : numFuncs(numFunctions) { + memoizedVal = 0; + numPasses = numFuncs/5 + 1; + memoizedHashes = gm_calloc(numPasses*5); // always > than multiple of buffers +} + +uint64_t SHA1HashFamily::hash(uint32_t id, uint64_t val) { + assert(id >= 0 && id < (uint32_t)numFuncs); + if (val == memoizedVal) { + //info("Memo hit 0x%x", memoizedHashes[id]); + return 
(uint64_t) memoizedHashes[id]; + } else { + uint64_t buffer[16]; + //sha1_context ctx; + for (int i = 0; i < 16; i++) { + buffer[i] = val; + } + + for (int i = 0; i < numPasses; i++) { + if (i > 0) { //change source + for (int j = 0; j < 5; j++) { + buffer[j] ^= memoizedHashes[(i-1)*5 + j]; + } + } + sha1((unsigned char*) buffer, sizeof(buffer), (unsigned char*) &(memoizedHashes[i*5])); + } + /*info("SHA1: 0x%lx:", val); + for (int i = 0; i < numFuncs; i++) { + info(" %d: 0x%x", i, memoizedHashes[i]); + }*/ + + memoizedVal = val; + return (uint64_t) memoizedHashes[id]; + } +} + +#else // _WITH_POLARSSL_ + +SHA1HashFamily::SHA1HashFamily(int numFunctions) { + panic("Cannot use SHA1HashFamily, zsim was not compiled with PolarSSL"); +} + +uint64_t SHA1HashFamily::hash(uint32_t id, uint64_t val) { + panic("???"); + return 0; +} + +#endif // _WITH_POLARSSL_ diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 00000000..7999decb --- /dev/null +++ b/src/hash.h @@ -0,0 +1,69 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef HASH_H_ +#define HASH_H_ + +#include +#include "galloc.h" + +class HashFamily : public GlobAlloc { + public: + HashFamily() {} + virtual ~HashFamily() {} + + virtual uint64_t hash(uint32_t id, uint64_t val) = 0; +}; + +class H3HashFamily : public HashFamily { + private: + uint32_t numFuncs; + uint32_t resShift; + uint64_t* hMatrix; + public: + H3HashFamily(uint32_t numFunctions, uint32_t outputBits, uint64_t randSeed = 123132127); + uint64_t hash(uint32_t id, uint64_t val); +}; + +class SHA1HashFamily : public HashFamily { + private: + int numFuncs; + int numPasses; + + //SHA1 is quite expensive and returns large blocks, so we use memoization and chunk the block to implement hash function families. + uint64_t memoizedVal; + uint32_t* memoizedHashes; + public: + explicit SHA1HashFamily(int numFunctions); + uint64_t hash(uint32_t id, uint64_t val); +}; + +/* Used when we don't want hashing, just return the value */ +class IdHashFamily : public HashFamily { + public: + inline uint64_t hash(uint32_t id, uint64_t val) {return val;} +}; + +#endif // HASH_H_ diff --git a/src/hdf5_stats.cpp b/src/hdf5_stats.cpp new file mode 100644 index 00000000..e3a6e067 --- /dev/null +++ b/src/hdf5_stats.cpp @@ -0,0 +1,265 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. 
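hash.h above is the interface the rest of the simulator sees; as noted in hash.cpp, H3HashFamily does not mask its output, so the caller is expected to truncate the result to the range it needs. A hedged usage sketch, e.g. for picking a cache set from a line address (all parameters are illustrative, and since the constructor allocates its matrix on the global heap, gm_init()/gm_attach() must have run first):

```cpp
#include <stdint.h>
#include "hash.h"

void hash_example(uint64_t lineAddr) {
    const uint32_t numSets = 1024;                   // power of two, so we can mask
    H3HashFamily hf(1 /*numFunctions*/, 10 /*outputBits >= log2(numSets)*/);

    uint32_t set = hf.hash(0 /*function id*/, lineAddr) & (numSets - 1);  // caller masks

    IdHashFamily ident;                              // pass-through "hash" for direct indexing
    uint32_t directSet = ident.hash(0, lineAddr) & (numSets - 1);
    (void)set; (void)directSet;
}
```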
+ * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include "galloc.h" +#include "log.h" +#include "stats.h" +#include "zsim.h" + +/** Implements the HDF5 backend. Creates one big table in the file, and writes one row per dump. + * NOTE: Because dump may be called from multiple processes, we close and open the HDF5 file every dump. + * This is inefficient, but dumps are not that common anyhow, and we get the ability to read hdf5 files mid-simulation. + * (alternatively, we could have an extra thread exclusively dedicated to writing stats). + */ +class HDF5BackendImpl : public GlobAlloc { + private: + const char* filename; + AggregateStat* rootStat; + bool skipVectors; + bool sumRegularAggregates; + + uint64_t* dataBuf; //buffered record data + uint64_t* curPtr; //points to next element to write in dump + uint64_t recordSize; // in bytes + uint32_t recordsPerWrite; //how many records to buffer; determines chunk size as well + + uint32_t bufferedRecords; //number of records buffered (dumped w/o being written), <= recordsPerWrite + + // Always have a single function to determine when to skip a stat to avoid inconsistencies in the code + bool skipStat(Stat* s) { + return skipVectors && dynamic_cast(s); + } + + // Dump the stats, inorder walk + void dumpWalk(Stat* s) { + if (skipStat(s)) return; + if (AggregateStat* as = dynamic_cast(s)) { + if (as->isRegular() && sumRegularAggregates) { + //Dump first record + uint64_t* startPtr = curPtr; + dumpWalk(as->get(0)); + uint64_t* tmpPtr = curPtr; + uint32_t sz = tmpPtr - startPtr; + //Dump others below, and add them up + for (uint32_t i = 1; i < as->size(); i++) { + dumpWalk(as->get(i)); + //Add record with previous ones + assert(curPtr == tmpPtr + sz); + for (uint32_t j = 0; j < sz; j++) startPtr[j] += tmpPtr[j]; + //Rewind + curPtr = tmpPtr; + } + } else { + for (uint32_t i = 0; i < as->size(); i++) { + dumpWalk(as->get(i)); + } + } + } else if (Counter* cs = dynamic_cast(s)) { + *(curPtr++) = cs->count(); + } else if (ScalarStat* ss = dynamic_cast(s)) { + *(curPtr++) = ss->get(); + } else if (VectorStat* vs = dynamic_cast(s)) { + for (uint32_t i = 0; i < vs->size(); i++) { + *(curPtr++) = vs->count(i); + } + } else if (ProxyStat* ps = dynamic_cast(s)) { + *(curPtr++) = ps->stat(); + } else if (ProxyFuncStat* pfs = dynamic_cast(s)) { + *(curPtr++) = pfs->stat(); + } else { + panic("Unrecognized stat type"); + } + } + + //Note this is a local vector, b/c it's only used at initialization. + std::vector uniqueTypes; + + /* Gets an HDF5 type, compares it with every prior unique type, and returns the ID of the type to use. 
+ * Note that it will close the current type if it is a duplicate! + * I'm not sure that this reduces type size (maybe with committed types?). It is good practice though -- + * we don't need thousands of equivalent types flying around inside the HDF5 library, who knows what goes + * inside that place. + */ + hid_t deduplicateH5Type(hid_t type) { + std::vector::iterator it; + for (it = uniqueTypes.begin(); it != uniqueTypes.end(); it++) { + if (*it == type) { + // Avoid closing a type that was registered before + return type; + } + if (H5Tequal(*it, type)) { + //Must check we have created the type before closing it, otherwise the library screams :) + H5T_class_t typeClass = H5Tget_class(type); + if (typeClass == H5T_COMPOUND || typeClass == H5T_ARRAY) { + H5Tclose(type); + } + return *it; + } + } + // This is indeed a new type + uniqueTypes.push_back(type); + return type; + } + + /* Code to create a large compund datatype from an aggregate stat. ALWAYS returns deduplicated types */ + hid_t getH5Type(Stat* stat) { //I'd like to make this functional, but passing a member function as an argument is non-trivial... + AggregateStat* aggrStat = dynamic_cast(stat); + if (aggrStat == NULL) { + return getBaseH5Type(stat); + } else if (aggrStat->isRegular()) { + //This is a regular aggregate, i.e. an array of possibly compound types + assert(aggrStat->size() > 0); + assert(!skipStat(aggrStat->get(0))); //should not happen unless we start skipping compounds in the future. + hid_t childType = getH5Type(aggrStat->get(0)); + //Sanity check + for (uint32_t i = 1; i < aggrStat->size(); i++) { + hid_t otherType = getH5Type(aggrStat->get(i)); //already deduplicated + if (otherType != childType) { + panic("In regular aggregate %s, child %d has a different type than first child. Doesn't look regular to me!", stat->name(), i); + } + } + if (sumRegularAggregates) { + return childType; //this is already deduplicated + } else { + hsize_t dims[] = {aggrStat->size()}; + hid_t res = H5Tarray_create2(childType, 1 /*rank*/, dims); + return deduplicateH5Type(res); + } + } else { + //Irregular aggregate + //First pass to get sizes + size_t size = 0; + for (uint32_t i = 0; i < aggrStat->size(); i++) { + Stat* child = aggrStat->get(i); + if (skipStat(child)) continue; + size += H5Tget_size(getH5Type(child)); + } + hid_t res = H5Tcreate(H5T_COMPOUND, size); + size_t offset = 0; + for (uint32_t i = 0; i < aggrStat->size(); i++) { + Stat* child = aggrStat->get(i); + if (skipStat(child)) continue; + hid_t childType = getH5Type(child); + H5Tinsert(res, child->name(), offset, childType); + offset += H5Tget_size(childType); + } + assert(size == offset); + return deduplicateH5Type(res); + } + } + + /* Return type of non-aggregates. ALWAYS returns deduplicated types. 
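+ * (Scalar stats map to H5T_NATIVE_ULONG; VectorStats map to a fixed-length array of it, sized by vs->size().)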
*/ + hid_t getBaseH5Type(Stat* s) { + assert(dynamic_cast(s) == NULL); //this can't be an aggregate + hid_t res; + uint32_t size = 1; //scalar by default + if (VectorStat* vs = dynamic_cast(s)) { + size = vs->size(); + } + if (size > 1) { + hsize_t dims[] = {size}; + res = H5Tarray_create2(H5T_NATIVE_ULONG, 1 /*rank*/, dims); + } else { + assert(size == 1); + res = H5T_NATIVE_ULONG; + } + return deduplicateH5Type(res); + } + + public: + HDF5BackendImpl(const char* _filename, AggregateStat* _rootStat, size_t _bytesPerWrite, bool _skipVectors, bool _sumRegularAggregates) : + filename(_filename), rootStat(_rootStat), skipVectors(_skipVectors), sumRegularAggregates(_sumRegularAggregates) + { + // Create stats file + info("HDF5 backend: Opening %s", filename); + hid_t fileID = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + + hid_t rootType = getH5Type(rootStat); + + //TODO: Use of table interface is vestigial at this point. Just create the dataset... + size_t fieldOffsets[] = {0}; + hid_t fieldTypes[] = {rootType}; + const char* fieldNames[] = {rootStat->name()}; + recordSize = H5Tget_size(rootType); + + recordsPerWrite = _bytesPerWrite/recordSize + 1; + + herr_t hErrVal = H5TBmake_table("stats", fileID, "stats", + 1 /*# fields*/, 0 /*# records*/, + recordSize, fieldNames, fieldOffsets, fieldTypes, + recordsPerWrite /*chunk size, in records, might as well be our aggregation degree*/, + NULL, 0 /*compression*/, NULL); + assert(hErrVal == 0); + + size_t bufSize = recordsPerWrite*recordSize; + if (sumRegularAggregates) bufSize += recordSize; //conservatively add space for a record. See dumpWalk(), we bleed into the buffer a bit when dumping a regular aggregate. + dataBuf = static_cast(gm_malloc(bufSize)); + curPtr = dataBuf; + + bufferedRecords = 0; + + info("HDF5 backend: Created table, %ld bytes/record, %d records/write", recordSize, recordsPerWrite); + H5Fclose(fileID); + } + + ~HDF5BackendImpl() {} + + void dump(bool buffered) { + // Copy stats to data buffer + dumpWalk(rootStat); + bufferedRecords++; + + assert_msg(dataBuf + bufferedRecords*recordSize/sizeof(uint64_t) == curPtr, "HDF5 (%s): %p + %d * %ld / %ld != %p", filename, dataBuf, bufferedRecords, recordSize, sizeof(uint64_t), curPtr); + + // Write to table if needed + if (bufferedRecords == recordsPerWrite || !buffered) { + hid_t fileID = H5Fopen(filename, H5F_ACC_RDWR, H5P_DEFAULT); + + size_t fieldOffsets[] = {0}; + size_t fieldSizes[] = {recordSize}; + H5TBappend_records(fileID, "stats", bufferedRecords, recordSize, fieldOffsets, fieldSizes, dataBuf); + H5Fclose(fileID); + + //Rewind + bufferedRecords = 0; + curPtr = dataBuf; + } + } +}; + + +HDF5Backend::HDF5Backend(const char* filename, AggregateStat* rootStat, size_t bytesPerWrite, bool skipVectors, bool sumRegularAggregates) { + backend = new HDF5BackendImpl(filename, rootStat, bytesPerWrite, skipVectors, sumRegularAggregates); +} + +void HDF5Backend::dump(bool buffered) { + backend->dump(buffered); +} + diff --git a/src/ideal_arrays.h b/src/ideal_arrays.h new file mode 100644 index 00000000..746d38a1 --- /dev/null +++ b/src/ideal_arrays.h @@ -0,0 +1,284 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef IDEAL_ARRAYS_H_ +#define IDEAL_ARRAYS_H_ + +#include "cache_arrays.h" +#include "g_std/g_unordered_map.h" +#include "intrusive_list.h" +#include "part_repl_policies.h" +#include "repl_policies.h" + +/* Fully associative cache arrays with LRU replacement (non-part; part coming up) */ + +//We use a combination of a hash table and an intrusive list to perform fully-associative lookups and insertions in O(1) time +//TODO: Post-deadline, make it a single array with a rank(req) interface + +class IdealLRUArray : public CacheArray { + private: + //We need a fake replpolicy and just want the CC... + class ProxyReplPolicy : public ReplPolicy { + private: + IdealLRUArray* a; + public: + explicit ProxyReplPolicy(IdealLRUArray* _a) : a(_a) {} + void setCC(CC* _cc) {a->setCC(cc);} + + void update(uint32_t id, const MemReq* req) {panic("!")} + void replaced(uint32_t id) {panic("!!");} + template uint32_t rank(const MemReq* req, C cands) {panic("!!!");} + void initStats(AggregateStat* parent) {} + DECL_RANK_BINDINGS + }; + + struct Entry : InListNode { + Address lineAddr; + const uint32_t lineId; + explicit Entry(uint32_t _lineId) : lineAddr(0), lineId(_lineId) {} + }; + + Entry* array; + InList lruList; + g_unordered_map lineMap; //address->lineId; if too slow, try an AATree, which does not alloc dynamically + + uint32_t numLines; + ProxyReplPolicy* rp; + CC* cc; + + public: + explicit IdealLRUArray(uint32_t _numLines) : numLines(_numLines), cc(NULL) { + array = gm_calloc(numLines); + for (uint32_t i = 0; i < numLines; i++) { + Entry* e = new (&array[i]) Entry(i); + lruList.push_front(e); + } + rp = new ProxyReplPolicy(this); + } + + int32_t lookup(const Address lineAddr, const MemReq* req, bool updateReplacement) { + g_unordered_map::iterator it = lineMap.find(lineAddr); + if (it == lineMap.end()) return -1; + + uint32_t lineId = it->second; + if (updateReplacement) { + lruList.remove(&array[lineId]); + lruList.push_front(&array[lineId]); + } + return lineId; + } + + uint32_t preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr) { + Entry* e = lruList.back(); + *wbLineAddr = e->lineAddr; + return e->lineId; + } + + void postinsert(const Address lineAddr, const MemReq* req, uint32_t lineId) { + Entry* e = &array[lineId]; + + //Update addr mapping for lineId + lineMap.erase(e->lineAddr); + assert(lineMap.find(lineAddr) == lineMap.end()); + e->lineAddr = lineAddr; + lineMap[lineAddr] = lineId; + + //Update repl + lruList.remove(e); + lruList.push_front(e); + } + + ReplPolicy* getRP() const {return rp;} + void setCC(CC* _cc) {cc = _cc;} +}; + +//Goes with IdealLRUPartArray +class IdealLRUPartReplPolicy : public PartReplPolicy { + protected: + struct Entry : InListNode { + const uint32_t lineId; + uint32_t p; + bool used; //careful, true except when just evicted, 
even if invalid + Entry(uint32_t _id, uint32_t _p) : lineId(_id), p(_p), used(true) {} + }; + + struct IdPartInfo : public PartInfo { + InList lruList; + }; + + Entry* array; + IdPartInfo* partInfo; + uint32_t partitions; + uint32_t numLines; + uint32_t numBuckets; + + public: + IdealLRUPartReplPolicy(PartitionMonitor* _monitor, PartMapper* _mapper, uint32_t _numLines, uint32_t _numBuckets) : PartReplPolicy(_monitor, _mapper), numLines(_numLines), numBuckets(_numBuckets) { + partitions = mapper->getNumPartitions(); + partInfo = gm_calloc(partitions); + + for (uint32_t p = 0; p <= partitions; p++) { + new (&partInfo[p]) IdPartInfo(); + partInfo[p].targetSize = numLines/partitions; + partInfo[p].size = 0; + } + + array = gm_calloc(numLines); + for (uint32_t i = 0; i < numLines; i++) { + Entry* e = new (&array[i]) Entry(i, 0); + partInfo[0].lruList.push_front(e); + partInfo[0].size++; + } + } + + void initStats(AggregateStat* parentStat) { + AggregateStat* rpStat = new AggregateStat(); + rpStat->init("part", "IdealLRUPart replacement policy stats"); + ProxyStat* pStat; + for (uint32_t p = 0; p < partitions; p++) { + std::stringstream pss; + pss << "part-" << p; + AggregateStat* partStat = new AggregateStat(); + partStat->init(gm_strdup(pss.str().c_str()), "Partition stats"); + pStat = new ProxyStat(); pStat->init("sz", "Actual size", &partInfo[p].size); partStat->append(pStat); + pStat = new ProxyStat(); pStat->init("tgtSz", "Target size", &partInfo[p].targetSize); partStat->append(pStat); + partInfo[p].profHits.init("hits", "Hits"); partStat->append(&partInfo[p].profHits); + partInfo[p].profMisses.init("misses", "Misses"); partStat->append(&partInfo[p].profMisses); + partInfo[p].profSelfEvictions.init("selfEvs", "Evictions caused by us"); partStat->append(&partInfo[p].profSelfEvictions); + partInfo[p].profExtEvictions.init("extEvs", "Evictions caused by others (in transients)"); partStat->append(&partInfo[p].profExtEvictions); + rpStat->append(partStat); + } + parentStat->append(rpStat); + } + + void setPartitionSizes(const uint32_t* sizes) { + for (uint32_t p = 0; p < partitions; p++) { + partInfo[p].targetSize = (sizes[p]*numLines)/numBuckets; + } + } + + void update(uint32_t id, const MemReq* req) { + Entry* e = &array[id]; + if (e->used) { + partInfo[e->p].profHits.inc(); + partInfo[e->p].lruList.remove(e); + partInfo[e->p].lruList.push_front(e); + } else { + uint32_t oldPart = e->p; + uint32_t newPart = mapper->getPartition(*req); + if (oldPart != newPart) { + partInfo[oldPart].size--; + partInfo[oldPart].profExtEvictions.inc(); + partInfo[newPart].size++; + } else { + partInfo[oldPart].profSelfEvictions.inc(); + } + partInfo[newPart].profMisses.inc(); + e->p = newPart; + partInfo[oldPart].lruList.remove(e); + partInfo[newPart].lruList.push_front(e); + e->used = true; + } + + //Update partitioner + monitor->access(e->p, req->lineAddr); + } + + void replaced(uint32_t id) { + array[id].used = false; + } + + uint32_t rank(const MemReq* req) { + //Choose part to evict from as a part with highest *proportional* diff between tgt and actual sizes (minimize/smooth transients); if all parts are within limits, evict from own + uint32_t victimPart = mapper->getPartition(*req); + double maxPartDiff = 0.0; + if (partInfo[victimPart].size == 0) maxPartDiff = -2.0; //force a > 0-size partition + for (uint32_t p = 0; p < partitions; p++) { + double diff = ((int32_t)partInfo[p].size - (int32_t)partInfo[p].targetSize)/((double)(partInfo[p].targetSize + 1)); + //info("YYY %d %f", p, diff); + if (diff > 
maxPartDiff && partInfo[p].size > 0) { + maxPartDiff = diff; + victimPart = p; + } + } + //assert(maxPartDiff >= -1e-8, "Evicting from non-full line! diff=%f victimPart %d (sz %d tgt %d) origPart %d", ); //someone must be over... + if (maxPartDiff < -1e-8) { + warn("Evicting from non-full part! diff=%f victimPart %d (sz %ld tgt %ld) origPart %d", + maxPartDiff, victimPart, partInfo[victimPart].size, partInfo[victimPart].targetSize, mapper->getPartition(*req)); + } + + //info("rp: %d / %d %d / %d %d", victimPart, partInfo[0].size, partInfo[0].targetSize, partInfo[1].size, partInfo[1].targetSize); + assert(partInfo[victimPart].size > 0); + assert(partInfo[victimPart].size == partInfo[victimPart].lruList.size()); + return partInfo[victimPart].lruList.back()->lineId; + } + + template uint32_t rank(const MemReq* req, C cands) {panic("!!");} + DECL_RANK_BINDINGS; +}; + +class IdealLRUPartArray : public CacheArray { + private: + g_unordered_map lineMap; //address->lineId; if too slow, try an AATree, which does not alloc dynamically + Address* lineAddrs; //lineId -> address, for replacements + IdealLRUPartReplPolicy* rp; + uint32_t numLines; + + public: + IdealLRUPartArray(uint32_t _numLines, IdealLRUPartReplPolicy* _rp) : rp(_rp), numLines(_numLines) { + lineAddrs = gm_calloc
(numLines); + } + + int32_t lookup(const Address lineAddr, const MemReq* req, bool updateReplacement) { + g_unordered_map::iterator it = lineMap.find(lineAddr); + if (it == lineMap.end()) return -1; + + uint32_t lineId = it->second; + if (updateReplacement) { + rp->update(lineId, req); + } + return lineId; + } + + uint32_t preinsert(const Address lineAddr, const MemReq* req, Address* wbLineAddr) { + uint32_t lineId = rp->rank(req); + *wbLineAddr = lineAddrs[lineId]; + return lineId; + } + + void postinsert(const Address lineAddr, const MemReq* req, uint32_t lineId) { + //Update addr mapping for lineId + lineMap.erase(lineAddrs[lineId]); + assert(lineMap.find(lineAddr) == lineMap.end()); + lineAddrs[lineId] = lineAddr; + lineMap[lineAddr] = lineId; + + //Update repl + rp->replaced(lineId); + rp->update(lineId, req); + } +}; + +#endif // IDEAL_ARRAYS_H_ diff --git a/src/init.cpp b/src/init.cpp new file mode 100644 index 00000000..5cdf4e1f --- /dev/null +++ b/src/init.cpp @@ -0,0 +1,921 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "init.h" +#include +#include +#include +#include +#include +#include +#include "cache.h" +#include "cache_arrays.h" +#include "config.h" +#include "constants.h" +#include "contention_sim.h" +#include "core.h" +#include "detailed_mem.h" +#include "detailed_mem_params.h" +#include "ddr_mem.h" +#include "debug_zsim.h" +#include "dramsim_mem_ctrl.h" +#include "event_queue.h" +#include "filter_cache.h" +#include "galloc.h" +#include "hash.h" +#include "ideal_arrays.h" +#include "locks.h" +#include "log.h" +#include "mem_ctrls.h" +#include "network.h" +#include "null_core.h" +#include "ooo_core.h" +#include "part_repl_policies.h" +#include "pin_cmd.h" +#include "prefetcher.h" +#include "process_stats.h" +#include "process_tree.h" +#include "profile_stats.h" +#include "repl_policies.h" +#include "scheduler.h" +#include "simple_core.h" +#include "stats.h" +#include "stats_filter.h" +#include "timing_cache.h" +#include "timing_core.h" +#include "timing_event.h" +#include "virt/port_virtualizer.h" +#include "weave_md1_mem.h" //validation, could be taken out... +#include "zsim.h" + +extern void EndOfPhaseActions(); //in zsim.cpp + +/* zsim should be initialized in a deterministic and logical order, to avoid re-reading config vars + * all over the place and give a predictable global state to constructors. Ideally, this should just + * follow the layout of zinfo, top-down. 
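+ * (The Build* functions below are plain constructors driven by the config prefix they are handed; InitSystem then wires the resulting cache and memory tree together.)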
+ */ + +BaseCache* BuildCacheBank(Config& config, const string& prefix, g_string& name, uint32_t bankSize, bool isTerminal, uint32_t domain) { + uint32_t lineSize = zinfo->lineSize; + assert(lineSize > 0); //avoid config deps + if (bankSize % lineSize != 0) panic("%s: Bank size must be a multiple of line size", name.c_str()); + + uint32_t numLines = bankSize/lineSize; + + //Array + uint32_t numHashes = 1; + uint32_t ways = config.get(prefix + "array.ways", 4); + string arrayType = config.get(prefix + "array.type", "SetAssoc"); + uint32_t candidates = (arrayType == "Z")? config.get(prefix + "array.candidates", 16) : ways; + + //Need to know number of hash functions before instantiating array + if (arrayType == "SetAssoc") { + numHashes = 1; + } else if (arrayType == "Z") { + numHashes = ways; + assert(ways > 1); + } else if (arrayType == "IdealLRU" || arrayType == "IdealLRUPart") { + ways = numLines; + numHashes = 0; + } else { + panic("%s: Invalid array type %s", name.c_str(), arrayType.c_str()); + } + + // Power of two sets check; also compute setBits, will be useful later + uint32_t numSets = numLines/ways; + uint32_t setBits = 31 - __builtin_clz(numSets); + if ((1u << setBits) != numSets) panic("%s: Number of sets must be a power of two (you specified %d sets)", name.c_str(), numSets); + + //Hash function + HashFamily* hf = NULL; + string hashType = config.get(prefix + "array.hash", (arrayType == "Z")? "H3" : "None"); //zcaches must be hashed by default + if (numHashes) { + if (hashType == "None") { + if (arrayType == "Z") panic("ZCaches must be hashed!"); //double check for stupid user + assert(numHashes == 1); + hf = new IdHashFamily; + } else if (hashType == "H3") { + //STL hash function + size_t seed = _Fnv_hash_bytes(prefix.c_str(), prefix.size()+1, 0xB4AC5B); + //info("%s -> %lx", prefix.c_str(), seed); + hf = new H3HashFamily(numHashes, setBits, 0xCAC7EAFFA1 + seed /*make randSeed depend on prefix*/); + } else if (hashType == "SHA1") { + hf = new SHA1HashFamily(numHashes); + } else { + panic("%s: Invalid value %s on array.hash", name.c_str(), hashType.c_str()); + } + } + + //Replacement policy + string replType = config.get(prefix + "repl.type", (arrayType == "IdealLRUPart")? "IdealLRUPart" : "LRU"); + ReplPolicy* rp = NULL; + + if (replType == "LRU" || replType == "LRUNoSh") { + bool sharersAware = (replType == "LRU") && !isTerminal; + if (sharersAware) { + rp = new LRUReplPolicy(numLines); + } else { + rp = new LRUReplPolicy(numLines); + } + } else if (replType == "LFU") { + rp = new LFUReplPolicy(numLines); + } else if (replType == "LRUProfViol") { + ProfViolReplPolicy< LRUReplPolicy >* pvrp = new ProfViolReplPolicy< LRUReplPolicy >(numLines); + pvrp->init(numLines); + rp = pvrp; + } else if (replType == "TreeLRU") { + rp = new TreeLRUReplPolicy(numLines, candidates); + } else if (replType == "NRU") { + rp = new NRUReplPolicy(numLines, candidates); + } else if (replType == "Rand") { + rp = new RandReplPolicy(candidates); + } else if (replType == "WayPart" || replType == "Vantage" || replType == "IdealLRUPart") { + if (replType == "WayPart" && arrayType != "SetAssoc") panic("WayPart replacement requires SetAssoc array"); + + //Partition mapper + // TODO: One partition mapper per cache (not bank). + string partMapper = config.get(prefix + "repl.partMapper", "Core"); + PartMapper* pm = NULL; + if (partMapper == "Core") { + pm = new CorePartMapper(zinfo->numCores); //NOTE: If the cache is not fully shared, trhis will be inefficient... 
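+ // (CorePartMapper always creates one partition per core in the system, so a bank that only serves a subset of the cores carries partitions that are never used.)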
+ } else if (partMapper == "InstrData") { + pm = new InstrDataPartMapper(); + } else if (partMapper == "InstrDataCore") { + pm = new InstrDataCorePartMapper(zinfo->numCores); + } else if (partMapper == "Process") { + pm = new ProcessPartMapper(zinfo->numProcs); + } else if (partMapper == "InstrDataProcess") { + pm = new InstrDataProcessPartMapper(zinfo->numProcs); + } else if (partMapper == "ProcessGroup") { + pm = new ProcessGroupPartMapper(); + } else { + panic("Invalid repl.partMapper %s on %s", partMapper.c_str(), name.c_str()); + } + + // Partition monitor + uint32_t umonLines = config.get(prefix + "repl.umonLines", 256); + uint32_t umonWays = config.get(prefix + "repl.umonWays", ways); + uint32_t buckets; + if (replType == "WayPart") { + buckets = ways; //not an option with WayPart + } else { //Vantage or Ideal + buckets = config.get(prefix + "repl.buckets", 256); + } + + PartitionMonitor* mon = new UMonMonitor(numLines, umonLines, umonWays, pm->getNumPartitions(), buckets); + + //Finally, instantiate the repl policy + PartReplPolicy* prp; + double allocPortion = 1.0; + if (replType == "WayPart") { + //if set, drives partitioner but doesn't actually do partitioning + bool testMode = config.get(prefix + "repl.testMode", false); + prp = new WayPartReplPolicy(mon, pm, numLines, ways, testMode); + } else if (replType == "IdealLRUPart") { + prp = new IdealLRUPartReplPolicy(mon, pm, numLines, buckets); + } else { //Vantage + uint32_t assoc = (arrayType == "Z")? candidates : ways; + allocPortion = .85; + bool smoothTransients = config.get(prefix + "repl.smoothTransients", false); + prp = new VantageReplPolicy(mon, pm, numLines, assoc, (uint32_t)(allocPortion * 100), 10, 50, buckets, smoothTransients); + } + rp = prp; + + // Partitioner + // TODO: Depending on partitioner type, we want one per bank or one per cache. + Partitioner* p = new LookaheadPartitioner(prp, pm->getNumPartitions(), buckets, 1, allocPortion); + + //Schedule its tick + uint32_t interval = config.get(prefix + "repl.interval", 5000); //phases + zinfo->eventQueue->insert(new Partitioner::PartitionEvent(p, interval)); + } else { + panic("%s: Invalid replacement type %s", name.c_str(), replType.c_str()); + } + assert(rp); + + + //Alright, build the array + CacheArray* array = NULL; + if (arrayType == "SetAssoc") { + array = new SetAssocArray(numLines, ways, rp, hf); + } else if (arrayType == "Z") { + array = new ZArray(numLines, ways, candidates, rp, hf); + } else if (arrayType == "IdealLRU") { + assert(replType == "LRU"); + assert(!hf); + IdealLRUArray* ila = new IdealLRUArray(numLines); + rp = ila->getRP(); + array = ila; + } else if (arrayType == "IdealLRUPart") { + assert(!hf); + IdealLRUPartReplPolicy* irp = dynamic_cast(rp); + if (!irp) panic("IdealLRUPart array needs IdealLRUPart repl policy!"); + array = new IdealLRUPartArray(numLines, irp); + } else { + panic("This should not happen, we already checked for it!"); //unless someone changed arrayStr... + } + + //Latency + uint32_t latency = config.get(prefix + "latency", 10); + uint32_t accLat = (isTerminal)? 
0 : latency; //terminal caches has no access latency b/c it is assumed accLat is hidden by the pipeline + uint32_t invLat = latency; + + //Type and inclusion + string type = config.get(prefix + "type", "Simple"); + bool nonInclusiveHack = config.get(prefix + "nonInclusiveHack", false); + if (nonInclusiveHack) assert(type == "Simple" && !isTerminal); + + //Finally, build the cache + Cache* cache; + CC* cc; + if (isTerminal) { + cc = new MESITerminalCC(numLines, name); + } else { + cc = new MESICC(numLines, nonInclusiveHack, name); + } + rp->setCC(cc); + if (!isTerminal) { + if (type == "Simple") { + cache = new Cache(numLines, cc, array, rp, accLat, invLat, name); + } else if (type == "Timing") { + uint32_t mshrs = config.get(prefix + "mshrs", 16); + uint32_t tagLat = config.get(prefix + "tagLat", 5); + uint32_t timingCandidates = config.get(prefix + "timingCandidates", candidates); + cache = new TimingCache(numLines, cc, array, rp, accLat, invLat, mshrs, tagLat, ways, timingCandidates, domain, name); + } else { + panic("Invalid cache type %s", type.c_str()); + } + } else { + //Filter cache optimization + if (type != "Simple") panic("Terminal cache %s can only have type == Simple", name.c_str()); + if (arrayType != "SetAssoc" || hashType != "None" || replType != "LRU") panic("Invalid FilterCache config %s", name.c_str()); + cache = new FilterCache(numSets, numLines, cc, array, rp, accLat, invLat, name); + } + +#if 0 + info("Built L%d bank, %d bytes, %d lines, %d ways (%d candidates if array is Z), %s array, %s hash, %s replacement, accLat %d, invLat %d name %s", + level, bankSize, numLines, ways, candidates, arrayType.c_str(), hashType.c_str(), replType.c_str(), accLat, invLat, name.c_str()); +#endif + + return cache; +} + +MemObject* BuildMemoryController(Config& config, uint32_t lineSize, uint32_t frequency, uint32_t domain, g_string& name) { + //Type + string type = config.get("sys.mem.type", "Simple"); + + //Latency + uint32_t latency = (type == "DDR")? -1 : config.get("sys.mem.latency", 100); + + MemObject* mem = NULL; + if (type == "Simple") { + mem = new SimpleMemory(latency, name); + } else if (type == "MD1") { + // The following params are for MD1 only + // NOTE: Frequency (in MHz) -- note this is a sys parameter (not sys.mem). 
There is an implicit assumption of having + // a single CCT across the system, and we are dealing with latencies in *core* clock cycles + + // Peak bandwidth (in MB/s) + uint32_t bandwidth = config.get("sys.mem.bandwidth", 6400); + + mem = new MD1Memory(lineSize, frequency, bandwidth, latency, name); + } else if (type == "WeaveMD1") { + uint32_t bandwidth = config.get("sys.mem.bandwidth", 6400); + uint32_t boundLatency = config.get("sys.mem.boundLatency", latency); + mem = new WeaveMD1Memory(lineSize, frequency, bandwidth, latency, boundLatency, domain, name); + } else if (type == "WeaveSimple") { + uint32_t boundLatency = config.get("sys.mem.boundLatency", 100); + mem = new WeaveSimpleMemory(latency, boundLatency, domain, name); + } else if (type == "DDR") { + uint32_t ranksPerChannel = config.get("sys.mem.ranksPerChannel", 4); + uint32_t banksPerRank = config.get("sys.mem.banksPerRank", 8); // DDR3 std is 8 + uint32_t pageSize = config.get("sys.mem.pageSize", 8*1024); // 1Kb cols, x4 devices + const char* tech = config.get("sys.mem.tech", "DDR3-1333-CL10"); // see cpp file for other techs + const char* addrMapping = config.get("sys.mem.addrMapping", "rank:col:bank"); // address splitter interleaves channels; row always on top + + // If set, writes are deferred and bursted out to reduce WTR overheads + bool deferWrites = config.get("sys.mem.deferWrites", true); + bool closedPage = config.get("sys.mem.closedPage", true); + + // Max row hits before we stop prioritizing further row hits to this bank. + // Balances throughput and fairness; 0 -> FCFS / high (e.g., -1) -> pure FR-FCFS + uint32_t maxRowHits = config.get("sys.mem.maxRowHits", 4); + + // Request queues + uint32_t queueDepth = config.get("sys.mem.queueDepth", 16); + uint32_t controllerLatency = config.get("sys.mem.controllerLatency", 10); // in system cycles + + mem = new DDRMemory(zinfo->lineSize, pageSize, ranksPerChannel, banksPerRank, frequency, tech, + addrMapping, controllerLatency, queueDepth, maxRowHits, deferWrites, closedPage, domain, name); + } else if (type == "DRAMSim") { + uint64_t cpuFreqHz = 1000000 * frequency; + uint32_t capacity = config.get("sys.mem.capacityMB", 16384); + string dramTechIni = config.get("sys.mem.techIni"); + string dramSystemIni = config.get("sys.mem.systemIni"); + string outputDir = config.get("sys.mem.outputDir"); + string traceName = config.get("sys.mem.traceName"); + mem = new DRAMSimMemory(dramTechIni, dramSystemIni, outputDir, traceName, capacity, cpuFreqHz, latency, domain, name); + } else if (type == "Detailed") { + // FIXME(dsm): Don't use a separate config file... see DDRMemory + g_string mcfg = config.get("sys.mem.paramFile", ""); + mem = new MemControllerBase(mcfg, lineSize, frequency, domain, name); + } else { + panic("Invalid memory controller type %s", type.c_str()); + } + return mem; +} + +typedef vector> CacheGroup; + +CacheGroup* BuildCacheGroup(Config& config, const string& name, bool isTerminal) { + CacheGroup* cgp = new CacheGroup; + CacheGroup& cg = *cgp; + + string prefix = "sys.caches." 
+ name + "."; + + bool isPrefetcher = config.get(prefix + "isPrefetcher", false); + if (isPrefetcher) { //build a prefetcher group + uint32_t prefetchers = config.get(prefix + "prefetchers", 1); + cg.resize(prefetchers); + for (vector& bg : cg) bg.resize(1); + for (uint32_t i = 0; i < prefetchers; i++) { + stringstream ss; + ss << name << "-" << i; + g_string pfName(ss.str().c_str()); + cg[i][0] = new StreamPrefetcher(pfName); + } + return cgp; + } + + uint32_t size = config.get(prefix + "size", 64*1024); + uint32_t banks = config.get(prefix + "banks", 1); + uint32_t caches = config.get(prefix + "caches", 1); + + uint32_t bankSize = size/banks; + if (size % banks != 0) { + panic("%s: banks (%d) does not divide the size (%d bytes)", name.c_str(), banks, size); + } + + cg.resize(caches); + for (vector& bg : cg) bg.resize(banks); + + for (uint32_t i = 0; i < caches; i++) { + for (uint32_t j = 0; j < banks; j++) { + stringstream ss; + ss << name << "-" << i; + if (banks > 1) { + ss << "b" << j; + } + g_string bankName(ss.str().c_str()); + uint32_t domain = (i*banks + j)*zinfo->numDomains/(caches*banks); //(banks > 1)? nextDomain() : (i*banks + j)*zinfo->numDomains/(caches*banks); + cg[i][j] = BuildCacheBank(config, prefix, bankName, bankSize, isTerminal, domain); + } + } + + return cgp; +} + +static void InitSystem(Config& config) { + unordered_map parentMap; //child -> parent + unordered_map> childMap; //parent -> children (a parent may have multiple children, they are ordered by appearance in the file) + + //If a network file is specificied, build a Network + string networkFile = config.get("sys.networkFile", ""); + Network* network = (networkFile != "")? new Network(networkFile.c_str()) : NULL; + + //Build the caches + vector cacheGroupNames; + config.subgroups("sys.caches", cacheGroupNames); + string prefix = "sys.caches."; + + for (const char* grp : cacheGroupNames) { + string group(grp); + if (group == "mem") panic("'mem' is an invalid cache group name"); + if (parentMap.count(group)) panic("Duplicate cache group %s", (prefix + group).c_str()); + string parent = config.get(prefix + group + ".parent"); + parentMap[group] = parent; + if (!childMap.count(parent)) childMap[parent] = vector(); + childMap[parent].push_back(group); + } + + //Check that all parents are valid: Either another cache, or "mem" + for (const char* grp : cacheGroupNames) { + string group(grp); + string parent = parentMap[group]; + if (parent != "mem" && !parentMap.count(parent)) panic("%s has invalid parent %s", (prefix + group).c_str(), parent.c_str()); + } + + //Get the (single) LLC + if (!childMap.count("mem")) panic("One cache must have mem as parent, none found"); + if (childMap["mem"].size() != 1) panic("One cache must have mem as parent, multiple found"); + string llc = childMap["mem"][0]; + + //Build each of the groups, starting with the LLC + unordered_map cMap; + list fringe; //FIFO + fringe.push_back(llc); + while (!fringe.empty()) { + string group = fringe.front(); + fringe.pop_front(); + + bool isTerminal = (childMap.count(group) == 0); //if no children, connected to cores + if (cMap.count(group)) panic("The cache 'tree' has a loop at %s", group.c_str()); + cMap[group] = BuildCacheGroup(config, group, isTerminal); + if (!isTerminal) for (string child : childMap[group]) fringe.push_back(child); + } + + //Check single LLC + if (cMap[llc]->size() != 1) panic("Last-level cache %s must have caches = 1, but %ld were specified", llc.c_str(), cMap[llc]->size()); + + /* Since we have checked for no loops, parent is 
mandatory, and all parents are checked valid, + * it follows that we have a fully connected tree finishing at the LLC. + */ + + //Build the memory controllers + uint32_t memControllers = config.get("sys.mem.controllers", 1); + assert(memControllers > 0); + + g_vector mems; + mems.resize(memControllers); + + for (uint32_t i = 0; i < memControllers; i++) { + stringstream ss; + ss << "mem-" << i; + g_string name(ss.str().c_str()); + //uint32_t domain = nextDomain(); //i*zinfo->numDomains/memControllers; + uint32_t domain = i*zinfo->numDomains/memControllers; + mems[i] = BuildMemoryController(config, zinfo->lineSize, zinfo->freqMHz, domain, name); + } + + if (memControllers > 1) { + bool splitAddrs = config.get("sys.mem.splitAddrs", true); + if (splitAddrs) { + MemObject* splitter = new SplitAddrMemory(mems, "mem-splitter"); + mems.resize(1); + mems[0] = splitter; + } + } + + //Connect everything + + // mem to llc is a bit special, only one llc + uint32_t childId = 0; + for (BaseCache* llcBank : (*cMap[llc])[0]) { + llcBank->setParents(childId++, mems, network); + } + + // Rest of caches + for (const char* grp : cacheGroupNames) { + if (childMap.count(grp) == 0) continue; //skip terminal caches + + CacheGroup& parentCaches = *cMap[grp]; + uint32_t parents = parentCaches.size(); + assert(parents); + + //Concatenation of all child caches + CacheGroup childCaches; + for (string child : childMap[grp]) childCaches.insert(childCaches.end(), cMap[child]->begin(), cMap[child]->end()); + + uint32_t children = childCaches.size(); + assert(children); + + uint32_t childrenPerParent = children/parents; + if (children % parents != 0) { + panic("%s has %d caches and %d children, they are non-divisible. " + "Use multiple groups for non-homogeneous children per parent!", grp, parents, children); + } + + //HACK FIXME: This solves the L1I+D-L2 connection bug, but it's not very clear. + //A long-term solution is to specify whether the children should be interleaved or concatenated. + bool terminalChildren = true; + for (string child : childMap[grp]) terminalChildren &= (childMap.count(child) == 0 || config.get("sys.caches." 
+ child + ".isPrefetcher", false)); + if (terminalChildren) { + info("%s's children are all terminal OR PREFETCHERS, interleaving them", grp); + CacheGroup tmp(childCaches); + uint32_t stride = children/childrenPerParent; + for (uint32_t i = 0; i < children; i++) childCaches[i] = tmp[(i % childrenPerParent)*stride + i/childrenPerParent]; + } + + for (uint32_t p = 0; p < parents; p++) { + g_vector parentsVec; + parentsVec.insert(parentsVec.end(), parentCaches[p].begin(), parentCaches[p].end()); //BaseCache* to MemObject* is a safe cast + + uint32_t childId = 0; + g_vector childrenVec; + for (uint32_t c = p*childrenPerParent; c < (p+1)*childrenPerParent; c++) { + for (BaseCache* bank : childCaches[c]) { + bank->setParents(childId++, parentsVec, network); + childrenVec.push_back(bank); + } + } + + for (BaseCache* bank : parentCaches[p]) { + bank->setChildren(childrenVec, network); + } + } + } + + //Check that all the terminal caches have a single bank + for (const char* grp : cacheGroupNames) { + if (childMap.count(grp) == 0) { + uint32_t banks = (*cMap[grp])[0].size(); + if (banks != 1) panic("Terminal cache group %s needs to have a single bank, has %d", grp, banks); + } + } + + //Tracks how many terminal caches have been allocated to cores + unordered_map assignedCaches; + for (const char* grp : cacheGroupNames) if (childMap.count(grp) == 0) assignedCaches[grp] = 0; + + //Instantiate the cores + vector coreGroupNames; + unordered_map > coreMap; + + config.subgroups("sys.cores", coreGroupNames); + + uint32_t coreIdx = 0; + for (const char* group : coreGroupNames) { + if (parentMap.count(group)) panic("Core group name %s is invalid, a cache group already has that name", group); + + coreMap[group] = vector(); + + string prefix = string("sys.cores.") + group + "."; + uint32_t cores = config.get(prefix + "cores", 1); + string type = config.get(prefix + "type", "Simple"); + + //Build the core group + union { + SimpleCore* simpleCores; + TimingCore* timingCores; + OOOCore* oooCores; + NullCore* nullCores; + }; + if (type == "Simple") { + simpleCores = gm_memalign(CACHE_LINE_BYTES, cores); + } else if (type == "Timing") { + timingCores = gm_memalign(CACHE_LINE_BYTES, cores); + } else if (type == "OOO") { + oooCores = gm_memalign(CACHE_LINE_BYTES, cores); + zinfo->oooDecode = true; //enable uop decoding, this is false by default, must be true if even one OOO cpu is in the system + } else if (type == "Null") { + nullCores = gm_memalign(CACHE_LINE_BYTES, cores); + } else { + panic("%s: Invalid core type %s", group, type.c_str()); + } + + if (type != "Null") { + string icache = config.get(prefix + "icache"); + string dcache = config.get(prefix + "dcache"); + + if (!assignedCaches.count(icache)) panic("%s: Invalid icache parameter %s", group, icache.c_str()); + if (!assignedCaches.count(dcache)) panic("%s: Invalid dcache parameter %s", group, dcache.c_str()); + + for (uint32_t j = 0; j < cores; j++) { + stringstream ss; + ss << group << "-" << j; + g_string name(ss.str().c_str()); + Core* core; + + //Get the caches + CacheGroup& igroup = *cMap[icache]; + CacheGroup& dgroup = *cMap[dcache]; + + if (assignedCaches[icache] >= igroup.size()) { + panic("%s: icache group %s (%ld caches) is fully used, can't connect more cores to it", name.c_str(), icache.c_str(), igroup.size()); + } + FilterCache* ic = dynamic_cast(igroup[assignedCaches[icache]][0]); + assert(ic); + ic->setSourceId(coreIdx); + ic->setFlags(MemReq::IFETCH | MemReq::NOEXCL); + assignedCaches[icache]++; + + if (assignedCaches[dcache] >= 
dgroup.size()) { + panic("%s: dcache group %s (%ld caches) is fully used, can't connect more cores to it", name.c_str(), dcache.c_str(), dgroup.size()); + } + FilterCache* dc = dynamic_cast(dgroup[assignedCaches[dcache]][0]); + assert(dc); + dc->setSourceId(coreIdx); + assignedCaches[dcache]++; + + //Build the core + if (type == "Simple") { + core = new (&simpleCores[j]) SimpleCore(ic, dc, name); + } else if (type == "Timing") { + uint32_t domain = j*zinfo->numDomains/cores; + TimingCore* tcore = new (&timingCores[j]) TimingCore(ic, dc, domain, name); + zinfo->eventRecorders[coreIdx] = tcore->getEventRecorder(); + zinfo->eventRecorders[coreIdx]->setSourceId(coreIdx); + core = tcore; + } else { + assert(type == "OOO"); + OOOCore* ocore = new (&oooCores[j]) OOOCore(ic, dc, name); + zinfo->eventRecorders[coreIdx] = ocore->getEventRecorder(); + zinfo->eventRecorders[coreIdx]->setSourceId(coreIdx); + core = ocore; + } + coreMap[group].push_back(core); + coreIdx++; + } + } else { + assert(type == "Null"); + for (uint32_t j = 0; j < cores; j++) { + stringstream ss; + ss << group << "-" << j; + g_string name(ss.str().c_str()); + Core* core = new (&nullCores[j]) NullCore(name); + coreMap[group].push_back(core); + coreIdx++; + } + } + } + + //Check that all the terminal caches are fully connected + for (const char* grp : cacheGroupNames) { + if (childMap.count(grp) == 0 && assignedCaches[grp] != cMap[grp]->size()) { + panic("%s: Terminal cache group not fully connected, %ld caches, %d assigned", grp, cMap[grp]->size(), assignedCaches[grp]); + } + } + + //Populate global core info + assert(zinfo->numCores == coreIdx); + zinfo->cores = gm_memalign(CACHE_LINE_BYTES, zinfo->numCores); + coreIdx = 0; + for (const char* group : coreGroupNames) for (Core* core : coreMap[group]) zinfo->cores[coreIdx++] = core; + + //Init stats: cores, caches, mem + for (const char* group : coreGroupNames) { + AggregateStat* groupStat = new AggregateStat(true); + groupStat->init(gm_strdup(group), "Core stats"); + for (Core* core : coreMap[group]) core->initStats(groupStat); + zinfo->rootStat->append(groupStat); + } + + for (const char* group : cacheGroupNames) { + AggregateStat* groupStat = new AggregateStat(true); + groupStat->init(gm_strdup(group), "Cache stats"); + for (vector& banks : *cMap[group]) for (BaseCache* bank : banks) bank->initStats(groupStat); + zinfo->rootStat->append(groupStat); + } + + //Initialize event recorders + //for (uint32_t i = 0; i < zinfo->numCores; i++) eventRecorders[i] = new EventRecorder(); + + AggregateStat* memStat = new AggregateStat(true); + memStat->init("mem", "Memory controller stats"); + for (auto mem : mems) mem->initStats(memStat); + zinfo->rootStat->append(memStat); + + //Odds and ends: BuildCacheGroup new'd the cache groups, we need to delete them + for (pair kv : cMap) delete kv.second; + cMap.clear(); + + info("Initialized system"); +} + +static void PreInitStats() { + zinfo->rootStat = new AggregateStat(); + zinfo->rootStat->init("root", "Stats"); +} + +static void PostInitStats(bool perProcessDir, Config& config) { + zinfo->rootStat->makeImmutable(); + zinfo->trigger = 15000; + + string pathStr = zinfo->outputDir; + pathStr += "/"; + + // Absolute paths for stats files. Note these must be in the global heap. 
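+ // (gm_strdup allocates in the shared global segment, so any process that later triggers a stats dump sees valid filename pointers.)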
+ const char* pStatsFile = gm_strdup((pathStr + "zsim.h5").c_str()); + const char* evStatsFile = gm_strdup((pathStr + "zsim-ev.h5").c_str()); + const char* cmpStatsFile = gm_strdup((pathStr + "zsim-cmp.h5").c_str()); + const char* statsFile = gm_strdup((pathStr + "zsim.out").c_str()); + + if (zinfo->statsPhaseInterval) { + const char* periodicStatsFilter = config.get("sim.periodicStatsFilter", ""); + AggregateStat* prStat = (!strlen(periodicStatsFilter))? zinfo->rootStat : FilterStats(zinfo->rootStat, periodicStatsFilter); + if (!prStat) panic("No stats match sim.periodicStatsFilter regex (%s)! Set interval to 0 to avoid periodic stats", periodicStatsFilter); + zinfo->periodicStatsBackend = new HDF5Backend(pStatsFile, prStat, (1 << 20) /* 1MB chunks */, zinfo->skipStatsVectors, zinfo->compactPeriodicStats); + zinfo->periodicStatsBackend->dump(true); //must have a first sample + + class PeriodicStatsDumpEvent : public Event { + public: + explicit PeriodicStatsDumpEvent(uint32_t period) : Event(period) {} + void callback() { + zinfo->trigger = 10000; + zinfo->periodicStatsBackend->dump(true /*buffered*/); + } + }; + + zinfo->eventQueue->insert(new PeriodicStatsDumpEvent(zinfo->statsPhaseInterval)); + + } else { + zinfo->periodicStatsBackend = NULL; + } + + zinfo->eventualStatsBackend = new HDF5Backend(evStatsFile, zinfo->rootStat, (1 << 17) /* 128KB chunks */, zinfo->skipStatsVectors, false /* don't sum regular aggregates*/); + zinfo->eventualStatsBackend->dump(true); //must have a first sample + + if (zinfo->maxMinInstrs) { + warn("maxMinInstrs IS DEPRECATED"); + for (uint32_t i = 0; i < zinfo->numCores; i++) { + auto getInstrs = [i]() { return zinfo->cores[i]->getInstrs(); }; + auto dumpStats = [i]() { + info("Dumping eventual stats for core %d", i); + zinfo->trigger = i; + zinfo->eventualStatsBackend->dump(true /*buffered*/); + }; + zinfo->eventQueue->insert(makeAdaptiveEvent(getInstrs, dumpStats, 0, zinfo->maxMinInstrs, MAX_IPC*zinfo->phaseLength)); + } + } + + zinfo->compactStatsBackend = new HDF5Backend(cmpStatsFile, zinfo->rootStat, 0 /* no aggregation, this is just 1 record */, zinfo->skipStatsVectors, true); //don't dump a first sample. + + zinfo->statsBackend = new TextBackend(statsFile, zinfo->rootStat); +} + +static void InitGlobalStats() { + zinfo->profSimTime = new TimeBreakdownStat(); + const char* stateNames[] = {"init", "bound", "weave", "ff"}; + zinfo->profSimTime->init("time", "Simulator time breakdown", 4, stateNames); + zinfo->rootStat->append(zinfo->profSimTime); + + ProxyStat* triggerStat = new ProxyStat(); + triggerStat->init("trigger", "Reason for this stats dump", &zinfo->trigger); + zinfo->rootStat->append(triggerStat); + + ProxyStat* phaseStat = new ProxyStat(); + phaseStat->init("phase", "Simulated phases", &zinfo->numPhases); + zinfo->rootStat->append(phaseStat); +} + + +void SimInit(const char* configFile, const char* outputDir, uint32_t shmid) { + zinfo = gm_calloc(); + zinfo->outputDir = gm_strdup(outputDir); + + Config config(configFile); + + //Debugging + //NOTE: This should be as early as possible, so that we can attach to the debugger before initialization. + zinfo->attachDebugger = config.get("sim.attachDebugger", false); + zinfo->harnessPid = getppid(); + getLibzsimAddrs(&zinfo->libzsimAddrs); + + if (zinfo->attachDebugger) { + gm_set_secondary_ptr(&zinfo->libzsimAddrs); + notifyHarnessForDebugger(zinfo->harnessPid); + } + + PreInitStats(); + + //Get the number of cores + //TODO: There is some duplication with the core creation code. 
This should be fixed eventually. + uint32_t numCores = 0; + vector groups; + config.subgroups("sys.cores", groups); + for (const char* group : groups) { + uint32_t cores = config.get(string("sys.cores.") + group + ".cores", 1); + numCores += cores; + } + + if (numCores == 0) panic("Config must define some core classes in sys.cores; sys.numCores is deprecated"); + zinfo->numCores = numCores; + assert(numCores <= MAX_THREADS); //TODO: Is there any reason for this limit? + + zinfo->numDomains = config.get("sim.domains", 1); + uint32_t numSimThreads = config.get("sim.contentionThreads", MAX((uint32_t)1, zinfo->numDomains/2)); //gives a bit of parallelism, TODO tune + zinfo->contentionSim = new ContentionSim(zinfo->numDomains, numSimThreads); + zinfo->contentionSim->initStats(zinfo->rootStat); + zinfo->eventRecorders = gm_calloc(numCores); + + // Global simulation values + zinfo->numPhases = 0; + + zinfo->phaseLength = config.get("sim.phaseLength", 10000); + zinfo->statsPhaseInterval = config.get("sim.statsPhaseInterval", 100); + zinfo->freqMHz = config.get("sys.frequency", 2000); + + //Maxima/termination conditions + zinfo->maxPhases = config.get("sim.maxPhases", 0); + zinfo->maxMinInstrs = config.get("sim.maxMinInstrs", 0); + zinfo->maxTotalInstrs = config.get("sim.maxTotalInstrs", 0); + + uint64_t maxSimTime = config.get("sim.maxSimTime", 0); + zinfo->maxSimTimeNs = maxSimTime*1000L*1000L*1000L; + + zinfo->maxProcEventualDumps = config.get("sim.maxProcEventualDumps", 0); + zinfo->procEventualDumps = 0; + + zinfo->skipStatsVectors = config.get("sim.skipStatsVectors", false); + zinfo->compactPeriodicStats = config.get("sim.compactPeriodicStats", false); + + //Fast-forwarding and magic ops + zinfo->ignoreHooks = config.get("sim.ignoreHooks", false); + zinfo->ffReinstrument = config.get("sim.ffReinstrument", false); + if (zinfo->ffReinstrument) warn("sim.ffReinstrument = true, switching fast-forwarding on a multi-threaded process may be unstable"); + + zinfo->registerThreads = config.get("sim.registerThreads", false); + zinfo->globalPauseFlag = config.get("sim.startInGlobalPause", false); + + zinfo->eventQueue = new EventQueue(); //must be instantiated before the memory hierarchy + + //Build the scheduler + uint32_t parallelism = config.get("sim.parallelism", 2*sysconf(_SC_NPROCESSORS_ONLN)); + if (parallelism < zinfo->numCores) info("Limiting concurrent threads to %d", parallelism); + assert(parallelism > 0); //jeez... 
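+ // (For example, with the default sim.parallelism = 2x the host's hardware threads, a 16-thread host runs at most 32 simulated threads concurrently; the scheduler built below multiplexes any additional threads onto that budget.)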
+ + uint32_t schedQuantum = config.get("sim.schedQuantum", 10000); //phases + zinfo->sched = new Scheduler(EndOfPhaseActions, parallelism, zinfo->numCores, schedQuantum); + + zinfo->blockingSyscalls = config.get("sim.blockingSyscalls", false); + + if (zinfo->blockingSyscalls) { + warn("sim.blockingSyscalls = True, will likely deadlock with multi-threaded apps!"); + } + + InitGlobalStats(); + + //Core stats (initialized here for cosmetic reasons, to be above cache stats) + AggregateStat* allCoreStats = new AggregateStat(false); + allCoreStats->init("core", "Core stats"); + zinfo->rootStat->append(allCoreStats); + + //Process tree needs this initialized, even though it is part of the memory hierarchy + zinfo->lineSize = config.get("sys.lineSize", 64); + assert(zinfo->lineSize > 0); + + //Port virtualization + for (uint32_t i = 0; i < MAX_PORT_DOMAINS; i++) zinfo->portVirt[i] = new PortVirtualizer(); + + //Process hierarchy + //NOTE: Due to partitioning, must be done before initializing memory hierarchy + CreateProcessTree(config); + zinfo->procArray[0]->notifyStart(); //called here so that we can detect end-before-start races + + zinfo->pinCmd = new PinCmd(&config, NULL /*don't pass config file to children --- can go either way, it's optional*/, outputDir, shmid); + + //Caches, cores, memory controllers + InitSystem(config); + + //Sched stats (deferred because of circular deps) + zinfo->sched->initStats(zinfo->rootStat); + + zinfo->processStats = new ProcessStats(zinfo->rootStat); + + //It's a global stat, but I want it to be last... + zinfo->profHeartbeats = new VectorCounter(); + zinfo->profHeartbeats->init("heartbeats", "Per-process heartbeats", zinfo->lineSize); + zinfo->rootStat->append(zinfo->profHeartbeats); + + bool perProcessDir = config.get("sim.perProcessDir", false); + PostInitStats(perProcessDir, config); + + zinfo->perProcessCpuEnum = config.get("sim.perProcessCpuEnum", false); + + //Odds and ends + bool printMemoryStats = config.get("sim.printMemoryStats", false); + if (printMemoryStats) { + gm_stats(); + } + + //HACK: Read all variables that are read in the harness but not in init + //This avoids warnings on those elements + config.get("sim.gmMBytes", (1 << 10)); + if (!zinfo->attachDebugger) config.get("sim.deadlockDetection", true); + config.get("sim.aslr", false); + + //Write config out + bool strictConfig = config.get("sim.strictConfig", true); //if true, panic on unused variables + config.writeAndClose((string(zinfo->outputDir) + "/out.cfg").c_str(), strictConfig); + + zinfo->contentionSim->postInit(); + + info("Initialization complete"); + + //Causes every other process to wake up + gm_set_glob_ptr(zinfo); +} + diff --git a/src/init.h b/src/init.h new file mode 100644 index 00000000..98dac8de --- /dev/null +++ b/src/init.h @@ -0,0 +1,34 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef INIT_H_ +#define INIT_H_ + +#include + +/* Read configuration options, configure system */ +void SimInit(const char* configFile, const char* outputDir, uint32_t shmid); + +#endif // INIT_H_ diff --git a/src/intrusive_list.h b/src/intrusive_list.h new file mode 100644 index 00000000..4cf9a693 --- /dev/null +++ b/src/intrusive_list.h @@ -0,0 +1,178 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef INTRUSIVE_LIST_H_ +#define INTRUSIVE_LIST_H_ + +/* Intrusive doubly-linked list -- simple enough to not include boost, + * but might want to switch at some point + */ + +#include "log.h" + +template +class InList; + +template +struct InListNode { + T* next; + T* prev; + InList* owner; + + InListNode() { + next = NULL; + prev = NULL; + owner = NULL; + } + + void unlink(InList* lst) { + if (next) next->prev = prev; + if (prev) prev->next = next; + next = NULL; + prev = NULL; + assert(lst == owner); + owner = NULL; + } + + void linkPrev(T* p, InList* lst) { + assert(p); + assert(owner == NULL); + assert(prev == NULL && next == NULL); + if (p->next) { + assert(p->next->prev == p); + p->next->prev = static_cast(this); + next = p->next; + } + p->next = static_cast(this); + prev = p; + owner = lst; + } +}; + +template +class InList { + private: + T* head; + T* tail; + size_t elems; + + public: + InList() : head(NULL), tail(NULL), elems(0) {} + bool empty() const {return !head;} + + T* front() const {return head;} + T* back() const {return tail;} + + void push_front(T* e) { + assert(e && e->next == NULL && e->prev == NULL && e->owner == NULL); + if (empty()) { + head = e; + tail = e; + } else { + assert(head && head->prev == NULL && head->owner == this); + e->next = head; + head->prev = e; + head = e; + } + e->owner = this; + elems++; + } + + void push_back(T* e) { + assert(e && e->next == NULL && e->prev == NULL && e->owner == NULL); + if (empty()) { + head = e; + tail = e; + e->owner = this; + } else { + assert(tail); + e->linkPrev(tail, this); + tail = e; + } + elems++; + } + + void pop_front() { + if (empty()) return; + T* e = head; + head = e->next; + e->unlink(this); + if (!head) tail = NULL; + elems--; + } + + void pop_back() { + if (empty()) return; + T* e = tail; + tail = e->prev; + e->unlink(this); + if (!tail) head = NULL; + elems--; + } + + //Note how remove is O(1) + void remove(T* e) { + //info("Remove PRE h=%p t=%p e=%p", head, tail, e); + if (e == head) head = e->next; + if (e == tail) tail = e->prev; + e->unlink(this); + elems--; + //info("Remove POST h=%p t=%p e=%p", head, tail); + } + + void insertAfter(T* prev, T* e) { + assert(e && e->owner == NULL); + assert(prev && prev->owner == this); + e->linkPrev(prev, this); + if (prev == tail) tail = e; + elems++; + } + + size_t size() const { + return elems; + } + +#if 0 // Verify all internal state; call to test list implementation + void verify() { + if (empty()) { + assert(head == NULL && tail == NULL && elems == 0); + } else { + T* c = head; + size_t count = 0; + while (c) { + if (c->next) assert(c->next->prev); + if (!c->next) assert(c == tail); + assert(c->owner == this); + count++; + c = c->next; + } + assert(count == elems); + } + } +#endif +}; + +#endif // INTRUSIVE_LIST_H_ + diff --git a/src/locks.h b/src/locks.h new file mode 100644 index 00000000..98497294 --- /dev/null +++ b/src/locks.h @@ -0,0 +1,221 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +/* dsm: An attempt at having locks that don't suck */ + +#ifndef LOCKS_H_ +#define LOCKS_H_ + +#include +#include +#include +#include + +#ifdef WITH_MWAIT //careful with this define; most kernels don't allow mwait in userspace +#include // NOLINT +#else +#include // NOLINT +#endif + +#include "log.h" + +typedef volatile uint32_t lock_t; + +/* SPINLOCK: A simple T&T&S spinlock. Lock can use monitor/mwait */ + +static inline void spin_init(volatile uint32_t* lock) { + *lock = 0; + __sync_synchronize(); +} + +static inline void spin_destroy(volatile uint32_t* lock) {} + + +static inline void spin_lock(volatile uint32_t* lock) { + while (1) { + if ((*lock) == 0 /*test (read)*/ && __sync_bool_compare_and_swap(lock, 0, 1) /*test&set*/) { + break; + } + + // At this point, we have the line in S/E/O, or M if we have tried the test&set and failed. +#if WITH_MWAIT + //Monitor / mwait + _mm_monitor((const void*)lock, 0, 0); + + //Must test again, might have intervening write BEFORE monitor (so we would get stuck in mwait) + if (*lock) { + _mm_mwait(0, 0); + } +#else + //If we don't have mwait, we can at least pause + _mm_pause(); +#endif + } +} + +static inline int spin_trylock(volatile uint32_t* lock) { + return !((*lock) == 0 /*T*/ && __sync_bool_compare_and_swap(lock, 0, 1) /*T&S*/); +} + + +static inline void spin_unlock(volatile uint32_t* lock) { + assert(*lock == 1); //should own lock if we're unlocking... + *lock = 0; + __sync_synchronize(); +} + +/* TICKET LOCK: Provides FIFO ordering for fairness. + * WARNING: Will not work with more than 64K threads + */ + +#define TICKET_MASK ((1<<16) - 1) + +static inline void ticket_init(volatile uint32_t* lock) { + *lock = 0; + __sync_synchronize(); +} + +static inline void ticket_destroy(volatile uint32_t* lock) {} + +static inline void ticket_lock(volatile uint32_t* lock) { + /* Technically, we want to do this, but I'm guessing the 64-bit + * datapath is not very well optimized for 16-bit xadd... 
+ * volatile uint16_t* low = ((volatile uint16_t*) lock) + 1; + * uint32_t ticket = atomic_fetchadd_16(low, 1); + */ + uint32_t val, hi, newLo; + while (true) { + val = *lock; + hi = val & (TICKET_MASK << 16); + newLo = (val + 1) & TICKET_MASK; + if (__sync_bool_compare_and_swap(lock, val, (hi | newLo))) break; + } + + uint32_t ticket = val & TICKET_MASK; + + while ((((*lock) >> 16) & TICKET_MASK) != ticket) { +#if WITH_MWAIT + //Monitor / mwait + _mm_monitor((const void*)lock, 0, 0); + + //Must test again, might have intervening write BEFORE monitor (so we would get stuck in mwait) + if (*lock) { + _mm_mwait(0, 0); + } +#else + //If we don't have mwait, we can at least pause + _mm_pause(); +#endif + } +} + +static inline int ticket_trylock(volatile uint32_t* lock) { + uint32_t val = *lock; + uint32_t hi = (val >> 16) & TICKET_MASK; + uint32_t lo = val & TICKET_MASK; + uint32_t newLo = (lo + 1) & TICKET_MASK; + return (hi == lo /*This is up for grabs*/ && __sync_bool_compare_and_swap(lock, val, ((hi << 16) | newLo)) /*T&S*/); +} + + +static inline void ticket_unlock(volatile uint32_t* lock) { + __sync_fetch_and_add(lock, 1<<16); +} + + +static inline void futex_init(volatile uint32_t* lock) { + spin_init(lock); +} + +/* NOTE: The current implementation of this lock is quite unfair. Not that we care for its current use. */ +static inline void futex_lock(volatile uint32_t* lock) { + uint32_t c; + do { + for (int i = 0; i < 1000; i++) { //this should be tuned to balance syscall/context-switch and user-level spinning costs + if (*lock == 0 && __sync_bool_compare_and_swap(lock, 0, 1)) { + return; + } + _mm_pause(); + } + + //At this point, we will block + c = __sync_lock_test_and_set(lock, 2); //this is not exactly T&S, but atomic exchange; see GCC docs + if (c == 0) return; + syscall(SYS_futex, lock, FUTEX_WAIT, 2, NULL, NULL, 0); + c = __sync_lock_test_and_set(lock, 2); //atomic exchange + } while (c != 0); +} + +static inline void futex_lock_nospin(volatile uint32_t* lock) { + uint32_t c; + do { + if (*lock == 0 && __sync_bool_compare_and_swap(lock, 0, 1)) { + return; + } + + //At this point, we will block + c = __sync_lock_test_and_set(lock, 2); //this is not exactly T&S, but atomic exchange; see GCC docs + if (c == 0) return; + syscall(SYS_futex, lock, FUTEX_WAIT, 2, NULL, NULL, 0); + c = __sync_lock_test_and_set(lock, 2); //atomic exchange + } while (c != 0); +} + +#define BILLION (1000000000L) +static inline bool futex_trylock_nospin_timeout(volatile uint32_t* lock, uint64_t timeoutNs) { + if (*lock == 0 && __sync_bool_compare_and_swap(lock, 0, 1)) { + return true; + } + + //At this point, we will block + uint32_t c = __sync_lock_test_and_set(lock, 2); //this is not exactly T&S, but atomic exchange; see GCC docs + if (c == 0) return true; + const struct timespec timeout = {(time_t) timeoutNs/BILLION, (time_t) timeoutNs % BILLION}; + syscall(SYS_futex, lock, FUTEX_WAIT, 2, &timeout, NULL, 0); + c = __sync_lock_test_and_set(lock, 2); //atomic exchange + if (c == 0) return true; + return false; +} + +static inline void futex_unlock(volatile uint32_t* lock) { + if (__sync_fetch_and_add(lock, -1) != 1) { + *lock = 0; + /* This may result in additional wakeups, but avoids completely starving processes that are + * sleeping on this. Still, if there is lots of contention in userland, this doesn't work + * that well. But I don't care that much, as this only happens between phase locks. 
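 *
 * (Clarifying note, not in the original comment: the lock word follows the
 * usual three-state futex protocol -- 0 = free, 1 = held with no sleepers,
 * 2 = held with possible sleepers. futex_lock() switches the word to 2 before
 * calling FUTEX_WAIT, so this unlock path only needs the FUTEX_WAKE syscall
 * when the pre-decrement value was not 1.)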
+ */ + syscall(SYS_futex, lock, FUTEX_WAKE, 1 /*wake next*/, NULL, NULL, 0); + } +} + +// Returns true if this futex has *detectable waiters*, i.e., waiters in the kernel +// There may still be waiters spinning, but if you (a) acquire the lock, and (b) want +// to see if someone is queued behind you, this will eventually return true +// No false positives (if true, for sure there's someone) +static inline bool futex_haswaiters(volatile uint32_t* lock) { + return *lock == 2; +} + +#endif // LOCKS_H_ diff --git a/src/log.cpp b/src/log.cpp new file mode 100644 index 00000000..555f460b --- /dev/null +++ b/src/log.cpp @@ -0,0 +1,59 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "log.h" +#include +#include +#include "locks.h" + +const char* logHeader = ""; + +const char* logTypeNames[] = {"Harness", "Config", "Process", "Cache", "Mem", "Sched", "FSVirt", "TimeVirt"}; + +FILE* logFdOut = stdout; +FILE* logFdErr = stderr; + +static lock_t log_printLock; + + +void InitLog(const char* header, const char* file) { + logHeader = strdup(header); + futex_init(&log_printLock); + + if (file) { + FILE* fd = fopen(file, "a"); + if (fd == NULL) { + perror("fopen() failed"); + panic("Could not open logfile %s", file); //we can panic in InitLog (will dump to stderr) + } + logFdOut = fd; + logFdErr = fd; + //NOTE: We technically never close this fd, but always flush it + } +} + +void __log_lock() {futex_lock(&log_printLock);} +void __log_unlock() {futex_unlock(&log_printLock);} + diff --git a/src/log.h b/src/log.h new file mode 100644 index 00000000..7c488898 --- /dev/null +++ b/src/log.h @@ -0,0 +1,156 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
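// Illustrative usage sketch (not part of the original zsim sources) for the
// logging interface implemented in log.cpp above and declared in log.h below.
// initWorker and its arguments are hypothetical; info/warn/panic/assert_msg
// are printf-style macros, and panic() terminates the process.
#include <cstdio>
#include "log.h"

static void initWorker(int procIdx, const char* logFile) {
    char header[64];
    snprintf(header, sizeof(header), "[worker %d] ", procIdx);
    InitLog(header, logFile);  // logFile == NULL keeps logging on stdout/stderr
    info("Worker %d starting", procIdx);
    if (procIdx < 0) panic("Invalid process index %d", procIdx);
    assert_msg(procIdx < 1024, "Too many workers (%d)", procIdx);
}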
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +/* General logging/info/warn/panic routines */ + +#ifndef LOG_H_ +#define LOG_H_ + +#include +#include + +void __log_lock(); +void __log_unlock(); + +#ifdef MT_SAFE_LOG +#define log_lock() __log_lock() +#define log_unlock() __log_unlock() +#else +#define log_lock() +#define log_unlock() +#endif + +#define PANIC_EXIT_CODE (112) + +// assertions are often frequently executed but never inlined. Might as well tell the compiler about it +#define likely(x) __builtin_expect((x), 1) +#define unlikely(x) __builtin_expect((x), 0) + +typedef enum { + LOG_Harness, + LOG_Config, + LOG_Process, + LOG_Cache, + LOG_Mem, + LOG_Sched, + LOG_FSVirt, + LOG_TimeVirt, +} LogType; + +// defined in log.cpp +extern const char* logTypeNames[]; +extern const char* logHeader; +extern FILE* logFdOut; +extern FILE* logFdErr; + +/* Set per-process header for log/info/warn/panic messages + * Calling this is not needed (the default header is ""), + * but it helps in multi-process runs + * If file is NULL or InitLog is not called, logs to stdout/stderr + */ +void InitLog(const char* header, const char* file = NULL); + +#define panic(args...) \ +{ \ + fprintf(logFdErr, "%sPanic on %s:%d: ", logHeader, __FILE__, __LINE__); \ + fprintf(logFdErr, args); \ + fprintf(logFdErr, "\n"); \ + fflush(logFdErr); \ + /**reinterpret_cast(0L) = 42;*/ /*SIGSEGVs*/ \ + exit(PANIC_EXIT_CODE); \ +} + +#define warn(args...) \ +{ \ + log_lock(); \ + fprintf(logFdErr, "%sWARN: ", logHeader); \ + fprintf(logFdErr, args); \ + fprintf(logFdErr, "\n"); \ + fflush(logFdErr); \ + log_unlock(); \ +} + +#define info(args...) \ +{ \ + log_lock(); \ + fprintf(logFdOut, "%s", logHeader); \ + fprintf(logFdOut, args); \ + fprintf(logFdOut, "\n"); \ + fflush(logFdOut); \ + log_unlock(); \ +} + +/* I would call these macros log, but there's this useless math function + * that happens to conflict with this... + */ +/* FIXME: Better conditional tracing (e.g., via mask) */ +#ifdef _LOG_TRACE_ +#define trace(type, args...) \ +{ \ + if ( LOG_##type == LOG_Sched) { \ + log_lock(); \ + fprintf(logFdErr, "%sLOG(%s): ", logHeader, logTypeNames[(int) LOG_##type]); \ + fprintf(logFdErr, args); \ + fprintf(logFdErr, "\n"); \ + fflush(logFdErr); \ + log_unlock(); \ + } \ +} +#else +#define trace(type, args...) +#endif + + +#ifndef NASSERT +#define assert(cond) \ +if (unlikely(!(cond))) { \ + fprintf(logFdErr, "%sFailed assertion on %s:%d\n", logHeader, __FILE__, __LINE__); \ + fflush(logFdErr); \ + *reinterpret_cast(0L) = 42; /*SIGSEGVs*/ \ + exit(1); \ +}; + +#define assert_msg(cond, args...) \ +if (unlikely(!(cond))) { \ + fprintf(logFdErr, "%sFailed assertion on %s:%d: ", logHeader, __FILE__, __LINE__); \ + fprintf(logFdErr, args); \ + fprintf(logFdErr, "\n"); \ + fflush(logFdErr); \ + *reinterpret_cast(0L) = 42; /*SIGSEGVs*/ \ + exit(1); \ +}; +#else +// Avoid unused warnings, never emit any code +// see http://cnicholson.net/2009/02/stupid-c-tricks-adventures-in-assert/ +#define assert(cond) do { (void)sizeof(cond); } while (0); +#define assert_msg(cond, args...) 
do { (void)sizeof(cond); } while (0); +#endif + +#define checkpoint() \ + do { \ + info("%s:%d %s", __FILE__, __LINE__, __FUNCTION__); \ + } while (0) + +#endif // LOG_H_ diff --git a/src/lookahead.cpp b/src/lookahead.cpp new file mode 100644 index 00000000..ac93524c --- /dev/null +++ b/src/lookahead.cpp @@ -0,0 +1,164 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include +#include +#include "part_repl_policies.h" +#include "partitioner.h" + +using std::tuple; +using std::tie; +using std::make_tuple; + +// generic lookahead algorithm +namespace lookahead { + +static tuple getMaxMarginalUtility( + uint32_t numPartitions, uint32_t part, uint32_t partAlloc, + uint32_t balance, const PartitionMonitor& monitor) { + double maxMu = -1.0; + uint32_t maxMuAlloc = 0; + for (uint32_t i = 1; i <= balance; i++) { + //Use this when utility == misses + uint64_t extraHits = monitor.get(part, partAlloc) - monitor.get(part, partAlloc+i); + double mu = ((double)extraHits)/((double)i); + + if (mu > maxMu) { + maxMu = mu; + maxMuAlloc = i; + } + } + return make_tuple(maxMu, maxMuAlloc); +} + +//Utility is defined as misses saved over not having a cache +uint64_t computePartitioningTotalUtility( + uint32_t numPartitions, const uint32_t* parts, const PartitionMonitor& monitor) { + uint64_t noCacheMisses = 0; + uint64_t curPartMisses = 0; + for (uint32_t p = 0; p < numPartitions; p++) { + noCacheMisses += monitor.get(p, 0); + curPartMisses += monitor.get(p, parts[p]); + } + return noCacheMisses - curPartMisses; +} + +void computeBestPartitioning( + uint32_t numPartitions, uint32_t buckets, uint32_t minAlloc, bool* forbidden, + uint32_t* allocs, const PartitionMonitor& monitor) { + uint32_t balance = buckets; + + // Zero out allocs or set to mins + for (uint32_t i = 0; i < numPartitions; i++) { + allocs[i] = minAlloc; + } + + balance -= minAlloc; + + uint32_t iter = 0; // purely for debug purposes + while (balance > 0) { + double maxMu = -1.0; + uint32_t maxMuPart = numPartitions; // illegal + uint32_t maxMuAlloc = 0; + (void)maxMuAlloc; // make gcc happy when we're not profiling + for (uint32_t i = 0; i < numPartitions; i++) { + if (forbidden && forbidden[i]) { // this partition doesn't get anything + //info("Allocating to %d forbiddden, skipping", i); + continue; + } + + uint32_t muAlloc; + double mu; + tie(mu, muAlloc) = getMaxMarginalUtility(numPartitions, i, allocs[i], balance, monitor); + if (mu > maxMu) { + maxMu = mu; + maxMuPart = i; + maxMuAlloc = muAlloc; + } 
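                // Worked example (illustrative, not in the original source):
                // with balance = 4, allocs[i] = 0, and a miss curve of
                // monitor.get(i, a) = {100, 60, 45, 40, 38} for a = 0..4,
                // getMaxMarginalUtility() scores each extra-bucket count k as
                //   k=1: (100-60)/1 = 40.0    k=2: (100-45)/2 = 27.5
                //   k=3: (100-40)/3 = 20.0    k=4: (100-38)/4 = 15.5
                // and returns (40.0, 1): one extra bucket saves the most misses
                // per bucket, so the greedy loop grows this partition by one
                // bucket and then re-scores every partition.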
+ } +#if UMON_INFO + //info("LookaheadPartitioner: Iteration %d maxMu %f partition %d alloc %d newAlloc %d remaining %d", iter, maxMu, maxMuPart, maxMuAlloc, allocs[maxMuPart] + maxMuAlloc, balance - maxMuAlloc); +#endif + assert(maxMuPart < numPartitions); + allocs[maxMuPart] += maxMuAlloc; + balance -= maxMuAlloc; + iter++; + } +} + +} // namespace lookahead + +// LookaheadPartitioner + +LookaheadPartitioner::LookaheadPartitioner(PartReplPolicy* _repl, uint32_t _numPartitions, uint32_t _buckets, + uint32_t _minAlloc, double _allocPortion, bool* _forbidden) + : Partitioner(_minAlloc, _allocPortion, _forbidden) + , repl(_repl) + , numPartitions(_numPartitions) + , buckets(_buckets) { + assert_msg(buckets > 0, "Must have non-zero buckets to avoid divide-by-zero exception."); + + curAllocs = gm_calloc(buckets + 1); + + info("LookaheadPartitioner: %d part buckets", buckets); +} + +//allocs are in buckets +void LookaheadPartitioner::partition() { + auto& monitor = *repl->getMonitor(); + + uint32_t bestAllocs[numPartitions]; + lookahead::computeBestPartitioning( + numPartitions, allocPortion*buckets, minAlloc*numPartitions, + forbidden, bestAllocs, monitor); + + uint64_t newUtility = lookahead::computePartitioningTotalUtility( + numPartitions, bestAllocs, monitor); + uint64_t curUtility = lookahead::computePartitioningTotalUtility( + numPartitions, curAllocs, monitor); + + bool switchAllocs = newUtility > 102*curUtility/100; //must be 2% better + if (curUtility == 0) switchAllocs = true; //always switch on start (this happens when we only have recorded misses) + switchAllocs = true; //FIXME + + if (switchAllocs) { +#if UMON_INFO + info("LookaheadPartitioner: Switching allocation, new util %ld, old util %ld", newUtility, curUtility); +#endif + std::copy(bestAllocs, bestAllocs+numPartitions, curAllocs); + } else { +#if UMON_INFO + info("LookaheadPartitioner: KEEPING allocation, new util %ld, old util %ld", newUtility, curUtility); +#endif + } + +#if UMON_INFO + info("LookaheadPartitioner: Partitioning done,"); + for (uint32_t i = 0; i < numPartitions; i++) info("buckets[%d] = %d", i, curAllocs[i]); +#endif + + repl->setPartitionSizes(curAllocs); + repl->getMonitor()->reset(); +} diff --git a/src/mem_ctrls.cpp b/src/mem_ctrls.cpp new file mode 100644 index 00000000..c6a1e157 --- /dev/null +++ b/src/mem_ctrls.cpp @@ -0,0 +1,147 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +//#include "timing_event.h" +//#include "event_recorder.h" +#include "mem_ctrls.h" +#include "zsim.h" + +uint64_t SimpleMemory::access(MemReq& req) { + switch (req.type) { + case PUTS: + case PUTX: + *req.state = I; + break; + case GETS: + *req.state = req.is(MemReq::NOEXCL)? S : E; + break; + case GETX: + *req.state = M; + break; + + default: panic("!?"); + } + + uint64_t respCycle = req.cycle + latency; + assert(respCycle > req.cycle); +/* + if ((req.type == GETS || req.type == GETX) && eventRecorders[req.srcId]) { + Address addr = req.lineAddr<alloc()) MemAccReqEvent(NULL, false, addr); + TimingRecord tr = {addr, req.cycle, respCycle, req.type, memEv, memEv}; + eventRecorders[req.srcId]->pushRecord(tr); + } +*/ + return respCycle; +} + + + + +MD1Memory::MD1Memory(uint32_t requestSize, uint32_t megacyclesPerSecond, uint32_t megabytesPerSecond, uint32_t _zeroLoadLatency, g_string& _name) + : zeroLoadLatency(_zeroLoadLatency), name(_name) +{ + lastPhase = 0; + + double bytesPerCycle = ((double)megabytesPerSecond)/((double)megacyclesPerSecond); + maxRequestsPerCycle = bytesPerCycle/requestSize; + assert(maxRequestsPerCycle > 0.0); + + zeroLoadLatency = _zeroLoadLatency; + + smoothedPhaseAccesses = 0.0; + curPhaseAccesses = 0; + curLatency = zeroLoadLatency; + + futex_init(&updateLock); +} + +void MD1Memory::updateLatency() { + uint32_t phaseCycles = (zinfo->numPhases - lastPhase)*(zinfo->phaseLength); + if (phaseCycles < 10000) return; //Skip with short phases + + smoothedPhaseAccesses = (curPhaseAccesses*0.5) + (smoothedPhaseAccesses*0.5); + double requestsPerCycle = smoothedPhaseAccesses/((double)phaseCycles); + double load = requestsPerCycle/maxRequestsPerCycle; + + //Clamp load + if (load > 0.95) { + //warn("MC: Load exceeds limit, %f, clamping, curPhaseAccesses %d, smoothed %f, phase %ld", load, curPhaseAccesses, smoothedPhaseAccesses, zinfo->numPhases); + load = 0.95; + profClampedLoads.inc(); + } + + double latMultiplier = 1.0 + 0.5*load/(1.0 - load); //See Pollancek-Khinchine formula + curLatency = (uint32_t)(latMultiplier*zeroLoadLatency); + + //info("%s: Load %.2f, latency multiplier %.2f, latency %d", name.c_str(), load, latMultiplier, curLatency); + uint32_t intLoad = (uint32_t)(load*100.0); + profLoad.inc(intLoad); + profUpdates.inc(); + + curPhaseAccesses = 0; + __sync_synchronize(); + lastPhase = zinfo->numPhases; +} + +uint64_t MD1Memory::access(MemReq& req) { + if (zinfo->numPhases > lastPhase) { + futex_lock(&updateLock); + //Recheck, someone may have updated already + if (zinfo->numPhases > lastPhase) { + updateLatency(); + } + futex_unlock(&updateLock); + } + + switch (req.type) { + case PUTX: + //Dirty wback + profWrites.atomicInc(); + profTotalWrLat.atomicInc(curLatency); + __sync_fetch_and_add(&curPhaseAccesses, 1); + //Note no break + case PUTS: + //Not a real access -- memory must treat clean wbacks as if they never happened. + *req.state = I; + break; + case GETS: + profReads.atomicInc(); + profTotalRdLat.atomicInc(curLatency); + __sync_fetch_and_add(&curPhaseAccesses, 1); + *req.state = req.is(MemReq::NOEXCL)? S : E; + break; + case GETX: + profReads.atomicInc(); + profTotalRdLat.atomicInc(curLatency); + __sync_fetch_and_add(&curPhaseAccesses, 1); + *req.state = M; + break; + + default: panic("!?"); + } + return req.cycle + ((req.type == PUTS)? 
0 /*PUTS is not a real access*/ : curLatency); +} + diff --git a/src/mem_ctrls.h b/src/mem_ctrls.h new file mode 100644 index 00000000..4009f020 --- /dev/null +++ b/src/mem_ctrls.h @@ -0,0 +1,100 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef MEM_CTRLS_H_ +#define MEM_CTRLS_H_ + +#include "g_std/g_string.h" +#include "memory_hierarchy.h" +#include "pad.h" +#include "stats.h" + +/* Simple memory (or memory bank), has a fixed latency */ +class SimpleMemory : public MemObject { + private: + g_string name; + uint32_t latency; + + public: + uint64_t access(MemReq& req); + + const char* getName() {return name.c_str();} + + SimpleMemory(uint32_t _latency, g_string& _name) : name(_name), latency(_latency) {} +}; + + +/* Implements a memory controller with limited bandwidth, throttling latency + * using an M/D/1 queueing model. 
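 *
 * (Worked note, illustrative and not in the original comment: for an M/D/1
 * queue at utilization rho = load, the Pollaczek-Khinchine result gives a mean
 * wait of rho/(2*(1-rho)) service times, so updateLatency() scales the
 * zero-load latency by 1 + 0.5*rho/(1-rho) -- about 1.5x at 50% load and
 * roughly 10.5x at the 95% load clamp.)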
+ */ +class MD1Memory : public MemObject { + private: + uint64_t lastPhase; + double maxRequestsPerCycle; + double smoothedPhaseAccesses; + uint32_t zeroLoadLatency; + uint32_t curLatency; + + PAD(); + + Counter profReads; + Counter profWrites; + Counter profTotalRdLat; + Counter profTotalWrLat; + Counter profLoad; + Counter profUpdates; + Counter profClampedLoads; + uint32_t curPhaseAccesses; + + g_string name; //barely used + lock_t updateLock; + PAD(); + + public: + MD1Memory(uint32_t lineSize, uint32_t megacyclesPerSecond, uint32_t megabytesPerSecond, uint32_t _zeroLoadLatency, g_string& _name); + + void initStats(AggregateStat* parentStat) { + AggregateStat* memStats = new AggregateStat(); + memStats->init(name.c_str(), "Memory controller stats"); + profReads.init("rd", "Read requests"); memStats->append(&profReads); + profWrites.init("wr", "Write requests"); memStats->append(&profWrites); + profTotalRdLat.init("rdlat", "Total latency experienced by read requests"); memStats->append(&profTotalRdLat); + profTotalWrLat.init("wrlat", "Total latency experienced by write requests"); memStats->append(&profTotalWrLat); + profLoad.init("load", "Sum of load factors (0-100) per update"); memStats->append(&profLoad); + profUpdates.init("ups", "Number of latency updates"); memStats->append(&profUpdates); + profClampedLoads.init("clampedLoads", "Number of updates where the load was clamped to 95%"); memStats->append(&profClampedLoads); + parentStat->append(memStats); + } + + //uint32_t access(Address lineAddr, AccessType type, uint32_t childId, MESIState* state /*both input and output*/, MESIState initialState, lock_t* childLock); + uint64_t access(MemReq& req); + + const char* getName() {return name.c_str();} + + private: + void updateLatency(); +}; + +#endif // MEM_CTRLS_H_ diff --git a/src/memory_hierarchy.cpp b/src/memory_hierarchy.cpp new file mode 100644 index 00000000..740e66c4 --- /dev/null +++ b/src/memory_hierarchy.cpp @@ -0,0 +1,52 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#include "memory_hierarchy.h" + +static const char* accessTypeNames[] = {"GETS", "GETX", "PUTS", "PUTX"}; +static const char* invTypeNames[] = {"INV", "INVX"}; +static const char* mesiStateNames[] = {"I", "S", "E", "M"}; + +const char* AccessTypeName(AccessType t) { + assert_msg(t >= 0 && (size_t)t < sizeof(accessTypeNames)/sizeof(const char*), "AccessTypeName got an out-of-range input, %d", t); + return accessTypeNames[t]; +} + +const char* InvTypeName(InvType t) { + assert_msg(t >= 0 && (size_t)t < sizeof(invTypeNames)/sizeof(const char*), "InvTypeName got an out-of-range input, %d", t); + return invTypeNames[t]; +} + +const char* MESIStateName(MESIState s) { + assert_msg(s >= 0 && (size_t)s < sizeof(mesiStateNames)/sizeof(const char*), "MESIStateName got an out-of-range input, %d", s); + return mesiStateNames[s]; +} + +#include + +static inline void CompileTimeAsserts() { + static_assert(std::is_pod::value, "MemReq not POD!"); +} + diff --git a/src/memory_hierarchy.h b/src/memory_hierarchy.h new file mode 100644 index 00000000..15a6711b --- /dev/null +++ b/src/memory_hierarchy.h @@ -0,0 +1,125 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef MEMORY_HIERARCHY_H_ +#define MEMORY_HIERARCHY_H_ + +/* Type and interface definitions of memory hierarchy objects */ + +#include +#include "g_std/g_vector.h" +#include "galloc.h" +#include "locks.h" + +/** TYPES **/ + +/* Addresses are plain 64-bit uints. This should be kept compatible with PIN addrints */ +typedef uint64_t Address; + +/* Types of Access. An Access is a request that proceeds from lower to upper + * levels of the hierarchy (core->l1->l2, etc.) + */ +typedef enum { + GETS, // get line, exclusive permission not needed (triggered by a processor load) + GETX, // get line, exclusive permission needed (triggered by a processor store o atomic access) + PUTS, // clean writeback (lower cache is evicting this line, line was not modified) + PUTX // dirty writeback (lower cache is evicting this line, line was modified) +} AccessType; + +/* Types of Invalidation. An Invalidation is a request issued from upper to lower + * levels of the hierarchy. + */ +typedef enum { + INV, // fully invalidate this line + INVX, // invalidate exclusive access to this line (lower level can still keep a non-exclusive copy) + FWD, // don't invalidate, just send up the data (used by directories). Only valid on S lines. 
+} InvType; + +/* Coherence states for the MESI protocol */ +typedef enum { + I, // invalid + S, // shared (and clean) + E, // exclusive and clean + M // exclusive and dirty +} MESIState; + +//Convenience methods for clearer debug traces +const char* AccessTypeName(AccessType t); +const char* InvTypeName(InvType t); +const char* MESIStateName(MESIState s); + +/* Memory request */ +struct MemReq { + Address lineAddr; + AccessType type; + uint32_t childId; + MESIState* state; + uint64_t cycle; //cycle where request arrives at component + + //Used for race detection/sync + lock_t* childLock; + MESIState initialState; + + //Requester id --- used for contention simulation + uint32_t srcId; + + //Flags propagate across levels, though not to evictions + //Some other things that can be indicated here: Demand vs prefetch accesses, TLB accesses, etc. + enum Flag { + IFETCH = (1<<1), //For instruction fetches. Purely informative for now, does not imply NOEXCL (but ifetches should be marked NOEXCL) + NOEXCL = (1<<2), //Do not give back E on a GETS request (turns MESI protocol into MSI for this line). Used on e.g., ifetches and NUCA. + NONINCLWB = (1<<3), //This is a non-inclusive writeback. Do not assume that the line was in the lower level. Used on NUCA (BankDir). + PUTX_KEEPEXCL = (1<<4), //Non-relinquishing PUTX. On a PUTX, maintain the requestor's E state instead of removing the sharer (i.e., this is a pure writeback) + PREFETCH = (1<<5), //Prefetch GETS access. Only set at level where prefetch is issued; handled early in MESICC + }; + uint32_t flags; + + inline void set(Flag f) {flags |= f;} + inline bool is (Flag f) const {return flags & f;} +}; + +/** INTERFACES **/ + +class AggregateStat; +class Network; + +/* Base class for all memory objects (caches and memories) */ +class MemObject : public GlobAlloc { + public: + //Returns response cycle + virtual uint64_t access(MemReq& req) = 0; + virtual void initStats(AggregateStat* parentStat) {} + virtual const char* getName() = 0; +}; + +/* Base class for all cache objects */ +class BaseCache : public MemObject { + public: + virtual void setParents(uint32_t _childId, const g_vector& parents, Network* network) = 0; + virtual void setChildren(const g_vector& children, Network* network) = 0; + virtual uint64_t invalidate(Address lineAddr, InvType type, bool* reqWriteback, uint64_t reqCycle, uint32_t srcId) = 0; +}; + +#endif // MEMORY_HIERARCHY_H_ diff --git a/src/monitor.cpp b/src/monitor.cpp new file mode 100644 index 00000000..8295305c --- /dev/null +++ b/src/monitor.cpp @@ -0,0 +1,135 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
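// Illustrative sketch (not part of the original zsim sources): how a child
// cache might fill in a MemReq for an instruction fetch and pass it to its
// parent. fetchLine and all of its parameters are hypothetical.
#include "memory_hierarchy.h"

static uint64_t fetchLine(MemObject* parent, Address lineAddr, uint32_t childId,
                          uint32_t srcId, uint64_t curCycle, lock_t* childLock) {
    MESIState state = I;
    MemReq req = {lineAddr, GETS, childId, &state, curCycle,
                  childLock, state /*initialState*/, srcId,
                  MemReq::IFETCH | MemReq::NOEXCL};
    return parent->access(req);   // access() returns the response cycle
}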
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "partitioner.h" + +// UMon + +UMonMonitor::UMonMonitor(uint32_t _numLines, uint32_t _umonLines, uint32_t _umonBuckets, uint32_t _numPartitions, uint32_t _buckets) + : PartitionMonitor(_buckets) + , missCache(NULL) + , missCacheValid(false) + , monitors(_numPartitions, NULL) { + assert(_numPartitions > 0); + + missCache = gm_calloc(_buckets * _numPartitions); + + for (auto& monitor : monitors) { + monitor = new UMon(_numLines, _umonLines, _umonBuckets); + } +} + +UMonMonitor::~UMonMonitor() { + for (auto monitor : monitors) { + delete monitor; + } + gm_free(missCache); + monitors.clear(); +} + +void UMonMonitor::access(uint32_t partition, Address lineAddr) { + assert(partition < monitors.size()); + monitors[partition]->access(lineAddr); + + // check optimization assumption -- we shouldn't cache all misses + // if they are getting accessed while they are updated! -nzb + assert(!missCacheValid); + missCacheValid = false; +} + +uint32_t UMonMonitor::getNumAccesses(uint32_t partition) const { + assert(partition < monitors.size()); + + auto monitor = monitors[partition]; + return monitor->getNumAccesses(); +} + +uint32_t UMonMonitor::get(uint32_t partition, uint32_t bucket) const { + assert(partition < monitors.size()); + + if (!missCacheValid) { + getMissCurves(); + missCacheValid = true; + } + + return missCache[partition*buckets+bucket]; +} + +void UMonMonitor::getMissCurves() const { + for (uint32_t partition = 0; partition < getNumPartitions(); partition++) { + getMissCurve(&missCache[partition*buckets], partition); + } +} + +void UMonMonitor::getMissCurve(uint32_t* misses, uint32_t partition) const { + assert(partition < monitors.size()); + + auto monitor = monitors[partition]; + uint32_t umonBuckets = monitor->getBuckets(); + uint64_t umonMisses[ umonBuckets ]; + + monitor->getMisses(umonMisses); + + // Upsample or downsample + + // We have an odd number of elements; the last one is the one that + // should not be aliased, as it is the one without buckets + if (umonBuckets >= buckets) { + uint32_t downsampleRatio = umonBuckets/buckets; + assert(umonBuckets % buckets == 0); + //info("Downsampling (or keeping sampling), ratio %d", downsampleRatio); + for (uint32_t j = 0; j < buckets; j++) { + misses[j] = umonMisses[j*downsampleRatio]; + } + misses[buckets] = umonMisses[umonBuckets]; + } else { + uint32_t upsampleRatio = buckets/umonBuckets; + assert(buckets % umonBuckets == 0); + //info("Upsampling , ratio %d", upsampleRatio); + for (uint32_t j = 0; j < umonBuckets; j++) { + misses[upsampleRatio*j] = umonMisses[j]; + double m0 = umonMisses[j]; + double m1 = umonMisses[j+1]; + for (uint32_t k = 1; k < upsampleRatio; k++) { + double frac = ((double)k)/((double)upsampleRatio); + double m = m0*(1-frac) + m1*(frac); + misses[upsampleRatio*j + k] = (uint64_t)m; + } + misses[buckets] = umonMisses[umonBuckets]; + } + } + + /*info("Miss utility curves %d:", partition); + for (uint32_t j = 0; j <= buckets; j++) info(" misses[%d] = %ld", j, misses[j]); + for (uint32_t j = 0; j <= umonBuckets; j++) info(" umonMisses[%d] = %ld", j, umonMisses[j]); + */ +} + +void UMonMonitor::reset() { + for (auto monitor : monitors) { + monitor->startNextInterval(); + } + missCacheValid = false; +} diff --git a/src/mtrand.h b/src/mtrand.h new file mode 100644 index 00000000..36c1b113 --- /dev/null +++ b/src/mtrand.h @@ -0,0 +1,457 
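// Worked example (illustrative, not part of the original source) for the
// resampling in getMissCurve() above: upsampling with umonBuckets = 2,
// buckets = 4 and umonMisses = {100, 40, 10} linearly interpolates the
// missing points, giving misses = {100, 70, 40, 25, 10}; downsampling keeps
// every downsampleRatio-th umon point plus the final full-allocation entry.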
@@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +// MersenneTwister.h +// Mersenne Twister random number generator -- a C++ class MTRand +// Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus +// Richard J. Wagner v1.1 28 September 2009 wagnerr@umich.edu + +// The Mersenne Twister is an algorithm for generating random numbers. It +// was designed with consideration of the flaws in various other generators. +// The period, 2^19937-1, and the order of equidistribution, 623 dimensions, +// are far greater. The generator is also fast; it avoids multiplication and +// division, and it benefits from caches and pipelines. For more information +// see the inventors' web page at +// http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html + +// Reference +// M. Matsumoto and T. Nishimura, "Mersenne Twister: A 623-Dimensionally +// Equidistributed Uniform Pseudo-Random Number Generator", ACM Transactions on +// Modeling and Computer Simulation, Vol. 8, No. 1, January 1998, pp 3-30. + +// Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, +// Copyright (C) 2000 - 2009, Richard J. Wagner +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +// The original code included the following notice: +// +// When you use this, send an email to: m-mat@math.sci.hiroshima-u.ac.jp +// with an appropriate reference to your work. +// +// It would be nice to CC: wagnerr@umich.edu and Cokus@math.washington.edu +// when you write. + +#ifndef MTRAND_H_ +#define MTRAND_H_ + +// Not thread safe (unless auto-initialization is avoided and each thread has +// its own MTRand object) + +#include +#include +#include +#include +#include +#include +#include "galloc.h" + +class MTRand : public GlobAlloc { + // Data + public: + // typedef unsigned long uint32; // unsigned integer type, at least 32 bits + // dsm: WTF??? In x86-64, unsigned long is 64 bits! Using uint32_t broke + // everything using this class, so I just turned all uint32 into uint64_t + + enum { N = 624 }; // length of state vector + enum { SAVE = N + 1 }; // length of array for save() + + protected: + enum { M = 397 }; // period parameter + + uint64_t state[N]; // internal state + uint64_t *pNext; // next value to get from state + int left; // number of values left before reload needed + + // Methods + public: + explicit MTRand(const uint64_t oneSeed); // initialize with a simple uint64_t + MTRand(uint64_t *const bigSeed, uint64_t const seedLength = N); // or array + MTRand(); // auto-initialize with /dev/urandom or time() and clock() + explicit MTRand(const MTRand& o); // copy + + // Do NOT use for CRYPTOGRAPHY without securely hashing several returned + // values together, otherwise the generator state can be learned after + // reading 624 consecutive values. 
+ + // Access to 32-bit random numbers + uint64_t randInt(); // integer in [0,2^32-1] + uint64_t randInt(const uint64_t n); // integer in [0,n] for n < 2^32 + double rand(); // real number in [0,1] + double rand(const double n); // real number in [0,n] + double randExc(); // real number in [0,1) + double randExc(const double n); // real number in [0,n) + double randDblExc(); // real number in (0,1) + double randDblExc(const double n); // real number in (0,n) + double operator()(); // same as rand() + + // Access to 53-bit random numbers (capacity of IEEE double precision) + double rand53(); // real number in [0,1) + + // Access to nonuniform random number distributions + double randNorm(const double mean = 0.0, const double stddev = 1.0); + + // Re-seeding functions with same behavior as initializers + void seed(const uint64_t oneSeed); + void seed(uint64_t *const bigSeed, const uint64_t seedLength = N); + void seed(); + + // Saving and loading generator state + void save(uint64_t* saveArray) const; // to array of size SAVE + void load(uint64_t *const loadArray); // from such array + friend std::ostream& operator<<(std::ostream& os, const MTRand& mtrand); + friend std::istream& operator>>(std::istream& is, MTRand& mtrand); + MTRand& operator=(const MTRand& o); + + protected: + void initialize(const uint64_t oneSeed); + void reload(); + uint64_t hiBit(const uint64_t u) const { return u & 0x80000000UL; } + uint64_t loBit(const uint64_t u) const { return u & 0x00000001UL; } + uint64_t loBits(const uint64_t u) const { return u & 0x7fffffffUL; } + uint64_t mixBits(const uint64_t u, const uint64_t v) const { return hiBit(u) | loBits(v); } + uint64_t magic(const uint64_t u) const { return loBit(u) ? 0x9908b0dfUL : 0x0UL; } + uint64_t twist(const uint64_t m, const uint64_t s0, const uint64_t s1) const { + return m ^ (mixBits(s0, s1)>>1) ^ magic(s1); + } + static uint64_t hash(time_t t, clock_t c); +}; + +// Functions are defined in order of usage to assist inlining + +inline uint64_t MTRand::hash(time_t t, clock_t c) { + // Get a uint64_t from t and c + // Better than uint64_t(x) in case x is floating point in [0,1] + // Based on code by Lawrence Kirby (fred@genesis.demon.co.uk) + + static uint64_t differ = 0; // guarantee time-based seeds will change + + uint64_t h1 = 0; + unsigned char *p = (unsigned char *) &t; + for (size_t i = 0; i < sizeof(t); ++i) { + h1 *= UCHAR_MAX + 2U; + h1 += p[i]; + } + uint64_t h2 = 0; + p = (unsigned char *) &c; + for (size_t j = 0; j < sizeof(c); ++j) { + h2 *= UCHAR_MAX + 2U; + h2 += p[j]; + } + return (h1 + differ++) ^ h2; +} + +inline void MTRand::initialize(const uint64_t seed) { + // Initialize generator state with seed + // See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier. + // In previous versions, most significant bits (MSBs) of the seed affect + // only MSBs of the state array. Modified 9 Jan 2002 by Makoto Matsumoto. 
+ register uint64_t *s = state; + register uint64_t *r = state; + register int i = 1; + *s++ = seed & 0xffffffffUL; + for (; i < N; ++i) { + *s++ = (1812433253UL * (*r ^ (*r >> 30)) + i) & 0xffffffffUL; + r++; + } +} + +inline void MTRand::reload() { + // Generate N new values in state + // Made clearer and faster by Matthew Bellew (matthew.bellew@home.com) + static const int MmN = int(M) - int(N); // in case enums are unsigned + register uint64_t *p = state; + register int i; + for (i = N - M; i--; ++p) + *p = twist(p[M], p[0], p[1]); + for (i = M; --i; ++p) + *p = twist(p[MmN], p[0], p[1]); + *p = twist(p[MmN], p[0], state[0]); + + left = N, pNext = state; +} + +inline void MTRand::seed(const uint64_t oneSeed) { + // Seed the generator with a simple uint64_t + initialize(oneSeed); + reload(); +} + +inline void MTRand::seed(uint64_t *const bigSeed, const uint64_t seedLength) { + // Seed the generator with an array of uint64_t's + // There are 2^19937-1 possible initial states. This function allows + // all of those to be accessed by providing at least 19937 bits (with a + // default seed length of N = 624 uint64_t's). Any bits above the lower 32 + // in each element are discarded. + // Just call seed() if you want to get array from /dev/urandom + initialize(19650218UL); + register int i = 1; + register uint64_t j = 0; + register int k = (N > seedLength ? N : seedLength); + for (; k; --k) { + state[i] = + state[i] ^ ((state[i-1] ^ (state[i-1] >> 30)) * 1664525UL); + state[i] += (bigSeed[j] & 0xffffffffUL) + j; + state[i] &= 0xffffffffUL; + ++i; ++j; + if (i >= N) { state[0] = state[N-1]; i = 1; } + if (j >= seedLength) j = 0; + } + for (k = N - 1; k; --k) { + state[i] = + state[i] ^ ((state[i-1] ^ (state[i-1] >> 30)) * 1566083941UL); + state[i] -= i; + state[i] &= 0xffffffffUL; + ++i; + if (i >= N) { state[0] = state[N-1]; i = 1; } + } + state[0] = 0x80000000UL; // MSB is 1, assuring non-zero initial array + reload(); +} + +inline void MTRand::seed() { + // Seed the generator with an array from /dev/urandom if available + // Otherwise use a hash of time() and clock() values + + // First try getting an array from /dev/urandom + FILE* urandom = fopen("/dev/urandom", "rb"); + if (urandom) { + uint64_t bigSeed[N]; + register uint64_t *s = bigSeed; + register int i = N; + register bool success = true; + while (success && i--) + success = fread(s++, sizeof(uint64_t), 1, urandom); + fclose(urandom); + if (success) { seed(bigSeed, N); return; } + } + + // Was not successful, so use time() and clock() instead + seed(hash(time(NULL), clock())); +} + +inline MTRand::MTRand(const uint64_t oneSeed) { seed(oneSeed); } + +inline MTRand::MTRand(uint64_t *const bigSeed, const uint64_t seedLength) { + seed(bigSeed, seedLength); +} + +inline MTRand::MTRand() { seed(); } + +inline MTRand::MTRand(const MTRand& o) { + register const uint64_t *t = o.state; + register uint64_t *s = state; + register int i = N; + for (; i--; *s++ = *t++) {} + left = o.left; + pNext = &state[N-left]; +} + +inline uint64_t MTRand::randInt() { + // Pull a 32-bit integer from the generator state + // Every other access function simply transforms the numbers extracted here + + if (left == 0) reload(); + --left; + + register uint64_t s1; + s1 = *pNext++; + s1 ^= (s1 >> 11); + s1 ^= (s1 << 7) & 0x9d2c5680UL; + s1 ^= (s1 << 15) & 0xefc60000UL; + return (s1 ^ (s1 >> 18)); +} + +inline uint64_t MTRand::randInt(const uint64_t n) { + // Find which bits are used in n + // Optimized by Magnus Jonsson (magnus@smartelectronix.com) + uint64_t used 
= n; + used |= used >> 1; + used |= used >> 2; + used |= used >> 4; + used |= used >> 8; + used |= used >> 16; + + // Draw numbers until one is found in [0,n] + uint64_t i; + do { + i = randInt() & used; // toss unused bits to shorten search + } while (i > n); + return i; +} + +inline double MTRand::rand() { return double(randInt()) * (1.0/4294967295.0); } + +inline double MTRand::rand(const double n) { return rand() * n; } + +inline double MTRand::randExc() { return double(randInt()) * (1.0/4294967296.0); } + +inline double MTRand::randExc(const double n) { return randExc() * n; } + +inline double MTRand::randDblExc() { return (double(randInt()) + 0.5) * (1.0/4294967296.0); } + +inline double MTRand::randDblExc(const double n) { return randDblExc() * n; } + +inline double MTRand::rand53() { + uint64_t a = randInt() >> 5, b = randInt() >> 6; + return (a * 67108864.0 + b) * (1.0/9007199254740992.0); // by Isaku Wada +} + +inline double MTRand::randNorm(const double mean, const double stddev) { + // Return a real number from a normal (Gaussian) distribution with given + // mean and standard deviation by polar form of Box-Muller transformation + double x, y, r; + do { + x = 2.0 * rand() - 1.0; + y = 2.0 * rand() - 1.0; + r = x * x + y * y; + } while (r >= 1.0 || r == 0.0); + double s = sqrt(-2.0 * log(r) / r); + return mean + x * s * stddev; +} + +inline double MTRand::operator()() { + return rand(); +} + +inline void MTRand::save(uint64_t* saveArray) const { + register const uint64_t *s = state; + register uint64_t *sa = saveArray; + register int i = N; + for (; i--; *sa++ = *s++) {} + *sa = left; +} + +inline void MTRand::load(uint64_t *const loadArray) { + register uint64_t *s = state; + register uint64_t *la = loadArray; + register int i = N; + for (; i--; *s++ = *la++) {} + left = *la; + pNext = &state[N-left]; +} + +inline std::ostream& operator<<(std::ostream& os, const MTRand& mtrand) { + register const uint64_t *s = mtrand.state; + register int i = mtrand.N; + for (; i--; os << *s++ << "\t") {} + return os << mtrand.left; +} + +inline std::istream& operator>>(std::istream& is, MTRand& mtrand) { + register uint64_t *s = mtrand.state; + register int i = mtrand.N; + for (; i--; is >> *s++) {} + is >> mtrand.left; + mtrand.pNext = &mtrand.state[mtrand.N-mtrand.left]; + return is; +} + +inline MTRand& MTRand::operator=(const MTRand& o) { + if (this == &o) return (*this); + register const uint64_t *t = o.state; + register uint64_t *s = state; + register int i = N; + for (; i--; *s++ = *t++) {} + left = o.left; + pNext = &state[N-left]; + return (*this); +} + +#endif // MTRAND_H_ + +// Change log: +// +// v0.1 - First release on 15 May 2000 +// - Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus +// - Translated from C to C++ +// - Made completely ANSI compliant +// - Designed convenient interface for initialization, seeding, and +// obtaining numbers in default or user-defined ranges +// - Added automatic seeding from /dev/urandom or time() and clock() +// - Provided functions for saving and loading generator state +// +// v0.2 - Fixed bug which reloaded generator one step too late +// +// v0.3 - Switched to clearer, faster reload() code from Matthew Bellew +// +// v0.4 - Removed trailing newline in saved generator format to be consistent +// with output format of built-in types +// +// v0.5 - Improved portability by replacing static const int's with enum's and +// clarifying return values in seed(); suggested by Eric Heimburg +// - Removed MAXINT constant; use 
0xffffffffUL instead +// +// v0.6 - Eliminated seed overflow when uint32 is larger than 32 bits +// - Changed integer [0,n] generator to give better uniformity +// +// v0.7 - Fixed operator precedence ambiguity in reload() +// - Added access for real numbers in (0,1) and (0,n) +// +// v0.8 - Included time.h header to properly support time_t and clock_t +// +// v1.0 - Revised seeding to match 26 Jan 2002 update of Nishimura and Matsumoto +// - Allowed for seeding with arrays of any length +// - Added access for real numbers in [0,1) with 53-bit resolution +// - Added access for real numbers from normal (Gaussian) distributions +// - Increased overall speed by optimizing twist() +// - Doubled speed of integer [0,n] generation +// - Fixed out-of-range number generation on 64-bit machines +// - Improved portability by substituting literal constants for long enum's +// - Changed license from GNU LGPL to BSD +// +// v1.1 - Corrected parameter label in randNorm from "variance" to "stddev" +// - Changed randNorm algorithm from basic to polar form for efficiency +// - Updated includes from deprecated to standard forms +// - Cleaned declarations and definitions to please Intel compiler +// - Revised twist() operator to work on ones'-complement machines +// - Fixed reload() function to work when N and M are unsigned +// - Added copy constructor and copy operator from Salvador Espana diff --git a/src/network.cpp b/src/network.cpp new file mode 100644 index 00000000..f1d8bd70 --- /dev/null +++ b/src/network.cpp @@ -0,0 +1,81 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
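// Illustrative usage sketch (not part of the original zsim sources) for the
// MTRand generator defined in mtrand.h above; names and the seed value are
// arbitrary examples.
#include "mtrand.h"

static void mtrandExample() {
    MTRand rng(42);                          // fixed seed for reproducible runs
    uint64_t way = rng.randInt(7);           // uniform integer in [0, 7]
    double frac = rng.rand();                // uniform real in [0, 1]
    double noise = rng.randNorm(0.0, 1.0);   // Gaussian, mean 0, stddev 1
    (void)way; (void)frac; (void)noise;
}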
+ */ + +#include "network.h" +#include +#include +#include "log.h" + +using std::ifstream; +using std::string; + +Network::Network(const char* filename) { + ifstream inFile(filename); + + if (!inFile) { + panic("Could not open network description file %s", filename); + } + + while (inFile.good()) { + string src, dst; + uint32_t delay; + inFile >> src; + inFile >> dst; + inFile >> delay; + + if (inFile.eof()) break; + + string s1 = src + " " + dst; + string s2 = dst + " " + src; + + assert(delayMap.find(s1) == delayMap.end()); + assert(delayMap.find(s2) == delayMap.end()); + + delayMap[s1] = delay; + delayMap[s2] = delay; + + //info("Parsed %s %s %d", src.c_str(), dst.c_str(), delay); + } + + inFile.close(); +} + +uint32_t Network::getRTT(const char* src, const char* dst) { + string key(src); + key += " "; + key += dst; +/* dsm: Be sloppy, deadline deadline deadline + assert_msg(delayMap.find(key) != delayMap.end(), "%s and %s cannot communicate, according to the network description file", src, dst); + return 2*delayMap[key]; + */ + + if (delayMap.find(key) != delayMap.end()) { + return 2*delayMap[key]; + } else { + warn("%s and %s have no entry in network description file, returning 0 latency", src, dst); + return 0; + } +} + diff --git a/src/network.h b/src/network.h new file mode 100644 index 00000000..c2ecac0d --- /dev/null +++ b/src/network.h @@ -0,0 +1,48 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef NETWORK_H_ +#define NETWORK_H_ + +/* Very simple fixed-delay network model. Parses a list of delays between + * entities, then accepts queries for roundtrip times between these entities. + * There is no contention modeling or even support for serialization latency. + * This is a basic model that should be extended as appropriate. + */ + +#include +#include + +class Network { + private: + std::unordered_map delayMap; + + public: + explicit Network(const char* filename); + uint32_t getRTT(const char* src, const char* dst); +}; + +#endif // NETWORK_H_ + diff --git a/src/null_core.cpp b/src/null_core.cpp new file mode 100644 index 00000000..60e8d50e --- /dev/null +++ b/src/null_core.cpp @@ -0,0 +1,86 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. 
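// Illustrative note (not part of the original zsim sources): the network
// description file parsed above is a plain-text list of
// "<src> <dst> <one-way delay>" triples, one per line, e.g.
//
//   l1d-0 l2-0 3
//   l2-0 mem-0 40
//
// (component names are examples). With that file, getRTT("l1d-0", "l2-0")
// returns 6 -- twice the one-way delay -- and pairs that are missing from the
// file fall back to a latency of 0 with a warning.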
+ * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "null_core.h" +#include "zsim.h" + +NullCore::NullCore(g_string& _name) : Core(_name), instrs(0), curCycle(0), phaseEndCycle(0) {} + +void NullCore::initStats(AggregateStat* parentStat) { + AggregateStat* coreStat = new AggregateStat(); + coreStat->init(name.c_str(), "Core stats"); + ProxyStat* cyclesStat = new ProxyStat(); + cyclesStat->init("cycles", "Simulated cycles", &instrs); //simulated instrs == simulated cycles; curCycle can be skewed forward + ProxyStat* instrsStat = new ProxyStat(); + instrsStat->init("instrs", "Simulated instructions", &instrs); + coreStat->append(cyclesStat); + coreStat->append(instrsStat); + parentStat->append(coreStat); +} + +uint64_t NullCore::getPhaseCycles() const { + return curCycle - zinfo->globPhaseCycles; +} + +void NullCore::bbl(BblInfo* bblInfo) { + instrs += bblInfo->instrs; + curCycle += bblInfo->instrs; +} + +void NullCore::contextSwitch(int32_t gid) {} + +void NullCore::join() { + curCycle = MAX(curCycle, zinfo->globPhaseCycles); + phaseEndCycle = zinfo->globPhaseCycles + zinfo->phaseLength; +} + +//Static class functions: Function pointers and trampolines + +InstrFuncPtrs NullCore::GetFuncPtrs() { + return {LoadFunc, StoreFunc, BblFunc, BranchFunc, PredLoadFunc, PredStoreFunc, FPTR_ANALYSIS, {0}}; +} + +void NullCore::LoadFunc(THREADID tid, ADDRINT addr) {} +void NullCore::StoreFunc(THREADID tid, ADDRINT addr) {} +void NullCore::PredLoadFunc(THREADID tid, ADDRINT addr, BOOL pred) {} +void NullCore::PredStoreFunc(THREADID tid, ADDRINT addr, BOOL pred) {} + +void NullCore::BblFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + NullCore* core = static_cast(cores[tid]); + core->bbl(bblInfo); + + while (unlikely(core->curCycle > core->phaseEndCycle)) { + assert(core->phaseEndCycle == zinfo->globPhaseCycles + zinfo->phaseLength); + core->phaseEndCycle += zinfo->phaseLength; + + uint32_t cid = getCid(tid); + //NOTE: TakeBarrier may take ownership of the core, and so it will be used by some other thread. If TakeBarrier context-switches us, + //the *only* safe option is to return inmmediately after we detect this, or we can race and corrupt core state. If newCid == cid, + //we're not at risk of racing, even if we were switched out and then switched in. 
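+        //Illustrative walk-through (example numbers, not from any config): with a
+        //phase length of 10000 cycles, a bbl that pushes curCycle from 9990 to 10040
+        //crosses one phase boundary, so this loop runs once: phaseEndCycle advances
+        //to the next boundary and TakeBarrier is called before the thread resumes.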
+ uint32_t newCid = TakeBarrier(tid, cid); + if (newCid != cid) break; /*context-switch*/ + } +} + diff --git a/src/null_core.h b/src/null_core.h new file mode 100644 index 00000000..fd837f6e --- /dev/null +++ b/src/null_core.h @@ -0,0 +1,66 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef NULL_CORE_H_ +#define NULL_CORE_H_ + +//A core model with IPC=1 and no hooks into the memory hierarchy. Useful to isolate threads that need to be run for simulation purposes. + +#include "core.h" +#include "pad.h" + +class NullCore : public Core { + protected: + uint64_t instrs; + uint64_t curCycle; + uint64_t phaseEndCycle; //next stopping point + + public: + explicit NullCore(g_string& _name); + void initStats(AggregateStat* parentStat); + + uint64_t getInstrs() const {return instrs;} + uint64_t getPhaseCycles() const; + uint64_t getCycles() const {return instrs; /*IPC=1*/ } + + void contextSwitch(int32_t gid); + virtual void join(); + + InstrFuncPtrs GetFuncPtrs(); + + protected: + inline void bbl(BblInfo* bblInstrs); + + static void LoadFunc(THREADID tid, ADDRINT addr); + static void StoreFunc(THREADID tid, ADDRINT addr); + static void BblFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo); + static void PredLoadFunc(THREADID tid, ADDRINT addr, BOOL pred); + static void PredStoreFunc(THREADID tid, ADDRINT addr, BOOL pred); + + static void BranchFunc(THREADID, ADDRINT, BOOL, ADDRINT, ADDRINT) {} +} ATTR_LINE_ALIGNED; //This needs to take up a whole cache line, or false sharing will be extremely frequent + +#endif // NULL_CORE_H_ + diff --git a/src/ooo_core.cpp b/src/ooo_core.cpp new file mode 100644 index 00000000..74797aad --- /dev/null +++ b/src/ooo_core.cpp @@ -0,0 +1,525 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "ooo_core.h" +#include +#include +#include +#include "bithacks.h" +#include "decoder.h" +#include "filter_cache.h" +#include "zsim.h" + +/* Uncomment to induce backpressure to the IW when the load/store buffers fill up. In theory, more detailed, + * but sometimes much slower (as it relies on range poisoning in the IW, potentially O(n^2)), and in practice + * makes a negligible difference (ROB backpressures). + */ +//#define LSU_IW_BACKPRESSURE + +#define DEBUG_MSG(args...) +//#define DEBUG_MSG(args...) info(args) + +// Core parameters +// TODO(dsm): Make OOOCore templated, subsuming these + +// Stages --- more or less matched to Westmere, but have not seen detailed pipe diagrams anywhare +#define FETCH_STAGE 1 +#define DECODE_STAGE 4 // NOTE: Decoder adds predecode delays to decode +#define ISSUE_STAGE 7 +#define DISPATCH_STAGE 13 // RAT + ROB + RS, each is easily 2 cycles + +#define L1D_LAT 4 // fixed, and FilterCache does not include L1 delay +#define FETCH_BYTES_PER_CYCLE 16 +#define ISSUES_PER_CYCLE 4 +#define RF_READS_PER_CYCLE 3 + +OOOCore::OOOCore(FilterCache* _l1i, FilterCache* _l1d, g_string& _name) : Core(_name), l1i(_l1i), l1d(_l1d), cRec(0, _name) { + decodeCycle = DECODE_STAGE; // allow subtracting from it + curCycle = 0; + phaseEndCycle = zinfo->phaseLength; + + for (uint32_t i = 0; i < MAX_REGISTERS; i++) { + regScoreboard[i] = 0; + } + prevBbl = NULL; + + lastStoreCommitCycle = 0; + lastStoreAddrCommitCycle = 0; + curCycleRFReads = 0; + curCycleIssuedUops = 0; + branchPc = 0; + + instrs = uops = bbls = approxInstrs = mispredBranches = 0; + + for (uint32_t i = 0; i < FWD_ENTRIES; i++) fwdArray[i].set((Address)(-1L), 0); +} + +void OOOCore::initStats(AggregateStat* parentStat) { + AggregateStat* coreStat = new AggregateStat(); + coreStat->init(name.c_str(), "Core stats"); + + auto x = [this]() { return cRec.getUnhaltedCycles(curCycle); }; + LambdaStat* cyclesStat = new LambdaStat(x); + cyclesStat->init("cycles", "Simulated unhalted cycles"); + + auto y = [this]() { return cRec.getContentionCycles(); }; + LambdaStat* cCyclesStat = new LambdaStat(y); + cCyclesStat->init("cCycles", "Cycles due to contention stalls"); + + ProxyStat* instrsStat = new ProxyStat(); + instrsStat->init("instrs", "Simulated instructions", &instrs); + ProxyStat* uopsStat = new ProxyStat(); + uopsStat->init("uops", "Retired micro-ops", &uops); + ProxyStat* bblsStat = new ProxyStat(); + bblsStat->init("bbls", "Basic blocks", &bbls); + ProxyStat* approxInstrsStat = new ProxyStat(); + approxInstrsStat->init("approxInstrs", "Instrs with approx uop decoding", &approxInstrs); + ProxyStat* mispredBranchesStat = new ProxyStat(); + mispredBranchesStat->init("mispredBranches", "Mispredicted branches", &mispredBranches); + + coreStat->append(cyclesStat); + coreStat->append(cCyclesStat); + coreStat->append(instrsStat); + coreStat->append(uopsStat); + coreStat->append(bblsStat); + coreStat->append(approxInstrsStat); + coreStat->append(mispredBranchesStat); + +#ifdef OOO_STALL_STATS + profFetchStalls.init("fetchStalls", "Fetch stalls"); coreStat->append(&profFetchStalls); + profDecodeStalls.init("decodeStalls", "Decode stalls"); 
coreStat->append(&profDecodeStalls); + profIssueStalls.init("issueStalls", "Issue stalls"); coreStat->append(&profIssueStalls); +#endif + + parentStat->append(coreStat); +} + +uint64_t OOOCore::getInstrs() const {return instrs;} +uint64_t OOOCore::getPhaseCycles() const {return curCycle % zinfo->phaseLength;} + +void OOOCore::contextSwitch(int32_t gid) { + if (gid == -1) { + // Do not execute previous BBL, as we were context-switched + prevBbl = NULL; + + // Invalidate virtually-addressed filter caches + l1i->contextSwitch(); + l1d->contextSwitch(); + } +} + + +InstrFuncPtrs OOOCore::GetFuncPtrs() {return {LoadFunc, StoreFunc, BblFunc, BranchFunc, PredLoadFunc, PredStoreFunc, FPTR_ANALYSIS, {0}};} + +inline void OOOCore::load(Address addr) { + loadAddrs[loads++] = addr; +} + +void OOOCore::store(Address addr) { + storeAddrs[stores++] = addr; +} + +// Predicated loads and stores call this function, gets recorded as a 0-cycle op. +// Predication is rare enough that we don't need to model it perfectly to be accurate (i.e. the uops still execute, retire, etc), but this is needed for correctness. +void OOOCore::predFalseMemOp() { + // I'm going to go out on a limb and assume just loads are predicated (this will not fail silently if it's a store) + loadAddrs[loads++] = -1L; +} + +void OOOCore::branch(Address pc, bool taken, Address takenNpc, Address notTakenNpc) { + branchPc = pc; + branchTaken = taken; + branchTakenNpc = takenNpc; + branchNotTakenNpc = notTakenNpc; +} + +inline void OOOCore::bbl(Address bblAddr, BblInfo* bblInfo) { + if (!prevBbl) { + // This is the 1st BBL since scheduled, nothing to simulate + prevBbl = bblInfo; + // Kill lingering ops from previous BBL + loads = stores = 0; + return; + } + + /* Simulate execution of previous BBL */ + + uint32_t bblInstrs = prevBbl->instrs; + DynBbl* bbl = &(prevBbl->oooBbl[0]); + prevBbl = bblInfo; + + uint32_t loadIdx = 0; + uint32_t storeIdx = 0; + + uint32_t prevDecCycle = 0; + uint64_t lastCommitCycle = 0; // used to find misprediction penalty + + // Run dispatch/IW + for (uint32_t i = 0; i < bbl->uops; i++) { + DynUop* uop = &(bbl->uop[i]); + + // Decode stalls + uint32_t decDiff = uop->decCycle - prevDecCycle; + decodeCycle = MAX(decodeCycle + decDiff, uopQueue.minAllocCycle()); + if (decodeCycle > curCycle) { + //info("Decode stall %ld %ld | %d %d", decodeCycle, curCycle, uop->decCycle, prevDecCycle); + uint32_t cdDiff = decodeCycle - curCycle; +#ifdef OOO_STALL_STATS + profDecodeStalls.inc(cdDiff); +#endif + curCycleIssuedUops = 0; + curCycleRFReads = 0; + for (uint32_t i = 0; i < cdDiff; i++) insWindow.advancePos(curCycle); + } + prevDecCycle = uop->decCycle; + uopQueue.markLeave(curCycle); + + // Implement issue width limit --- we can only issue 4 uops/cycle + if (curCycleIssuedUops >= ISSUES_PER_CYCLE) { +#ifdef OOO_STALL_STATS + profIssueStalls.inc(); +#endif + // info("Advancing due to uop issue width"); + curCycleIssuedUops = 0; + curCycleRFReads = 0; + insWindow.advancePos(curCycle); + } + curCycleIssuedUops++; + + // Kill dependences on invalid register + // Using curCycle saves us two unpredictable branches in the RF read stalls code + regScoreboard[0] = curCycle; + + uint64_t c0 = regScoreboard[uop->rs[0]]; + uint64_t c1 = regScoreboard[uop->rs[1]]; + + // RF read stalls + // if srcs are not available at issue time, we have to go thru the RF + curCycleRFReads += ((c0 < curCycle)? 1 : 0) + ((c1 < curCycle)? 
1 : 0); + if (curCycleRFReads > RF_READS_PER_CYCLE) { + curCycleRFReads -= RF_READS_PER_CYCLE; + curCycleIssuedUops = 0; // or 1? that's probably a 2nd-order detail + insWindow.advancePos(curCycle); + } + + uint64_t c2 = rob.minAllocCycle(); + uint64_t c3 = curCycle; + + uint64_t cOps = MAX(c0, c1); + + // Model RAT + ROB + RS delay between issue and dispatch + uint64_t dispatchCycle = MAX(cOps, MAX(c2, c3) + (DISPATCH_STAGE - ISSUE_STAGE)); + + // info("IW 0x%lx %d %ld %ld %x", bblAddr, i, c2, dispatchCycle, uop->portMask); + // NOTE: Schedule can adjust both cur and dispatch cycles + insWindow.schedule(curCycle, dispatchCycle, uop->portMask, uop->extraSlots); + + // If we have advanced, we need to reset the curCycle counters + if (curCycle > c3) { + curCycleIssuedUops = 0; + curCycleRFReads = 0; + } + + uint64_t commitCycle; + + // LSU simulation + // NOTE: Ever-so-slightly faster than if-else if-else if-else + switch (uop->type) { + case UOP_GENERAL: + commitCycle = dispatchCycle + uop->lat; + break; + + case UOP_LOAD: + { + // dispatchCycle = MAX(loadQueue.minAllocCycle(), dispatchCycle); + uint64_t lqCycle = loadQueue.minAllocCycle(); + if (lqCycle > dispatchCycle) { +#ifdef LSU_IW_BACKPRESSURE + insWindow.poisonRange(curCycle, lqCycle, 0x4 /*PORT_2, loads*/); +#endif + dispatchCycle = lqCycle; + } + + // Wait for all previous store addresses to be resolved + dispatchCycle = MAX(lastStoreAddrCommitCycle+1, dispatchCycle); + + Address addr = loadAddrs[loadIdx++]; + uint64_t reqSatisfiedCycle = dispatchCycle; + if (addr != ((Address)-1L)) { + reqSatisfiedCycle = l1d->load(addr, dispatchCycle) + L1D_LAT; + cRec.record(curCycle, dispatchCycle, reqSatisfiedCycle); + } + + // Enforce st-ld forwarding + uint32_t fwdIdx = (addr>>2) & (FWD_ENTRIES-1); + if (fwdArray[fwdIdx].addr == addr) { + // info("0x%lx FWD %ld %ld", addr, reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle); + /* Take the MAX (see FilterCache's code) Our fwdArray + * imposes more stringent timing constraints than the + * l1d, b/c FilterCache does not change the line's + * availCycle on a store. This allows FilterCache to + * track per-line, not per-word availCycles. 
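+                     * Illustrative example (made-up cycles): if the last store to this
+                     * 4-byte slot recorded storeCycle = 120 while the l1d would return
+                     * the line at cycle 115, the load is forwarded and completes at 120.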
+ */ + reqSatisfiedCycle = MAX(reqSatisfiedCycle, fwdArray[fwdIdx].storeCycle); + } + + commitCycle = reqSatisfiedCycle; + loadQueue.markRetire(commitCycle); + } + break; + + case UOP_STORE: + { + // dispatchCycle = MAX(storeQueue.minAllocCycle(), dispatchCycle); + uint64_t sqCycle = storeQueue.minAllocCycle(); + if (sqCycle > dispatchCycle) { +#ifdef LSU_IW_BACKPRESSURE + insWindow.poisonRange(curCycle, sqCycle, 0x10 /*PORT_4, stores*/); +#endif + dispatchCycle = sqCycle; + } + + // Wait for all previous store addresses to be resolved (not just ours :)) + dispatchCycle = MAX(lastStoreAddrCommitCycle+1, dispatchCycle); + + Address addr = storeAddrs[storeIdx++]; + uint64_t reqSatisfiedCycle = l1d->store(addr, dispatchCycle) + L1D_LAT; + cRec.record(curCycle, dispatchCycle, reqSatisfiedCycle); + + // Fill the forwarding table + fwdArray[(addr>>2) & (FWD_ENTRIES-1)].set(addr, reqSatisfiedCycle); + + commitCycle = reqSatisfiedCycle; + lastStoreCommitCycle = MAX(lastStoreCommitCycle, reqSatisfiedCycle); + storeQueue.markRetire(commitCycle); + } + break; + + case UOP_STORE_ADDR: + commitCycle = dispatchCycle + uop->lat; + lastStoreAddrCommitCycle = MAX(lastStoreAddrCommitCycle, commitCycle); + break; + + //case UOP_FENCE: //make gcc happy + default: + assert((UopType) uop->type == UOP_FENCE); + commitCycle = dispatchCycle + uop->lat; + // info("%d %ld %ld", uop->lat, lastStoreAddrCommitCycle, lastStoreCommitCycle); + // force future load serialization + lastStoreAddrCommitCycle = MAX(commitCycle, MAX(lastStoreAddrCommitCycle, lastStoreCommitCycle + uop->lat)); + // info("%d %ld %ld X", uop->lat, lastStoreAddrCommitCycle, lastStoreCommitCycle); + } + + // Mark retire at ROB + rob.markRetire(commitCycle); + + // Record dependences + regScoreboard[uop->rd[0]] = commitCycle; + regScoreboard[uop->rd[1]] = commitCycle; + + lastCommitCycle = commitCycle; + + //info("0x%lx %3d [%3d %3d] -> [%3d %3d] %8ld %8ld %8ld %8ld", bbl->addr, i, uop->rs[0], uop->rs[1], uop->rd[0], uop->rd[1], decCycle, c3, dispatchCycle, commitCycle); + } + + instrs += bblInstrs; + uops += bbl->uops; + bbls++; + approxInstrs += bbl->approxInstrs; + +#ifdef BBL_PROFILING + if (approxInstrs) Decoder::profileBbl(bbl->bblIdx); +#endif + + // Check full match between expected and actual mem ops + // If these assertions fail, most likely, something's off in the decoder + assert_msg(loadIdx == loads, "%s: loadIdx(%d) != loads (%d)", name.c_str(), loadIdx, loads); + assert_msg(storeIdx == stores, "%s: storeIdx(%d) != stores (%d)", name.c_str(), storeIdx, stores); + loads = stores = 0; + + + /* Simulate frontend for branch pred + fetch of this BBL + * + * NOTE: We assume that the instruction length predecoder and the IQ are + * weak enough that they can't hide any ifetch or bpred stalls. In fact, + * predecoder stalls are incorporated in the decode stall component (see + * decoder.cpp). So here, we compute fetchCycle, then use it to adjust + * decodeCycle. + */ + + // Model fetch-decode delay (fixed, weak predec/IQ assumption) + uint64_t fetchCycle = decodeCycle - (DECODE_STAGE - FETCH_STAGE); + uint32_t lineSize = 1 << lineBits; + + // Simulate branch prediction + if (branchPc && !branchPred.predict(branchPc, branchTaken)) { + mispredBranches++; + + /* Simulate wrong-path fetches + * + * This is not for a latency reason, but sometimes it increases fetched + * code footprint and L1I MPKI significantly. Also, we assume a perfect + * BTB here: we always have the right address to missfetch on, and we + * never need resteering. 
+ * + * NOTE: Resteering due to BTB misses is done at the BAC unit, is + * relatively rare, and carries an 8-cycle penalty, which should be + * partially hidden if the branch is predicted correctly --- so we + * don't simulate it. + * + * Since we don't have a BTB, we just assume the next branch is not + * taken. With a typical branch mispred penalty of 17 cycles, we + * typically fetch 3-4 lines in advance (16B/cycle). This sets a higher + * limit, which can happen with branches that take a long time to + * resolve (because e.g., they depend on a load). To set this upper + * bound, assume a completely backpressured IQ (18 instrs), uop queue + * (28 uops), IW (36 uops), and 16B instr length predecoder buffer. At + * ~3.5 bytes/instr, 1.2 uops/instr, this is about 5 64-byte lines. + */ + + // info("Mispredicted branch, %ld %ld %ld | %ld %ld", decodeCycle, curCycle, lastCommitCycle, + // lastCommitCycle-decodeCycle, lastCommitCycle-curCycle); + Address wrongPathAddr = branchTaken? branchNotTakenNpc : branchTakenNpc; + uint64_t reqCycle = fetchCycle; + for (uint32_t i = 0; i < 5*64/lineSize; i++) { + uint64_t fetchLat = l1i->load(wrongPathAddr + lineSize*i, curCycle) - curCycle; + cRec.record(curCycle, curCycle, curCycle + fetchLat); + uint64_t respCycle = reqCycle + fetchLat; + if (respCycle > lastCommitCycle) { + break; + } + // Model fetch throughput limit + reqCycle = respCycle + lineSize/FETCH_BYTES_PER_CYCLE; + } + + fetchCycle = lastCommitCycle; + } + branchPc = 0; // clear for next BBL + + // Simulate current bbl ifetch + Address endAddr = bblAddr + bblInfo->bytes; + for (Address fetchAddr = bblAddr; fetchAddr < endAddr; fetchAddr += lineSize) { + // The Nehalem frontend fetches instructions in 16-byte-wide accesses. + // Do not model fetch throughput limit here, decoder-generated stalls already include it + // We always call fetches with curCycle to avoid upsetting the weave + // models (but we could move to a fetch-centric recorder to avoid this) + uint64_t fetchLat = l1i->load(fetchAddr, curCycle) - curCycle; + cRec.record(curCycle, curCycle, curCycle + fetchLat); + fetchCycle += fetchLat; + } + + // If fetch rules, take into account delay between fetch and decode; + // If decode rules, different BBLs make the decoders skip a cycle + decodeCycle++; + uint64_t minFetchDecCycle = fetchCycle + (DECODE_STAGE - FETCH_STAGE); + if (minFetchDecCycle > decodeCycle) { +#ifdef OOO_STALL_STATS + profFetchStalls.inc(decodeCycle - minFetchDecCycle); +#endif + decodeCycle = minFetchDecCycle; + } +} + +// Timing simulation code +void OOOCore::join() { + DEBUG_MSG("[%s] Joining, curCycle %ld phaseEnd %ld", name.c_str(), curCycle, phaseEndCycle); + uint64_t targetCycle = cRec.notifyJoin(curCycle); + if (targetCycle > curCycle) advance(targetCycle); + phaseEndCycle = zinfo->globPhaseCycles + zinfo->phaseLength; + // assert(targetCycle <= phaseEndCycle); + DEBUG_MSG("[%s] Joined, curCycle %ld phaseEnd %ld", name.c_str(), curCycle, phaseEndCycle); +} + +void OOOCore::leave() { + DEBUG_MSG("[%s] Leaving, curCycle %ld phaseEnd %ld", name.c_str(), curCycle, phaseEndCycle); + cRec.notifyLeave(curCycle); +} + +void OOOCore::cSimStart() { + uint64_t targetCycle = cRec.cSimStart(curCycle); + assert(targetCycle >= curCycle); + if (targetCycle > curCycle) advance(targetCycle); +} + +void OOOCore::cSimEnd() { + uint64_t targetCycle = cRec.cSimEnd(curCycle); + assert(targetCycle >= curCycle); + if (targetCycle > curCycle) advance(targetCycle); +} + +void OOOCore::advance(uint64_t targetCycle) { + 
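// Summary (inferred from the body below and the notes in ooo_core.h): advance()
+    // moves every core-local clock (curCycle, decodeCycle, and the issue window
+    // position) forward to targetCycle in lockstep and clears the per-cycle
+    // issue/RF-read counters, since the skipped cycles issue nothing. Only
+    // cSimStart()/cSimEnd() are meant to call it.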
+    assert(targetCycle > curCycle);
+    decodeCycle += targetCycle - curCycle;
+    insWindow.longAdvance(curCycle, targetCycle);
+    curCycleRFReads = 0;
+    curCycleIssuedUops = 0;
+    assert(targetCycle == curCycle);
+    /* NOTE: Validation with weave mems shows that not advancing internal cycle
+     * counters in e.g., the ROB does not change much; consider full-blown
+     * rebases though if weave models fail to validate for some app.
+     */
+}
+
+// Pin interface code
+
+void OOOCore::LoadFunc(THREADID tid, ADDRINT addr) {static_cast<OOOCore*>(cores[tid])->load(addr);}
+void OOOCore::StoreFunc(THREADID tid, ADDRINT addr) {static_cast<OOOCore*>(cores[tid])->store(addr);}
+
+void OOOCore::PredLoadFunc(THREADID tid, ADDRINT addr, BOOL pred) {
+    OOOCore* core = static_cast<OOOCore*>(cores[tid]);
+    if (pred) core->load(addr);
+    else core->predFalseMemOp();
+}
+
+void OOOCore::PredStoreFunc(THREADID tid, ADDRINT addr, BOOL pred) {
+    OOOCore* core = static_cast<OOOCore*>(cores[tid]);
+    if (pred) core->store(addr);
+    else core->predFalseMemOp();
+}
+
+void OOOCore::BblFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) {
+    OOOCore* core = static_cast<OOOCore*>(cores[tid]);
+    core->bbl(bblAddr, bblInfo);
+
+    while (core->curCycle > core->phaseEndCycle) {
+        core->phaseEndCycle += zinfo->phaseLength;
+
+        uint32_t cid = getCid(tid);
+        // NOTE: TakeBarrier may take ownership of the core, and so it will be used by some other thread. If TakeBarrier context-switches us,
+        // the *only* safe option is to return immediately after we detect this, or we can race and corrupt core state. However, the information
+        // here is insufficient to do that, so we could wind up double-counting phases.
+        uint32_t newCid = TakeBarrier(tid, cid);
+        // NOTE: Upon further observation, we cannot race if newCid == cid, so this code should be enough.
+        // It may happen that we had an intervening context-switch and we are now back to the same core.
+        // This is fine, since the loop looks at core values directly and there are no locals involved,
+        // so we should just advance as needed and move on.
+        if (newCid != cid) break; /*context-switch, we do not own this context anymore*/
+    }
+}
+
+void OOOCore::BranchFunc(THREADID tid, ADDRINT pc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) {
+    static_cast<OOOCore*>(cores[tid])->branch(pc, taken, takenNpc, notTakenNpc);
+}
+
diff --git a/src/ooo_core.h b/src/ooo_core.h
new file mode 100644
index 00000000..5a0eb804
--- /dev/null
+++ b/src/ooo_core.h
@@ -0,0 +1,487 @@
+/** $lic$
+ * Copyright (C) 2012-2014 by Massachusetts Institute of Technology
+ * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University
+ *
+ * This file is part of zsim.
+ *
+ * zsim is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free Software
+ * Foundation, version 2.
+ *
+ * If you use this software in your research, we request that you reference
+ * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of
+ * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the
+ * source of the simulator in any publications that use this software, and that
+ * you send us a citation of your work.
+ *
+ * zsim is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#ifndef OOO_CORE_H_ +#define OOO_CORE_H_ + +#include +#include +#include +#include "core.h" +#include "g_std/g_multimap.h" +#include "memory_hierarchy.h" +#include "ooo_core_recorder.h" +#include "pad.h" + +// Uncomment to enable stall stats +// #define OOO_STALL_STATS + +class FilterCache; + +/* 2-level branch predictor: + * - L1: Branch history shift registers (bshr): 2^NB entries, HB bits of history/entry, indexed by XOR'd PC + * - L2: Pattern history table (pht): 2^LB entries, 2-bit sat counters, indexed by XOR'd bshr contents + * NOTE: Assumes LB is in [NB, HB] range for XORing (e.g., HB = 18 and NB = 10, LB = 13 is OK) + */ +template +class BranchPredictorPAg { + private: + uint32_t bhsr[1 << NB]; + uint8_t pht[1 << LB]; + + public: + BranchPredictorPAg() { + uint32_t numBhsrs = 1 << NB; + uint32_t phtSize = 1 << LB; + + for (uint32_t i = 0; i < numBhsrs; i++) { + bhsr[i] = 0; + } + for (uint32_t i = 0; i < phtSize; i++) { + pht[i] = 1; // weak non-taken + } + + static_assert(LB <= HB, "Too many PHT entries"); + static_assert(LB >= NB, "Too few PHT entries (you'll need more XOR'ing)"); + } + + // Predicts and updates; returns false if mispredicted + inline bool predict(Address branchPc, bool taken) { + uint32_t bhsrMask = (1 << NB) - 1; + uint32_t histMask = (1 << HB) - 1; + uint32_t phtMask = (1 << LB) - 1; + + // Predict + // uint32_t bhsrIdx = ((uint32_t)( branchPc ^ (branchPc >> NB) ^ (branchPc >> 2*NB) )) & bhsrMask; + uint32_t bhsrIdx = ((uint32_t)( branchPc >> 1)) & bhsrMask; + uint32_t phtIdx = bhsr[bhsrIdx]; + + // Shift-XOR-mask to fit in PHT + phtIdx ^= (phtIdx & ~phtMask) >> (HB - LB); // take the [HB-1, LB] bits of bshr, XOR with [LB-1, ...] bits + phtIdx &= phtMask; + + // If uncommented, behaves like a global history predictor + // bhsrIdx = 0; + // phtIdx = (bhsr[bhsrIdx] ^ ((uint32_t)branchPc)) & phtMask; + + bool pred = pht[phtIdx] > 1; + + // info("BP Pred: 0x%lx bshr[%d]=%x taken=%d pht=%d pred=%d", branchPc, bhsrIdx, phtIdx, taken, pht[phtIdx], pred); + + // Update + pht[phtIdx] = taken? (pred? 3 : (pht[phtIdx]+1)) : (pred? (pht[phtIdx]-1) : 0); //2-bit saturating counter + bhsr[bhsrIdx] = ((bhsr[bhsrIdx] << 1) & histMask ) | (taken? 1: 0); //we apply phtMask here, dependence is further away + + // info("BP Update: newPht=%d newBshr=%x", pht[phtIdx], bhsr[bhsrIdx]); + return (taken == pred); + } +}; + + +template +class WindowStructure { + private: + // NOTE: Nehalem has POPCNT, but we want this to run reasonably fast on Core2's, so let's keep track of both count and mask. + struct WinCycle { + uint8_t occUnits; + uint8_t count; + inline void set(uint8_t o, uint8_t c) {occUnits = o; count = c;} + }; + + WinCycle* curWin; + WinCycle* nextWin; + typedef g_map UBWin; + typedef typename UBWin::iterator UBWinIterator; + UBWin ubWin; + uint32_t occupancy; // elements scheduled in the future + + uint32_t curPos; + + uint8_t lastPort; + + public: + WindowStructure() { + curWin = gm_calloc(H); + nextWin = gm_calloc(H); + curPos = 0; + occupancy = 0; + } + + + void schedule(uint64_t& curCycle, uint64_t& schedCycle, uint8_t portMask, uint32_t extraSlots = 0) { + if (!extraSlots) { + scheduleInternal(curCycle, schedCycle, portMask); + } else { + scheduleInternal(curCycle, schedCycle, portMask); + uint64_t extraSlotCycle = schedCycle+1; + uint8_t extraSlotPortMask = 1 << lastPort; + // This is not entirely accurate, as an instruction may have been scheduled already + // on this port and we'll have a non-contiguous allocation. In practice, this is rare. 
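+                // Illustrative example (made-up values): a uop with extraSlots = 2 grabs
+                // its normal slot first; the two extra slots are then forced onto the same
+                // port (mask 1 << lastPort), one per cycle starting right after schedCycle,
+                // approximating a multi-cycle occupancy of that port.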
+ for (uint32_t i = 0; i < extraSlots; i++) { + scheduleInternal(curCycle, extraSlotCycle, extraSlotPortMask); + // info("extra slot %d allocated on cycle %ld", i, extraSlotCycle); + extraSlotCycle++; + } + } + assert(occupancy <= WSZ); + } + + inline void advancePos(uint64_t& curCycle) { + occupancy -= curWin[curPos].count; + curWin[curPos].set(0, 0); + curPos++; + curCycle++; + + if (curPos == H) { // rebase + // info("[%ld] Rebasing, curCycle=%ld", curCycle/H, curCycle); + std::swap(curWin, nextWin); + curPos = 0; + uint64_t nextWinHorizon = curCycle + 2*H; // first cycle out of range + + if (!ubWin.empty()) { + UBWinIterator it = ubWin.begin(); + while (it != ubWin.end() && it->first < nextWinHorizon) { + uint32_t nextWinPos = it->first - H - curCycle; + assert_msg(nextWinPos < H, "WindowStructure: ubWin elem exceeds limit cycle=%ld curCycle=%ld nextWinPos=%d", it->first, curCycle, nextWinPos); + nextWin[nextWinPos] = it->second; + // info("Moved %d events from unbounded window, cycle %ld (%d cycles away)", it->second, it->first, it->first - curCycle); + it++; + } + ubWin.erase(ubWin.begin(), it); + } + } + } + + void longAdvance(uint64_t& curCycle, uint64_t targetCycle) { + assert(curCycle <= targetCycle); + + // Drain IW + while (occupancy && curCycle < targetCycle) { + advancePos(curCycle); + } + + if (occupancy) { + // info("advance: window not drained at %ld, %d uops left", curCycle, occupancy); + assert(curCycle == targetCycle); + } else { + // info("advance: window drained at %ld, jumping to %ld", curCycle, targetCycle); + assert(curCycle <= targetCycle); + curCycle = targetCycle; // with zero occupancy, we can just jump to it + } + } + + // Poisons a range of cycles; used by the LSU to apply backpressure to the IW + void poisonRange(uint64_t curCycle, uint64_t targetCycle, uint8_t portMask) { + uint64_t startCycle = curCycle; // curCycle should not be modified... + uint64_t poisonCycle = curCycle; + while (poisonCycle < targetCycle) { + scheduleInternal(curCycle, poisonCycle, portMask); + } + // info("Poisoned port mask %x from %ld to %ld (tgt %ld)", portMask, curCycle, poisonCycle, targetCycle); + assert(startCycle == curCycle); + } + + private: + template + void scheduleInternal(uint64_t& curCycle, uint64_t& schedCycle, uint8_t portMask) { + // If the window is full, advance curPos until it's not + while (touchOccupancy && occupancy == WSZ) { + advancePos(curCycle); + } + + uint32_t delay = (schedCycle > curCycle)? 
(schedCycle - curCycle) : 0; + + // Schedule, progressively increasing delay if we cannot find a slot + uint32_t curWinPos = curPos + delay; + while (curWinPos < H) { + if (trySchedule(curWin[curWinPos], portMask)) { + schedCycle = curCycle + (curWinPos - curPos); + break; + } else { + curWinPos++; + } + } + if (curWinPos >= H) { + uint32_t nextWinPos = curWinPos - H; + while (nextWinPos < H) { + if (trySchedule(nextWin[nextWinPos], portMask)) { + schedCycle = curCycle + (nextWinPos + H - curPos); + break; + } else { + nextWinPos++; + } + } + if (nextWinPos >= H) { + schedCycle = curCycle + (nextWinPos + H - curPos); + UBWinIterator it = ubWin.lower_bound(schedCycle); + while (true) { + if (it == ubWin.end()) { + WinCycle wc = {0, 0}; + bool success = trySchedule(wc, portMask); + assert(success); + ubWin.insert(std::pair(schedCycle, wc)); + } else if (it->first != schedCycle) { + WinCycle wc = {0, 0}; + bool success = trySchedule(wc, portMask); + assert(success); + ubWin.insert(it /*hint, makes insert faster*/, std::pair(schedCycle, wc)); + } else { + if (!trySchedule(it->second, portMask)) { + // Try next cycle + it++; + schedCycle++; + continue; + } // else scheduled correctly + } + break; + } + // info("Scheduled event in unbounded window, cycle %ld", schedCycle); + } + } + if (touchOccupancy) occupancy++; + } + + template + inline uint8_t trySchedule(WinCycle& wc, uint8_t portMask) { + static_assert(!(recordPort && !touchOccupancy), "Can't have recordPort and !touchOccupancy"); + if (touchOccupancy) { + uint8_t availMask = (~wc.occUnits) & portMask; + if (availMask) { + // info("PRE: occUnits=%x portMask=%x availMask=%x", wc.occUnits, portMask, availMask); + uint8_t firstAvail = __builtin_ffs(availMask) - 1; + // NOTE: This is not fair across ports. I tried round-robin scheduling, and there is no measurable difference + // (in our case, fairness comes from following program order) + if (recordPort) lastPort = firstAvail; + wc.occUnits |= 1 << firstAvail; + wc.count++; + // info("POST: occUnits=%x count=%x firstAvail=%d", wc.occUnits, wc.count, firstAvail); + } + return availMask; + } else { + // This is a shadow req, port has only 1 bit set + uint8_t availMask = (~wc.occUnits) & portMask; + wc.occUnits |= portMask; // or anyway, no conditionals + return availMask; + } + } +}; + +template +class ReorderBuffer { + private: + uint64_t buf[SZ]; + uint64_t curRetireCycle; + uint32_t curCycleRetires; + uint32_t idx; + + public: + ReorderBuffer() { + for (uint32_t i = 0; i < SZ; i++) buf[i] = 0; + idx = 0; + curRetireCycle = 0; + curCycleRetires = 1; + } + + inline uint64_t minAllocCycle() { + return buf[idx]; + } + + inline void markRetire(uint64_t minRetireCycle) { + if (minRetireCycle <= curRetireCycle) { // retire with bundle + if (curCycleRetires == W) { + curRetireCycle++; + curCycleRetires = 0; + } else { + curCycleRetires++; + } + + /* No branches version (careful, width should be power of 2...) 
+ * curRetireCycle += curCycleRetires/W; + * curCycleRetires = (curCycleRetires + 1) % W; + * NOTE: After profiling, version with branch seems faster + */ + } else { // advance + curRetireCycle = minRetireCycle; + curCycleRetires = 1; + } + + buf[idx++] = curRetireCycle; + if (idx == SZ) idx = 0; + } +}; + +// Similar to ReorderBuffer, but must have in-order allocations and retires (--> faster) +template +class CycleQueue { + private: + uint64_t buf[SZ]; + uint32_t idx; + + public: + CycleQueue() { + for (uint32_t i = 0; i < SZ; i++) buf[i] = 0; + idx = 0; + } + + inline uint64_t minAllocCycle() { + return buf[idx]; + } + + inline void markLeave(uint64_t leaveCycle) { + //assert(buf[idx] <= leaveCycle); + buf[idx++] = leaveCycle; + if (idx == SZ) idx = 0; + } +}; + +struct BblInfo; + +class OOOCore : public Core { + private: + FilterCache* l1i; + FilterCache* l1d; + + uint64_t phaseEndCycle; //next stopping point + + uint64_t curCycle; //this model is issue-centric; curCycle refers to the current issue cycle + uint64_t regScoreboard[MAX_REGISTERS]; //contains timestamp of next issue cycles where each reg can be sourced + + BblInfo* prevBbl; + + //Record load and store addresses + Address loadAddrs[256]; + Address storeAddrs[256]; + uint32_t loads; + uint32_t stores; + + uint64_t lastStoreCommitCycle; + uint64_t lastStoreAddrCommitCycle; //tracks last store addr uop, all loads queue behind it + + //LSU queues are modeled like the ROB. Surprising? Entries are grabbed in dataflow order, + //and for ordering purposes should leave in program order. In reality they are associative + //buffers, but we split the associative component from the limited-size modeling. + //NOTE: We do not model the 10-entry fill buffer here; the weave model should take care + //to not overlap more than 10 misses. + ReorderBuffer<32, 4> loadQueue; + ReorderBuffer<32, 4> storeQueue; + + uint32_t curCycleRFReads; //for RF read stalls + uint32_t curCycleIssuedUops; //for uop issue limits + + //This would be something like the Atom... (but careful, the iw probably does not allow 2-wide when configured with 1 slot) + //WindowStructure<1024, 1 /*size*/, 2 /*width*/> insWindow; //this would be something like an Atom, except all the instruction pairing business... + + //Nehalem + WindowStructure<1024, 36 /*size*/> insWindow; //NOTE: IW width is implicitly determined by the decoder, which sets the port masks according to uop type + ReorderBuffer<128, 4> rob; + + // Agner's guide says it's a 2-level pred and BHSR is 18 bits, so this is the config that makes sense; + // in practice, this is probably closer to the Pentium M's branch predictor, (see Uzelac and Milenkovic, + // ISPASS 2009), which get the 18 bits of history through a hybrid predictor (2-level + bimodal + loop) + // where a few of the 2-level history bits are in the tag. + // Since this is close enough, we'll leave it as is for now. Feel free to reverse-engineer the real thing... + // UPDATE: Now pht index is XOR-folded BSHR. This has 6656 bytes total -- not negligible, but not ridiculous. 
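+        // Presumably <NB=11, HB=18, LB=14> (see BranchPredictorPAg above): 2^11 = 2048
+        // history registers of 18 bits each, and a 2^14 = 16384-entry PHT of 2-bit counters.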
+ BranchPredictorPAg<11, 18, 14> branchPred; + + Address branchPc; //0 if last bbl was not a conditional branch + bool branchTaken; + Address branchTakenNpc; + Address branchNotTakenNpc; + + uint64_t decodeCycle; + CycleQueue<28> uopQueue; // models issue queue + + uint64_t instrs, uops, bbls, approxInstrs, mispredBranches; + +#ifdef OOO_STALL_STATS + Counter profFetchStalls, profDecodeStalls, profIssueStalls; +#endif + + // Load-store forwarding + // Just a direct-mapped array of last store cycles to 4B-wide blocks + // (i.e., indexed by (addr >> 2) & (FWD_ENTRIES-1)) + struct FwdEntry { + Address addr; + uint64_t storeCycle; + void set(Address a, uint64_t c) {addr = a; storeCycle = c;} + }; + + #define FWD_ENTRIES 32 // 2 lines, 16 4B entries/line + FwdEntry fwdArray[FWD_ENTRIES]; + + OOOCoreRecorder cRec; + + public: + OOOCore(FilterCache* _l1i, FilterCache* _l1d, g_string& _name); + + void initStats(AggregateStat* parentStat); + + uint64_t getInstrs() const; + uint64_t getPhaseCycles() const; + uint64_t getCycles() const {return cRec.getUnhaltedCycles(curCycle);} + + void contextSwitch(int32_t gid); + + virtual void join(); + virtual void leave(); + + InstrFuncPtrs GetFuncPtrs(); + + // Contention simulation interface + inline EventRecorder* getEventRecorder() {return cRec.getEventRecorder();} + void cSimStart(); + void cSimEnd(); + + private: + inline void load(Address addr); + inline void store(Address addr); + + /* NOTE: Analysis routines cannot touch curCycle directly, must use + * advance() for long jumps or insWindow.advancePos() for 1-cycle + * jumps. + * + * UPDATE: With decodeCycle, this difference is more serious. ONLY + * cSimStart and cSimEnd should call advance(). advance() is now meant + * to advance the cycle counters in the whole core in lockstep. + */ + inline void advance(uint64_t targetCycle); + + // Predicated loads and stores call this function, gets recorded as a 0-cycle op. + // Predication is rare enough that we don't need to model it perfectly to be accurate (i.e. the uops still execute, retire, etc), but this is needed for correctness. + inline void predFalseMemOp(); + + inline void branch(Address pc, bool taken, Address takenNpc, Address notTakenNpc); + + inline void bbl(Address bblAddr, BblInfo* bblInfo); + + static void LoadFunc(THREADID tid, ADDRINT addr); + static void StoreFunc(THREADID tid, ADDRINT addr); + static void PredLoadFunc(THREADID tid, ADDRINT addr, BOOL pred); + static void PredStoreFunc(THREADID tid, ADDRINT addr, BOOL pred); + static void BblFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo); + static void BranchFunc(THREADID tid, ADDRINT pc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc); +} ATTR_LINE_ALIGNED; // Take up an int number of cache lines + +#endif // OOO_CORE_H_ diff --git a/src/ooo_core_recorder.cpp b/src/ooo_core_recorder.cpp new file mode 100644 index 00000000..33e0e93b --- /dev/null +++ b/src/ooo_core_recorder.cpp @@ -0,0 +1,395 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "ooo_core_recorder.h" +#include +#include "timing_event.h" +#include "zsim.h" + +#define DEBUG_MSG(args...) +//#define DEBUG_MSG(args...) info(args) + +#define TRACE_MSG(args...) +//#define TRACE_MSG(args...) info(args) + +class OOOIssueEvent : public TimingEvent { + private: + uint64_t zllStartCycle; //minStartCycle - gapCycles, stable across readjustments of gapCycles + uint64_t startCycle; //not set up to simulate + OOOCoreRecorder* cRec; + uint64_t id; + + public: + OOOIssueEvent(uint32_t preDelay, uint64_t _zllStartCycle, OOOCoreRecorder* _cRec, int32_t domain = -1) : TimingEvent(preDelay, 0, domain), zllStartCycle(_zllStartCycle), cRec(_cRec) {} + + void simulate(uint64_t _startCycle) { + TRACE_MSG("Issue %ld zllStartCycle %ld startCycle %ld minStartCycle %ld", id, zllStartCycle, _startCycle, getMinStartCycle()); + startCycle = _startCycle; + cRec->reportIssueEventSimulated(this); + done(startCycle); + } + + virtual std::string str() { + std::string res = "rec: "; + res += cRec->getName().c_str(); + return res; + } + + friend class OOOCoreRecorder; +}; + + +class OOODispatchEvent : public TimingEvent { + private: + uint64_t zllStartCycle; //minStartCycle - gapCycles, stable across readjustments of gapCycles + uint64_t id; + + public: + OOODispatchEvent(uint64_t preDelay, uint64_t _zllStartCycle, int32_t domain = -1) : TimingEvent(preDelay, 0, domain), zllStartCycle(_zllStartCycle) {} + + void simulate(uint64_t startCycle) { + TRACE_MSG("Dispatch %ld zllStartCycle %ld startCycle %ld minStartCycle %ld", id, zllStartCycle, startCycle, getMinStartCycle()); + done(startCycle); + } + + friend class OOOCoreRecorder; +}; + +class OOORespEvent : public TimingEvent { + private: + uint64_t zllStartCycle; //minStartCycle - gapCycles, stable across readjustments of gapCycles + volatile uint64_t startCycle; + OOOCoreRecorder* cRec; + uint64_t id; + + public: + OOORespEvent(uint64_t preDelay, uint64_t _zllStartCycle, OOOCoreRecorder* _cRec, int32_t domain = -1) : TimingEvent(preDelay, 0, domain), zllStartCycle(_zllStartCycle), startCycle(0), cRec(_cRec) {} + + void simulate(uint64_t _startCycle) { + startCycle = _startCycle; + TRACE_MSG("Resp %ld zllStartCycle %ld startCycle %ld minStartCycle %ld", id, zllStartCycle, startCycle, getMinStartCycle()); + done(_startCycle); + } + + friend class OOOCoreRecorder; +}; + +//For the futureResponses min-heap +bool OOOCoreRecorder::CompareRespEvents::operator()(OOORespEvent* e1, OOORespEvent* e2) const { + return (e1->zllStartCycle > e2->zllStartCycle); +} + + + +OOOCoreRecorder::OOOCoreRecorder(uint32_t _domain, g_string& _name) + : domain(_domain), name(_name + "-rec") +{ + state = HALTED; + gapCycles = 0; + eventRecorder.setGapCycles(gapCycles); + + lastUnhaltedCycle = 0; + totalGapCycles = 0; + totalHaltedCycles = 0; + + curId = 0; 
+ + lastEvProduced = NULL; + lastEvSimulated = NULL; +} + + +uint64_t OOOCoreRecorder::notifyJoin(uint64_t curCycle) { + if (state == HALTED) { + assert(!lastEvProduced); + curCycle = zinfo->globPhaseCycles; //start at beginning of the phase + + totalGapCycles += gapCycles; + gapCycles = 0; + eventRecorder.setGapCycles(gapCycles); + assert(lastUnhaltedCycle <= curCycle); + totalHaltedCycles += curCycle - lastUnhaltedCycle; + + lastEvProduced = new (eventRecorder) OOOIssueEvent(0, curCycle - gapCycles, this, domain); + lastEvProduced->id = curId++; + lastEvProduced->setMinStartCycle(curCycle); + lastEvProduced->queue(curCycle); + eventRecorder.setStartSlack(0); + DEBUG_MSG("[%s] Joined, was HALTED, curCycle %ld halted %ld", name.c_str(), curCycle, totalHaltedCycles); + } else if (state == DRAINING) { + assert(curCycle >= zinfo->globPhaseCycles); //should not have gone out of sync... + DEBUG_MSG("[%s] Joined, was DRAINING, curCycle %ld", name.c_str(), curCycle); + assert(lastEvProduced); + addIssueEvent(curCycle); + } else { + panic("[%s] Invalid state %d on join()", name.c_str(), state); + } + + //Common actions + state = RUNNING; + return curCycle; +} + +//Properly stitches a previous event against prior events properly +//After the call, lastEvProduced is updated to this event +void OOOCoreRecorder::addIssueEvent(uint64_t evCycle) { + assert(lastEvProduced); + uint64_t zllCycle = evCycle - gapCycles; + assert_msg(zllCycle >= lastEvProduced->zllStartCycle, "zllCycle %ld last %ld", zllCycle, lastEvProduced->zllStartCycle); + OOOIssueEvent* ev = new (eventRecorder) OOOIssueEvent(0, zllCycle, this, domain); + ev->id = curId++; + // 1. Link with prior (<) outstanding responses + uint64_t maxCycle = 0; + while (!futureResponses.empty()) { + OOORespEvent* firstResp = futureResponses.top(); + if (firstResp->zllStartCycle > zllCycle) break; + //HACK: Some responses get reordered because gapCycles goes with issue events + //FIXME: This disables bound-weave pipelining + //FIXME: The way to fix this is to introduce ordering dependences between events + // (which to a good extent was done; this needs a pass, but I'm pretty sure it's a non-issue at this point) + //NOTE (2013-04-03): Looks like these are all old, these warns do not happen... + //NOTE (2013-04-08): Yes, these warns happen with tiny phases, and they are OK (should not be warns, probably we should scan and cleanup futureResponses at cSimEnd) + if (firstResp->startCycle == 0) { + TRACE_MSG("linked Issue zll %ld with Resp zll %ld", zllCycle, firstResp->zllStartCycle); + firstResp->addChild(ev, eventRecorder); + assert(maxCycle <= firstResp->zllStartCycle); + assert(firstResp->zllStartCycle >= lastEvProduced->zllStartCycle); + maxCycle = firstResp->zllStartCycle; + } else { + warn("Skipping linkage with already simulated response"); + } + futureResponses.pop(); + } +#if 0 + //The superqueue can only have 10 misses in flight... + //NOTE: In practice, this makes zero difference on all workloads + while (futureResponses.size() > 10) { + OOORespEvent* firstResp = futureResponses.top(); + if (firstResp->startCycle != 0) panic("Guru meditation error"); + firstResp->addChild(ev, eventRecorder); // our ev lower bound is too low, but this should be OK + //info("SQ full, linking %ld %ld", zllCycle, firstResp->zllStartCycle); + assert(maxCycle <= firstResp->zllStartCycle); + maxCycle = firstResp->zllStartCycle; + futureResponses.pop(); + } +#endif + uint32_t preDelay = maxCycle? ((maxCycle < zllCycle)? 
(zllCycle - maxCycle) : 0) : 0; + ev->setPreDelay(preDelay); + + //2. Link with prior issue event + //We need a delay of at least the min-lat delay to avoid negative skews + uint32_t issueDelay = zllCycle - lastEvProduced->zllStartCycle - preDelay; + DelayEvent* dIssue = new (eventRecorder) DelayEvent(issueDelay); + dIssue->setMinStartCycle(lastEvProduced->getMinStartCycle()); + lastEvProduced->addChild(dIssue, eventRecorder)->addChild(ev, eventRecorder); + + TRACE_MSG("linked Issue zll %ld with prev Issue, delay %d", zllCycle, issueDelay); + + ev->setMinStartCycle(evCycle); + lastEvProduced = ev; +} + +void OOOCoreRecorder::notifyLeave(uint64_t curCycle) { + assert_msg(state == RUNNING, "invalid state = %d on leave", state); + state = DRAINING; + assert(lastEvProduced); + // Cover delay to curCycle + uint64_t zllCycle = curCycle - gapCycles; + assert(zllCycle >= lastEvProduced->zllStartCycle); + addIssueEvent(curCycle); + + TRACE_MSG("LEAVING, curCycle %ld", curCycle); + DEBUG_MSG("[%s] Left, curCycle %ld", name.c_str(), curCycle); +} + +void OOOCoreRecorder::recordAccess(uint64_t curCycle, uint64_t dispatchCycle, uint64_t respCycle) { + assert(eventRecorder.numRecords() <= 2); + + //If numRecords == 2, we have PUT (at 0) + GET (at 1) ; if 1, we have a single GET (at record 0) + + //1. Handle GET + uint32_t getIdx = eventRecorder.numRecords()-1; + TimingRecord tr = eventRecorder.getRecord(getIdx); + assert(tr.type == GETX || tr.type == GETS); + + //info("Handling: curCycle %ld ev(reqCycle %ld respCycle %ld) respCycle %ld", curCycle, tr.reqCycle, tr.respCycle, respCycle); + + addIssueEvent(curCycle); + + //Delay + DelayEvent* dDisp = new (eventRecorder) DelayEvent(dispatchCycle - curCycle); + dDisp->setMinStartCycle(curCycle); + + + //Dispatch event + OOODispatchEvent* dispEv = new (eventRecorder) OOODispatchEvent(/*dispatchCycle - curCycle*/ 0, dispatchCycle); + dispEv->setMinStartCycle(dispatchCycle); + dispEv->id = curId++; + + uint64_t zllDispatchCycle = dispatchCycle - gapCycles; +#if 1 + //Traverse min heap, link with preceding resps... + g_vector& rVec = *((g_vector*) (&futureResponses)); //FIXME!!! Unsafe, works just because of prio_queue's layout; should use a tree or write a traverse_heap function... + for (uint32_t i = 0; i < rVec.size(); i++) { + if (rVec[i]->zllStartCycle < zllDispatchCycle) { + DelayEvent* dl = new (eventRecorder) DelayEvent(zllDispatchCycle - rVec[i]->zllStartCycle); + rVec[i]->addChild(dl, eventRecorder)->addChild(dispEv, eventRecorder); + } + } +#endif + //Link request + DelayEvent* dUp = new (eventRecorder) DelayEvent(tr.reqCycle - dispatchCycle); //TODO: remove, postdelay in dispatch... + dUp->setMinStartCycle(dispatchCycle); + lastEvProduced->addChild(dDisp, eventRecorder)->addChild(dispEv, eventRecorder)->addChild(dUp, eventRecorder)->addChild(tr.startEvent, eventRecorder); + + //Link response + uint32_t downDelay = respCycle - tr.respCycle; + OOORespEvent* respEvent = new (eventRecorder) OOORespEvent(downDelay, respCycle - gapCycles, this, domain); + respEvent->id = curId++; + respEvent->setMinStartCycle(respCycle); + tr.endEvent->addChild(respEvent, eventRecorder); + TRACE_MSG("Adding resp zllCycle %ld delay %ld", respCycle - gapCycles, respCycle-curCycle); + futureResponses.push(respEvent); + + //2. 
If we have it, handle PUT + if (eventRecorder.numRecords() == 2) { + TimingRecord trPut = eventRecorder.getRecord(0); + assert(trPut.type == PUTX || trPut.type == PUTS); + + //Link request + DelayEvent* putUp = new (eventRecorder) DelayEvent(trPut.reqCycle-curCycle); + putUp->setMinStartCycle(curCycle); + lastEvProduced->addChild(putUp, eventRecorder)->addChild(trPut.startEvent, eventRecorder); + + //trPut's endEvent not linked to anything, it's a wback and we should not capture it + } + + //For multi-domain + lastEvProduced->produceCrossings(&eventRecorder); + eventRecorder.getCrossingStack().clear(); + + eventRecorder.clearRecords(); +} + + +uint64_t OOOCoreRecorder::cSimStart(uint64_t curCycle) { + if (state == HALTED) return curCycle; //nothing to do + + DEBUG_MSG("[%s] Cycle %ld cSimStart %d", name.c_str(), curCycle, state); + + uint64_t nextPhaseCycle = zinfo->globPhaseCycles + zinfo->phaseLength; + + uint64_t zllCycle = curCycle - gapCycles; + uint64_t zllNextPhaseCycle = nextPhaseCycle - gapCycles; + + // If needed, bring us to the next phase + if (state == RUNNING) { + assert(curCycle > nextPhaseCycle); + assert(lastEvProduced->zllStartCycle <= zllCycle); + + // Taper phase if it's not already tapered + if (lastEvProduced->zllStartCycle < zllNextPhaseCycle) { + addIssueEvent(nextPhaseCycle); + } + } else if (state == DRAINING) { // add no event --- that's how we detect we're done draining + //Drain futureResponses... we could be a bit more exact by doing partial drains, + //but if the thread has not joined back by the end of phase, chances are this is a long leave + while (!futureResponses.empty()) futureResponses.pop(); + if (curCycle < nextPhaseCycle) curCycle = nextPhaseCycle; // bring cycle up + } + return curCycle; +} + +uint64_t OOOCoreRecorder::cSimEnd(uint64_t curCycle) { + if (state == HALTED) return curCycle; //nothing to do + + DEBUG_MSG("[%s] Cycle %ld done state %d", name.c_str(), curCycle, state); + + assert(lastEvSimulated); + + // Adjust curCycle to account for contention simulation delay + + // In our current clock, when did the last event start (1) before contention simulation, and (2) after contention simulation + uint64_t lastEvCycle1 = lastEvSimulated->zllStartCycle + gapCycles; //we add gapCycles because zllStartCycle is in zll clocks + uint64_t lastEvCycle2 = lastEvSimulated->startCycle; + + assert(lastEvCycle1 <= curCycle); + assert_msg(lastEvCycle2 <= curCycle, "[%s] lec2 %ld cc %ld, state %d", name.c_str(), lastEvCycle2, curCycle, state); + if (unlikely(lastEvCycle1 > lastEvCycle2)) panic("[%s] Contention simulation introduced a negative skew, curCycle %ld, lc1 %ld lc2 %ld, gapCycles %ld", name.c_str(), curCycle, lastEvCycle1, lastEvCycle2, gapCycles); + + uint64_t skew = lastEvCycle2 - lastEvCycle1; + + // Skew clock + // Note that by adding to gapCycles, we keep the zll clock (defined as curCycle - gapCycles) constant. + // We use the zll clock to translate origStartCycle correctly, even if it's coming from several phases back. + curCycle += skew; + gapCycles += skew; + eventRecorder.setGapCycles(gapCycles); + //We deal with all our events in zllCycles, so no need to update any event counts + + //NOTE: Suppose that we had a really long event, so long that in the next phase, lastEvSimulated is still the same. In this case, skew will be 0, so we do not need to remove it. 
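+    // Worked example (illustrative numbers): if the last simulated issue event had
+    // lastEvCycle1 = 1000 before contention simulation and ran at lastEvCycle2 = 1040,
+    // then skew = 40; curCycle and gapCycles both grow by 40, so the zll clock
+    // (curCycle - gapCycles) stays put, as intended.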
+ + DEBUG_MSG("[%s] curCycle %ld zllCurCycle %ld lec1 %ld lec2 %ld skew %ld", name.c_str(), curCycle, curCycle-gapCycles, lastEvCycle1, lastEvCycle2, skew); + + /* Advance the recorder: we set the current dead cycle as the last event's cycle, + * but we mark any live events with some slack (we need the slack to account for events + * that linger a bit longer). + */ + //eventRecorder.advance(curCycle + zinfo->phaseLength + 10000 +100000, lastEvCycle2); + eventRecorder.advance(curCycle - gapCycles + zinfo->phaseLength + 100000, lastEvSimulated->zllStartCycle); + + if (!lastEvSimulated->getNumChildren()) { + //if we were RUNNING, the phase would have been tapered off + assert_msg(state == DRAINING, "[%s] state %d lastEvSimulated %p (startCycle %ld) curCycle %ld", name.c_str(), state, lastEvSimulated, lastEvSimulated->startCycle, curCycle); + assert(lastEvProduced == lastEvSimulated); + lastUnhaltedCycle = lastEvSimulated->startCycle; //the taper is a 0-delay event + assert(lastEvSimulated->getPostDelay() == 0); + state = HALTED; + DEBUG_MSG("[%s] lastEvSimulated reached (startCycle %ld), DRAINING -> HALTED", name.c_str(), lastEvSimulated->startCycle); + + lastEvSimulated = NULL; + lastEvProduced = NULL; + assert(futureResponses.empty()); + // This works (because we flush on leave()) but would be inaccurate if we called leave() very frequently; now leave() only happens on blocking syscalls though + } + return curCycle; +} + +void OOOCoreRecorder::reportIssueEventSimulated(OOOIssueEvent* ev) { + lastEvSimulated = ev; + eventRecorder.setStartSlack(ev->startCycle - ev->zllStartCycle); +} + +//Stats +uint64_t OOOCoreRecorder::getUnhaltedCycles(uint64_t curCycle) const { + uint64_t cycle = MAX(curCycle, zinfo->globPhaseCycles); + uint64_t haltedCycles = totalHaltedCycles + ((state == HALTED)? (cycle - lastUnhaltedCycle) : 0); + return cycle - haltedCycles; +} + +uint64_t OOOCoreRecorder::getContentionCycles() const { + return totalGapCycles + gapCycles; +} + diff --git a/src/ooo_core_recorder.h b/src/ooo_core_recorder.h new file mode 100644 index 00000000..ba4a8d60 --- /dev/null +++ b/src/ooo_core_recorder.h @@ -0,0 +1,117 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef OOO_CORE_RECORDER_H_ +#define OOO_CORE_RECORDER_H_ + +#include +#include +#include "event_recorder.h" +#include "g_std/g_string.h" +#include "g_std/g_vector.h" + +class OOOIssueEvent; +class OOORespEvent; + +class OOOCoreRecorder { + private: + typedef enum { + HALTED, //Not scheduled, no events left. Initial state. join() --> RUNNING + RUNNING, //Scheduled. leave() --> DRAINING + DRAINING //Not scheduled, but events remain. join() --> RUNNING; all events done --> HALTED + } State; + + uint64_t curId; + + State state; + + /* There are 2 clocks: + * - phase 1 clock = curCycle and is maintained by the bound phase contention-free core model + * - phase 2 clock = curCycle - gapCycles is the zll clock + * We maintain gapCycles, and only get curCycle on function calls. Some of those calls also + * need to change curCycle, so they just return an updated version that the bound phase model + * needs to take. However, **we have no idea about curCycle outside of those calls**. + * Defend this invariant with your life or you'll find this horrible to reason about. + */ + uint64_t gapCycles; //phase 2 clock == curCycle - gapCycles + + //Event bookkeeping + EventRecorder eventRecorder; + + //Here goes what's different from CoreRecorder + + //Recording phase + OOOIssueEvent* lastEvProduced; + + struct CompareRespEvents : public std::binary_function { + bool operator()(OOORespEvent* lhs, OOORespEvent* rhs) const; + }; + + std::priority_queue, CompareRespEvents> futureResponses; + + OOOIssueEvent* lastEvSimulated; + + //Cycle accounting + uint64_t totalGapCycles; //does not include gapCycles + uint64_t totalHaltedCycles; //does not include cycles since last transition to HALTED + uint64_t lastUnhaltedCycle; //set on transition to HALTED + + uint32_t domain; + g_string name; + + public: + OOOCoreRecorder(uint32_t _domain, g_string& _name); + + //Methods called in the bound phase + uint64_t notifyJoin(uint64_t curCycle); //returns th updated curCycle, if it needs updating + void notifyLeave(uint64_t curCycle); + + //This better be inlined 100% of the time, it's called on EVERY access + inline void record(uint64_t curCycle, uint64_t dispatchCycle, uint64_t respCycle) { + if (unlikely(eventRecorder.numRecords())) recordAccess(curCycle, dispatchCycle, respCycle); + } + + //Methods called between the bound and weave phases + uint64_t cSimStart(uint64_t curCycle); //returns updated curCycle + uint64_t cSimEnd(uint64_t curCycle); //returns updated curCycle + + //Methods called in the weave phase + inline void reportIssueEventSimulated(OOOIssueEvent* ev); + + //Misc + inline EventRecorder* getEventRecorder() {return &eventRecorder;} + + //Stats (called fully synchronized) + uint64_t getUnhaltedCycles(uint64_t curCycle) const; + uint64_t getContentionCycles() const; + + const g_string& getName() const {return name;} + + private: + void recordAccess(uint64_t curCycle, uint64_t dispatchCycle, uint64_t respCycle); + void addIssueEvent(uint64_t evCycle); +}; + +#endif // OOO_CORE_RECORDER_H_ diff --git a/src/pad.h b/src/pad.h new file mode 100644 index 00000000..85149a0a --- /dev/null +++ b/src/pad.h @@ -0,0 +1,44 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef PAD_H_ +#define PAD_H_ + +/* Padding macros to remove false sharing */ + +//Line size, in chars (bytes). We could make it configurable through a define +#define CACHE_LINE_BYTES 64 + +#define _PAD_CONCAT(x, y) x ## y +#define PAD_CONCAT(x, y) _PAD_CONCAT(x, y) + +#define PAD() unsigned char PAD_CONCAT(pad_line, __LINE__)[CACHE_LINE_BYTES] //assuming classes are defined over one file, this should generate unique names + +//Pad remainder to line size, use as e.g. PAD(sizeof(uint32)) will produce 60B of padding +#define PAD_SZ(sz) unsigned char PAD_CONCAT(pad_sz_line, __LINE__)[CACHE_LINE_BYTES - ((sz) % CACHE_LINE_BYTES)] + +#define ATTR_LINE_ALIGNED __attribute__((aligned(CACHE_LINE_BYTES))) + +#endif // PAD_H_ diff --git a/src/part_repl_policies.h b/src/part_repl_policies.h new file mode 100644 index 00000000..ebf9efce --- /dev/null +++ b/src/part_repl_policies.h @@ -0,0 +1,645 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef PART_REPL_POLICIES_H_ +#define PART_REPL_POLICIES_H_ + +#include +#include +#include "event_queue.h" +#include "mtrand.h" +#include "partition_mapper.h" +#include "partitioner.h" +#include "repl_policies.h" + +struct PartInfo { + uint64_t size; //in lines + uint64_t targetSize; //in lines + + Counter profHits; + Counter profMisses; + Counter profSelfEvictions; // from our same partition + Counter profExtEvictions; // from other partitions (if too large, we're probably doing something wrong, e.g., too small an adjustment period) +}; + +class PartReplPolicy : public virtual ReplPolicy { + protected: + PartitionMonitor* monitor; + PartMapper* mapper; + + public: + PartReplPolicy(PartitionMonitor* _monitor, PartMapper* _mapper) : monitor(_monitor), mapper(_mapper) {} + ~PartReplPolicy() { delete monitor; } + + virtual void setPartitionSizes(const uint32_t* sizes) = 0; + + PartitionMonitor* getMonitor() { return monitor; } + const PartitionMonitor* getMonitor() const { return monitor; } +}; + +class WayPartReplPolicy : public PartReplPolicy, public LegacyReplPolicy { + private: + PartInfo* partInfo; + uint32_t partitions; + + uint32_t totalSize; + uint32_t waySize; + uint32_t ways; + + struct WayPartInfo { + Address addr; //FIXME: This is redundant due to the replacement policy interface + uint64_t ts; //timestamp, >0 if in the cache, == 0 if line is empty + uint32_t p; + }; + + WayPartInfo* array; + + uint32_t* wayPartIndex; //stores partition of each way + + bool testMode; + + PAD(); + + //Replacement process state (RW) + int32_t bestId; + uint32_t candIdx; + uint32_t incomingLinePart; //to what partition does the incoming line belong? + Address incomingLineAddr; + + //Globally incremented, but bears little significance per se + uint64_t timestamp; + + public: + WayPartReplPolicy(PartitionMonitor* _monitor, PartMapper* _mapper, uint64_t _lines, uint32_t _ways, bool _testMode) + : PartReplPolicy(_monitor, _mapper), totalSize(_lines), ways(_ways), testMode(_testMode) + { + partitions = mapper->getNumPartitions(); + waySize = totalSize/ways; + assert(waySize*ways == totalSize); //no partial ways... + + partInfo = gm_calloc(partitions); + for (uint32_t i = 0; i < partitions; i++) { + partInfo[i].targetSize = 0; + + //Need placement new, these object have vptr + new (&partInfo[i].profHits) Counter; + new (&partInfo[i].profMisses) Counter; + new (&partInfo[i].profSelfEvictions) Counter; + new (&partInfo[i].profExtEvictions) Counter; + } + + array = gm_calloc(totalSize); //all have ts, p == 0... + partInfo[0].size = totalSize; // so partition 0 has all the lines + + wayPartIndex = gm_calloc(ways); + for (uint32_t w = 0; w < ways; w++) { + //Do initial way assignment, partitioner has no profiling info yet + uint32_t p = w*partitions/ways; // in [0, ..., partitions-1] + wayPartIndex[w] = p; + partInfo[p].targetSize += waySize; + } + + candIdx = 0; + bestId = -1; + timestamp = 1; + } + + void initStats(AggregateStat* parentStat) { + //AggregateStat* partsStat = new AggregateStat(true /*this is a regular aggregate, ONLY PARTITION STATS GO IN HERE*/); + AggregateStat* partsStat = new AggregateStat(false); //don't make it a regular aggregate... it gets compacted in periodic stats and becomes useless! 
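To make the comment above concrete, the loop below creates one child aggregate per partition, so the resulting stat subtree looks roughly as sketched here (names come from the init() calls; the exact dump layout depends on the stats backend and is an assumption):

    //   part/
    //     part-0/  sz, tgtSz, hits, misses, selfEvs, extEvs
    //     part-1/  sz, tgtSz, hits, misses, selfEvs, extEvs
    //     ...
    // Constructing the parent with AggregateStat(false) keeps this per-partition breakdown; per the
    // comment, a regular aggregate may be compacted in the periodic stats and lose that breakdown.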
+ partsStat->init("part", "Partition stats"); + for (uint32_t p = 0; p < partitions; p++) { + std::stringstream pss; + pss << "part-" << p; + AggregateStat* partStat = new AggregateStat(); + partStat->init(gm_strdup(pss.str().c_str()), "Partition stats"); + ProxyStat* pStat; + pStat = new ProxyStat(); pStat->init("sz", "Actual size", &partInfo[p].size); partStat->append(pStat); + pStat = new ProxyStat(); pStat->init("tgtSz", "Target size", &partInfo[p].targetSize); partStat->append(pStat); + partInfo[p].profHits.init("hits", "Hits"); partStat->append(&partInfo[p].profHits); + partInfo[p].profMisses.init("misses", "Misses"); partStat->append(&partInfo[p].profMisses); + partInfo[p].profSelfEvictions.init("selfEvs", "Evictions caused by us"); partStat->append(&partInfo[p].profSelfEvictions); + partInfo[p].profExtEvictions.init("extEvs", "Evictions caused by others"); partStat->append(&partInfo[p].profExtEvictions); + + partsStat->append(partStat); + } + parentStat->append(partsStat); + } + + void update(uint32_t id, const MemReq* req) { + WayPartInfo* e = &array[id]; + if (e->ts > 0) { //this is a hit update + partInfo[e->p].profHits.inc(); + } else { //post-miss update, old line has been removed, this is empty + uint32_t oldPart = e->p; + uint32_t newPart = incomingLinePart; + if (oldPart != newPart) { + partInfo[oldPart].size--; + partInfo[oldPart].profExtEvictions.inc(); + partInfo[newPart].size++; + } else { + partInfo[oldPart].profSelfEvictions.inc(); + } + partInfo[newPart].profMisses.inc(); + e->p = newPart; + } + e->ts = timestamp++; + + //Update partitioner... + monitor->access(e->p, e->addr); + } + + void startReplacement(const MemReq* req) { + assert(candIdx == 0); + assert(bestId == -1); + incomingLinePart = mapper->getPartition(*req); + incomingLineAddr = req->lineAddr; + } + + void recordCandidate(uint32_t id) { + assert(candIdx < ways); + WayPartInfo* c = &array[id]; //candidate info + WayPartInfo* best = (bestId >= 0)? &array[bestId] : NULL; + uint32_t way = candIdx++; + //In test mode, this works as LRU + if (testMode || wayPartIndex[way] == incomingLinePart) { //this is a way we can fill + if (best == NULL) { + bestId = id; + } else { + //NOTE: This is actually not feasible without tagging. But what IS feasible is to stop updating the LRU position on new fills. We could kill this, and profile the differences. 
+ if ( testMode || (c->p == incomingLinePart && best->p == incomingLinePart) ) { + if (c->ts < best->ts) bestId = id; + } else if (c->p == incomingLinePart && best->p != incomingLinePart) { + //c wins + } else if (c->p != incomingLinePart && best->p == incomingLinePart) { + //c loses + bestId = id; + } else { //none in our partition, this should be transient but at least enforce LRU + if (c->ts < best->ts) bestId = id; + } + } + } + } + + uint32_t getBestCandidate() { + assert(bestId >= 0); + return bestId; + } + + void replaced(uint32_t id) { + candIdx = 0; + bestId = -1; + array[id].ts = 0; + array[id].addr = incomingLineAddr; + //info("0x%lx", incomingLineAddr); + } + + private: + void setPartitionSizes(const uint32_t* waysPart) { + uint32_t curWay = 0; + for (uint32_t p = 0; p < partitions; p++) { + partInfo[p].targetSize = totalSize*waysPart[p]/ways; +#if UMON_INFO + info("part %d assigned %d ways", p, waysPart[p]); +#endif + for (uint32_t i = 0; i < waysPart[p]; i++) wayPartIndex[curWay++] = p; + } +#if UMON_INFO + for (uint32_t w = 0; w < ways; w++) info("wayPartIndex[%d] = %d", w, wayPartIndex[w]); +#endif + assert(curWay == ways); + } +}; + +#define VANTAGE_8BIT_BTS 1 //1 for 8-bit coarse-grain timestamps, 0 for 64-bit coarse-grain (no wrap-arounds) + +/* Vantage replacement policy. Please refer to our ISCA 2011 paper for implementation details. + */ +class VantageReplPolicy : public PartReplPolicy, public LegacyReplPolicy { + private: + /* NOTE: This implementation uses 64-bit coarse-grain TSs for simplicity. You have a choice of constraining + * these to work 8-bit timestamps by setting VANTAGE_8BIT_BTS to 1. Note that this code still has remnants of + * the 64-bit global fine-grain timestamps used to simulate perfect LRU. They are not using for anything but profiling. + */ + uint32_t partitions; + uint32_t totalSize; + uint32_t assoc; + + struct VantagePartInfo : public PartInfo { + uint64_t curBts; //per-partition coarse-grain timestamp (CurrentTS in paper) + uint32_t curBtsHits; //hits on current timestamp (AccessCounter in paper) + + uint64_t setpointBts; // setpoint coarse-grain timestamp (SetpointTS in paper) + uint64_t setpointAdjs; // setpoint adjustments so far, just for profiling purposes + + uint32_t curIntervalIns; // insertions in current interval. Not currently used. + uint32_t curIntervalDems; // CandsDemoted in paper + uint32_t curIntervalCands; // CandsSeen in paper + + uint64_t extendedSize; + + uint64_t longTermTargetSize; //in lines + + Counter profDemotions; + Counter profEvictions; + Counter profSizeCycles; + Counter profExtendedSizeCycles; + }; + + VantagePartInfo* partInfo; + + struct LineInfo { + Address addr; //FIXME: This is redundant due to the replacement policy interface + uint64_t ts; //timestamp, >0 if in the cache, == 0 if line is empty (little significance otherwise) + uint64_t bts; //coarse-grain per-partition timestamp + uint32_t p; //partition ID + uint32_t op; //original partition id: same as partition id when in partition, but does not change when moved to FFA (unmanaged region) + }; + + LineInfo* array; + + Counter profPromotions; + Counter profUpdateCycles; + + //Repl process stuff + uint32_t* candList; + uint32_t candIdx; + Address incomingLineAddr; + + //Globally incremented, but bears little significance per se + uint64_t timestamp; + + double partPortion; //how much of the cache do we devote to the partition's target sizes? + double partSlack; //how much the aperture curve reacts to "cushion" the load. 
partSlack+targetSize sets aperture to 1.0 + double maxAperture; //Maximum aperture allowed in each partition, must be < 1.0 + uint32_t partGranularity; //number of partitions that UMON/LookaheadPartitioner expects + + uint64_t lastUpdateCycle; //for cumulative size counter updates; could be made event-driven + + MTRand rng; + bool smoothTransients; //if set, keeps all growing partitions at targetSz = actualSz + 1 until they reach their actual target; takes space away slowly from the shrinking partitions instead of aggressively demoting them to the unmanaged region, which turns the whole thing into a shared cache if transients are frequent + + public: + VantageReplPolicy(PartitionMonitor* _monitor, PartMapper* _mapper, uint64_t _lines, uint32_t _assoc, uint32_t partPortionPct, + uint32_t partSlackPct, uint32_t maxAperturePct, uint32_t _partGranularity, bool _smoothTransients) + : PartReplPolicy(_monitor, _mapper), totalSize(_lines), assoc(_assoc), rng(0xABCDE563F), smoothTransients(_smoothTransients) + { + partitions = mapper->getNumPartitions(); + + assert(partPortionPct <= 100); + assert(partSlackPct <= 100); + assert(maxAperturePct <= 100); + + partPortion = ((double)partPortionPct)/100.0; + partSlack = ((double)partSlackPct)/100.0; + maxAperture = ((double)maxAperturePct)/100.0; + partGranularity = _partGranularity; // NOTE: partitioning at too fine granularity (+1K buckets) overwhelms the lookahead partitioner + + uint32_t targetManagedSize = (uint32_t)(((double)totalSize)*partPortion); + + partInfo = gm_calloc(partitions+1); // last one is unmanaged region + + for (uint32_t i = 0; i <= partitions; i++) { + partInfo[i].targetSize = targetManagedSize/partitions; + partInfo[i].longTermTargetSize = partInfo[i].targetSize; + partInfo[i].extendedSize = 0; + + //Need placement new, these objects have vptr + new (&partInfo[i].profHits) Counter; + new (&partInfo[i].profMisses) Counter; + new (&partInfo[i].profSelfEvictions) Counter; + new (&partInfo[i].profExtEvictions) Counter; + new (&partInfo[i].profDemotions) Counter; + new (&partInfo[i].profEvictions) Counter; + new (&partInfo[i].profSizeCycles) Counter; + new (&partInfo[i].profExtendedSizeCycles) Counter; + } + + //unmanaged region should not use these + partInfo[partitions].targetSize = 0; + partInfo[partitions].longTermTargetSize = 0; + + array = gm_calloc(totalSize); + + //Initially, assign all the lines to the unmanaged region + partInfo[partitions].size = totalSize; + partInfo[partitions].extendedSize = totalSize; + for (uint32_t i = 0; i < totalSize; i++) { + array[i].p = partitions; + array[i].op = partitions; + } + + candList = gm_calloc(assoc); + candIdx = 0; + timestamp = 1; + + lastUpdateCycle = 0; + + info("Vantage RP: %d partitions, managed portion %f Amax %f slack %f", partitions, partPortion, maxAperture, partSlack); + } + + void initStats(AggregateStat* parentStat) { + AggregateStat* rpStat = new AggregateStat(); + rpStat->init("part", "Vantage replacement policy stats"); + ProxyStat* pStat; + profPromotions.init("ffaProms", "Promotions from unmanaged region"); rpStat->append(&profPromotions); + profUpdateCycles.init("updCycles", "Cycles of updates experienced on size-cycle counters"); rpStat->append(&profUpdateCycles); + for (uint32_t p = 0; p <= partitions; p++) { + std::stringstream pss; + pss << "part-" << p; + AggregateStat* partStat = new AggregateStat(); + partStat->init(gm_strdup(pss.str().c_str()), "Partition stats"); + + pStat = new ProxyStat(); pStat->init("sz", "Actual size", &partInfo[p].size); 
partStat->append(pStat); + pStat = new ProxyStat(); pStat->init("xSz", "Extended actual size, including lines currently demoted to FFA", &partInfo[p].extendedSize); partStat->append(pStat); + //NOTE: To avoid breaking scripts, I've changed tgtSz to track longTermTargetSize + //FIXME: Code and stats should be named similarly + pStat = new ProxyStat(); pStat->init("tgtSz", "Target size", &partInfo[p].longTermTargetSize); partStat->append(pStat); + pStat = new ProxyStat(); pStat->init("stTgtSz", "Short-term target size (used with smoothedTransients)", &partInfo[p].targetSize); partStat->append(pStat); + partInfo[p].profHits.init("hits", "Hits"); partStat->append(&partInfo[p].profHits); + partInfo[p].profMisses.init("misses", "Misses"); partStat->append(&partInfo[p].profMisses); + //Vantage does not do evictions directly, these do not make sense and are not used + //partInfo[p].profSelfEvictions.init("selfEvs", "Evictions caused by us"); partStat->append(&partInfo[p].profSelfEvictions); + //partInfo[p].profExtEvictions.init("extEvs", "Evictions caused by others"); partStat->append(&partInfo[p].profExtEvictions); + partInfo[p].profDemotions.init("dems", "Demotions"); partStat->append(&partInfo[p].profDemotions); + partInfo[p].profEvictions.init("evs", "Evictions"); partStat->append(&partInfo[p].profEvictions); + partInfo[p].profSizeCycles.init("szCycles", "Cumulative per-cycle sum of sz"); partStat->append(&partInfo[p].profSizeCycles); + partInfo[p].profExtendedSizeCycles.init("xSzCycles", "Cumulative per-cycle sum of xSz"); partStat->append(&partInfo[p].profExtendedSizeCycles); + + rpStat->append(partStat); + } + parentStat->append(rpStat); + } + + void update(uint32_t id, const MemReq* req) { + if (unlikely(zinfo->globPhaseCycles > lastUpdateCycle)) { + //Update size-cycle counter stats + uint64_t diff = zinfo->globPhaseCycles - lastUpdateCycle; + for (uint32_t p = 0; p <= partitions; p++) { + partInfo[p].profSizeCycles.inc(diff*partInfo[p].size); + partInfo[p].profExtendedSizeCycles.inc(diff*partInfo[p].extendedSize); + } + profUpdateCycles.inc(diff); + lastUpdateCycle = zinfo->globPhaseCycles; + } + + LineInfo* e = &array[id]; + if (e->ts > 0) { + if (e->p == partitions) { //this is an unmanaged region promotion + e->p = mapper->getPartition(*req); + profPromotions.inc(); + partInfo[e->p].curIntervalIns++; + partInfo[e->p].size++; + partInfo[partitions].size--; + } + e->ts = timestamp++; + partInfo[e->p].profHits.inc(); + } else { //post-miss update, old one has been removed, this is empty + e->ts = timestamp++; + partInfo[e->p].size--; + partInfo[e->p].profEvictions.inc(); + partInfo[e->op].extendedSize--; + e->p = mapper->getPartition(*req); + e->op = e->p; + partInfo[e->p].curIntervalIns++; + partInfo[e->p].size++; + partInfo[e->op].extendedSize++; + partInfo[e->p].profMisses.inc(); + + if (partInfo[e->p].targetSize < partInfo[e->p].longTermTargetSize) { + assert(smoothTransients); + partInfo[e->p].targetSize++; + takeOneLine(); + } + } + + //Profile the access + monitor->access(e->p, e->addr); + + //Adjust coarse-grain timestamp + e->bts = partInfo[e->p].curBts; + if (++partInfo[e->p].curBtsHits >= (uint32_t) partInfo[e->p].size/16) { + partInfo[e->p].curBts++; + partInfo[e->p].setpointBts++; + partInfo[e->p].curBtsHits = 0; + } + } + + void startReplacement(const MemReq* req) { + incomingLineAddr = req->lineAddr; + } + + void recordCandidate(uint32_t id) { + assert(candIdx < assoc); + candList[candIdx++] = id; + } + + uint32_t getBestCandidate() { + assert(candIdx > 0); + 
assert(candIdx <= assoc); + + //Demote all lines below their setpoints + for (uint32_t i = 0; i < candIdx; i++) { + LineInfo* e = &array[candList[i]]; + if (e->ts == 0) continue; //empty, bypass + + uint32_t p = e->p; + if (p == partitions) continue; //bypass unmanaged region entries + + uint32_t size = partInfo[p].size; + + if (size <= partInfo[p].targetSize) continue; //bypass partitions below target + +#if VANTAGE_8BIT_BTS + //Must do mod 256 arithmetic. This will do generally worse because of wrap-arounds, but wrapping around is pretty rare + //TODO: Doing things this way, we can profile the difference between this and using larger coarse-grain timestamps + if (((partInfo[p].curBts - e->bts) % 256) /*8-bit distance to current TS*/ >= ((partInfo[p].curBts - partInfo[p].setpointBts) % 256)) { +#else + if (e->bts <= partInfo[p].setpointBts) { +#endif + // Demote! + // Out of p + partInfo[p].profDemotions.inc(); + partInfo[p].size--; + + // Into unmanaged + e->p = partitions; + partInfo[partitions].size++; + + partInfo[p].curIntervalDems++; + + //Note extended size and op not affected + } + + partInfo[p].curIntervalCands++; + + // See if we need interval change + if (/*partInfo[p].curIntervalDems >= 16 || partInfo[p].curIntervalIns >= 16 ||*/ partInfo[p].curIntervalCands >= 256) { + double maxSz = partInfo[p].targetSize*(1.0 + partSlack); + double curSz = partInfo[p].size; + double aperture = 0.0; + + // Feedback-based aperture control + // TODO: Copy over the demotion thresholds lookup table code from the ISCA paper code, or quantize this. + // This is doing finer-grain demotions, but requires a bit more math. + if (curSz >= maxSz) { + aperture = maxAperture; + } else { + double slope = (maxAperture)/(maxSz - partInfo[p].targetSize); + assert(slope > 0.0); + aperture = slope*(curSz - partInfo[p].targetSize); + } + + if (aperture > 0.0) { +/* + info ("part %d setpoint adjust, curSz %f tgtSz %ld maxSz %f aperture %f curBts %ld setpointBts %ld interval cands %d ins %d dems %d cpt %f", + p, curSz, partInfo[p].targetSize, maxSz, aperture, partInfo[p].curBts, partInfo[p].setpointBts, partInfo[p].curIntervalCands,\ + partInfo[p].curIntervalIns, partInfo[p].curIntervalDems, partInfo[p].curIntervalCands*aperture); +*/ + + int32_t shrink = partInfo[p].curIntervalDems; + if (shrink < aperture*partInfo[p].curIntervalCands) { + //info ("increasing setpoint"); + if (partInfo[p].setpointBts < partInfo[p].curBts) partInfo[p].setpointBts++; + } else if (shrink > aperture*partInfo[p].curIntervalCands) { + //info ("decreasing setpoint"); +#if VANTAGE_8BIT_BTS + //Never get the setpoint to go 256 positions behind the current timestamp + if ((partInfo[p].curBts - partInfo[p].setpointBts) < 255) partInfo[p].setpointBts--; +#else + if (partInfo[p].setpointBts > 0) partInfo[p].setpointBts--; +#endif + } else { + //info ("keeping setpoint"); + } + } + + //info("part %d post setpointBts %ld", p, partInfo[p].setpointBts); + + partInfo[p].curIntervalCands = 0; + partInfo[p].curIntervalIns = 0; + partInfo[p].curIntervalDems = 0; + partInfo[p].setpointAdjs++; + } + } //for + + //Get best candidate for eviction + int32_t bestId = candList[0]; + + for (uint32_t i = 0; i < candIdx; i++) { //note we include 0; 0 compares with itself, see shortcut to understand why + uint32_t id = candList[i]; + LineInfo* e = &array[id]; + LineInfo* best = &array[bestId]; + + if (e->ts == 0) { + //shortcut for empty positions + bestId = id; + break; + } + + uint32_t p = e->p; + + if (p == partitions && best->p != partitions) { //prioritize 
umgd + bestId = id; + } else if (p == partitions && best->p == partitions) { + if (e->ts < best->ts) bestId = id; + } else if (p != partitions && best->p == partitions) { + //best wins, prioritize unmanaged + } else { + assert(p != partitions && best->p != partitions); + //Just do LRU; with correctly-sized partitions, this is VERY rare + //NOTE: If we were to study really small unmanaged regions, we can always get fancier and prioritize by aperture, bts, etc. + if (e->ts < best->ts) bestId = id; + } + } + assert(bestId >= 0 && (uint32_t)bestId < totalSize); + return bestId; + } + + void replaced(uint32_t id) { + candIdx = 0; //reset + + LineInfo* e = &array[id]; + e->ts = 0; + e->bts = 0; + e->addr = incomingLineAddr; + } + + private: + void setPartitionSizes(const uint32_t* sizes) { + uint32_t s[partitions]; + uint32_t usedSize = 0; + uint32_t linesToTakeAway = 0; + for (uint32_t p = 0; p < partitions; p++) { + s[p] = totalSize*sizes[p]/partGranularity; +#if UMON_INFO + info("part %d, %ld -> %d lines (now it's %ld lines) [cur %ld/%ld set %ld/%ld setAdjs %ld]", p, partInfo[p].targetSize, s[p], + partInfo[p].size, partInfo[p].curBts, partInfo[p].curBts % 256, partInfo[p].setpointBts, partInfo[p].setpointBts % 256, partInfo[p].setpointAdjs); +#endif + if (smoothTransients) { + partInfo[p].longTermTargetSize = s[p]; + if (s[p] > partInfo[p].targetSize) { //growing + uint32_t newTarget = MAX(partInfo[p].targetSize, MIN(partInfo[p].longTermTargetSize, partInfo[p].size+1)); //always in [target,longTermTarget] + linesToTakeAway += newTarget - partInfo[p].targetSize; + partInfo[p].targetSize = newTarget; + } + } else { + partInfo[p].targetSize = s[p]; + partInfo[p].longTermTargetSize = s[p]; + } + usedSize += s[p]; + } + + while (linesToTakeAway--) takeOneLine(); +#if UMON_INFO + info("%d lines assigned, %d unmanaged", usedSize, totalSize - usedSize); +#endif + } + + void takeOneLine() { + assert(smoothTransients); + uint32_t linesLeft = 0; + //NOTE: This is a fairly inefficient implementation, but we can do it cheaply in hardware + //Take away proportionally to difference between actual and long-term target + for (uint32_t p = 0; p < partitions; p++) { + int32_t left = partInfo[p].targetSize - partInfo[p].longTermTargetSize; + linesLeft += MAX(left, 0); + } + assert(linesLeft > 0); + uint32_t l = rng.randInt(linesLeft-1); //[0, linesLeft-1] + uint32_t curLines = 0; + for (uint32_t p = 0; p < partitions; p++) { + int32_t left = partInfo[p].targetSize - partInfo[p].longTermTargetSize; + curLines += MAX(left, 0); + if (left > 0 && l < curLines) { + partInfo[p].targetSize--; + return; + } + } + panic("Could not find any partition to take away space from???"); + } +}; + +#endif // PART_REPL_POLICIES_H_ diff --git a/src/partition_mapper.cpp b/src/partition_mapper.cpp new file mode 100644 index 00000000..be8cf866 --- /dev/null +++ b/src/partition_mapper.cpp @@ -0,0 +1,64 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "partition_mapper.h" +#include "log.h" +#include "process_tree.h" +#include "zsim.h" + +uint32_t CorePartMapper::getPartition(const MemReq& req) { + return req.srcId; +} + +uint32_t InstrDataPartMapper::getPartition(const MemReq& req) { + return req.flags & MemReq::IFETCH; +} + +uint32_t InstrDataCorePartMapper::getPartition(const MemReq& req) { + bool instr = req.flags & MemReq::IFETCH; + return req.srcId + (instr ? numCores : 0); //all instruction partitions come after data partitions +} + +uint32_t ProcessPartMapper::getPartition(const MemReq& req) { + assert(procIdx < numProcs); + return procIdx; +} + +uint32_t InstrDataProcessPartMapper::getPartition(const MemReq& req) { + assert(procIdx < numProcs); + bool instr = req.flags & MemReq::IFETCH; + return procIdx + (instr ? numProcs : 0); +} + +uint32_t ProcessGroupPartMapper::getNumPartitions() { + return zinfo->numProcGroups; +} + +uint32_t ProcessGroupPartMapper::getPartition(const MemReq& req) { + uint32_t groupIdx = zinfo->procArray[procIdx]->getGroupIdx(); + assert(groupIdx < zinfo->numProcGroups); + return groupIdx; +} + diff --git a/src/partition_mapper.h b/src/partition_mapper.h new file mode 100644 index 00000000..ef1eade6 --- /dev/null +++ b/src/partition_mapper.h @@ -0,0 +1,91 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef PARTITION_MAPPER_H_ +#define PARTITION_MAPPER_H_ + +#include +#include "galloc.h" +#include "memory_hierarchy.h" + +//Interface +class PartMapper : public GlobAlloc { + public: + virtual uint32_t getNumPartitions()=0; + virtual uint32_t getPartition(const MemReq& req)=0; +}; + +class CorePartMapper : public PartMapper { + private: + uint32_t numCores; + public: + explicit CorePartMapper(uint32_t _numCores) : numCores(_numCores) {} + virtual uint32_t getNumPartitions() {return numCores;} + virtual uint32_t getPartition(const MemReq& req); +}; + +class InstrDataPartMapper : public PartMapper { + public: + virtual uint32_t getNumPartitions() {return 2;} + virtual uint32_t getPartition(const MemReq& req); +}; + +class InstrDataCorePartMapper : public PartMapper { + private: + uint32_t numCores; + public: + explicit InstrDataCorePartMapper(uint32_t _numCores) : numCores(_numCores) {} + virtual uint32_t getNumPartitions() {return 2*numCores;} + virtual uint32_t getPartition(const MemReq& req); +}; + +class ProcessPartMapper : public PartMapper { + private: + uint32_t numProcs; + public: + explicit ProcessPartMapper(uint32_t _numProcs) : numProcs(_numProcs) {} + virtual uint32_t getNumPartitions() {return numProcs;} + virtual uint32_t getPartition(const MemReq& req); +}; + +class InstrDataProcessPartMapper : public PartMapper { + private: + uint32_t numProcs; + public: + explicit InstrDataProcessPartMapper(uint32_t _numProcs) : numProcs(_numProcs) {} + virtual uint32_t getNumPartitions() {return 2*numProcs;} + virtual uint32_t getPartition(const MemReq& req); +}; + +class ProcessGroupPartMapper : public PartMapper { + public: + ProcessGroupPartMapper() {} + virtual uint32_t getNumPartitions(); + virtual uint32_t getPartition(const MemReq& req); +}; + +#endif // PARTITION_MAPPER_H_ + + diff --git a/src/partitioner.h b/src/partitioner.h new file mode 100644 index 00000000..85081e1e --- /dev/null +++ b/src/partitioner.h @@ -0,0 +1,127 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef PARTITIONER_H_ +#define PARTITIONER_H_ + +#include "event_queue.h" +#include "g_std/g_vector.h" +#include "galloc.h" +#include "memory_hierarchy.h" +#include "stats.h" +#include "utility_monitor.h" + +class PartReplPolicy; + +// allocates space in a cache between multiple partitions +class Partitioner : public GlobAlloc { + public: + Partitioner(uint32_t _minAlloc, double _allocPortion, bool* _forbidden) + : minAlloc(_minAlloc), allocPortion(_allocPortion), forbidden(_forbidden) {} + + class PartitionEvent: public Event { + private: + Partitioner* part; + public: + PartitionEvent(Partitioner* _part, uint64_t _period) : Event(_period), part(_part) {} + void callback() { part->partition(); } + }; + virtual void partition() = 0; + + protected: + uint32_t minAlloc; + double allocPortion; + bool* forbidden; +}; + +// Gives best partition sizes as estimated with the greedy lookahead +// algorithm proposed in the UCP paper (Qureshi and Patt, ISCA 2006) +namespace lookahead { + uint64_t computePartitioningTotalUtility(uint32_t numPartitions, const uint32_t* parts, const uint32_t* missCurves); + void computeBestPartitioning(uint32_t numPartitions, uint32_t* allocs, uint32_t* missCurves); +} + +class LookaheadPartitioner : public Partitioner { + public: + LookaheadPartitioner(PartReplPolicy* _repl, uint32_t _numPartitions, uint32_t _buckets, + uint32_t _minAlloc = 1, double _allocPortion = 1.0, bool* _forbidden = NULL); + void partition(); + + private: + PartReplPolicy* repl; + uint32_t numPartitions; + uint32_t buckets; + uint32_t* curAllocs; +}; + +// ********************************************************************* + +// monitors the usage of partitions in a cache and generates miss curves +class PartitionMonitor : public GlobAlloc { + public: + explicit PartitionMonitor(uint32_t _buckets) : buckets(_buckets) {} + + virtual uint32_t getNumPartitions() const = 0; + + // called by PartReplPolicy on a memory reference + virtual void access(uint32_t partition, Address lineAddr) = 0; + + // called by Partitioner to get misses + virtual uint32_t get(uint32_t partition, uint32_t bucket) const = 0; + + virtual uint32_t getNumAccesses(uint32_t partition) const = 0; + + // called by Partitioner each interval to reset miss counters + virtual void reset() = 0; + + uint32_t getBuckets() const { return buckets; } + + protected: + uint32_t buckets; +}; + +// Maintains UMONs for each partition as in (Qureshi and Patt, ISCA 2006). +// Stupid name...but what do you call it? 
-nzb +class UMonMonitor : public PartitionMonitor { + public: + UMonMonitor(uint32_t _numLines, uint32_t _umonLines, uint32_t _umonBuckets, uint32_t _numPartitions, uint32_t _buckets); + ~UMonMonitor(); + + uint32_t getNumPartitions() const { return monitors.size(); } + void access(uint32_t partition, Address lineAddr); + uint32_t get(uint32_t partition, uint32_t bucket) const; + uint32_t getNumAccesses(uint32_t partition) const; + void reset(); + + private: + void getMissCurves() const; + void getMissCurve(uint32_t* misses, uint32_t partition) const; + + mutable uint32_t* missCache; + mutable bool missCacheValid; + g_vector monitors; // individual monitors per partition +}; + +#endif // PARTITIONER_H_ diff --git a/src/phase_slab_alloc.h b/src/phase_slab_alloc.h new file mode 100644 index 00000000..8961a727 --- /dev/null +++ b/src/phase_slab_alloc.h @@ -0,0 +1,198 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef PHASE_SLAB_ALLOC_H_ +#define PHASE_SLAB_ALLOC_H_ + +#include +#include +#include "g_std/g_list.h" +#include "log.h" +#include "pad.h" + +class PhaseSlabAlloc { + private: + struct Slab { + Slab* next; + uint32_t size; + uint32_t used; + + char buf[0]; //buffer starts here + + void init(uint32_t sz) { + size = sz; + clear(); + } + + void clear() { + used = 0; + next = NULL; + //memset(buf, 0, size); //zeroing the slab can help chase memory corruption bugs + } + + void* alloc(uint32_t bytes) { +#if 1 //no effort to align, but objs are a multiple of 8 bytes, so all allocs are as well + char* ptr = buf+used; + used += bytes; +#else //align to some block size --- performs worse in my analysis, the loss in locality does not compensate +#define ALIGN_SZ 64 + char* base = buf+used; + char* ptr = static_cast(((uint64_t)(base+(ALIGN_SZ))) & (-ALIGN_SZ)); //aligned + used = (ptr-buf)+bytes; +#endif + //info("Allocation stating at %p, %d bytes", ptr, bytes); + return (used < size)? ptr : NULL; + } + }; + + //SLL, intrusive, LIFO, LIFO prepend. 
Pass by value, it's just 2 pointers + class SlabList { + private: + Slab* start; + Slab* end; + + public: + SlabList() : start(NULL), end(NULL) {} + + void push_front(Slab* s) { + assert(s); + assert(s->next == NULL); + s->next = start; + start = s; + if (!end) end = s; + } + + Slab* pop_front() { + assert(start); + Slab* res = start; + start = start->next; + if (res == end) { + assert(start == NULL); + end = NULL; + } + return res; + } + + void prepend(SlabList& lst) { + if (lst.start == NULL) { //lst is empty + assert(lst.end == NULL); + } else { + assert(lst.end); + assert(lst.end->next == NULL); + lst.end->next = start; + start = lst.start; + if (!end) end = lst.end; //we could be empty + } + } + + void clear() { + start = NULL; + end = NULL; + } + + bool empty() const { + return !start; + } + }; + + Slab* curSlab; + SlabList freeList; + SlabList curPhaseList; + + g_list > liveList; + + uint32_t slabSize; + + public: + PhaseSlabAlloc() { + //slabSize = (1<<12); //4KB, too small + slabSize = (1<<16); //64KB, seems to be sweet spot in a number of tests, though I tried 32KB-256KB and the differences are minimal in that range (2.3% weave time) + curSlab = NULL; + freeList.clear(); + curPhaseList.clear(); + allocSlab(); + } + + template + T* alloc() { + assert(sizeof(T) < slabSize); + T* ptr = static_cast(curSlab->alloc(sizeof(T))); + if (unlikely(!ptr)) { + allocSlab(); + ptr = static_cast(curSlab->alloc(sizeof(T))); + assert(ptr); + } + return ptr; + } + + void* alloc(size_t sz) { + assert(sz < slabSize); + void* ptr = curSlab->alloc(sz); + if (unlikely(!ptr)) { + allocSlab(); + ptr = curSlab->alloc(sz); + assert(ptr); + } + return ptr; + } + + + + //Every event currently produced is < prodCycle, every event < usedCycle is dead (has already been simulated) + void advance(uint64_t prodCycle, uint64_t usedCycle) { + if (!curPhaseList.empty()) { + liveList.push_back(std::make_pair(curPhaseList, prodCycle)); + curPhaseList.clear(); + } + + while (!liveList.empty()) { + std::pair p = liveList.front(); + uint64_t cycle = p.second; + if (cycle < usedCycle) { + freeList.prepend(p.first); + liveList.pop_front(); + //info("(%ld, %ld) Recycling %ld, %ld left", prodCycle, usedCycle, cycle, liveList.size()); + } else { + break; + } + } + } + + private: + void allocSlab() { + if (curSlab) curPhaseList.push_front(curSlab); + + if (!freeList.empty()) { + curSlab = freeList.pop_front(); + assert(curSlab); + curSlab->clear(); + } else { + curSlab = static_cast(gm_malloc(sizeof(Slab) + slabSize)); + curSlab->init(slabSize); //NOTE: Slab is POD + } + } +}; + +#endif // PHASE_SLAB_ALLOC_H_ diff --git a/src/pin_cmd.cpp b/src/pin_cmd.cpp new file mode 100644 index 00000000..e9629549 --- /dev/null +++ b/src/pin_cmd.cpp @@ -0,0 +1,174 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "pin_cmd.h" +#include +#include +#include +#include //for posix-shell command expansion +#include "config.h" + +//Funky macro expansion stuff +#define QUOTED_(x) #x +#define QUOTED(x) QUOTED_(x) + +PinCmd::PinCmd(Config* conf, const char* configFile, const char* outputDir, uint64_t shmid) { + //Figure the program paths + const char* zsimEnvPath = getenv("ZSIM_PATH"); + g_string pinPath, zsimPath; + if (zsimEnvPath) { + info("Using env path %s", zsimEnvPath); + pinPath = zsimEnvPath; + pinPath += "/pinbin"; + zsimPath = zsimEnvPath; + zsimPath += "/libzsim.so"; + } else { + pinPath = QUOTED(PIN_PATH); + zsimPath = QUOTED(ZSIM_PATH); + } + + args.push_back(pinPath); + + //Global pin options + args.push_back("-follow_execv"); //instrument child processes + args.push_back("-tool_exit_timeout"); //don't wait much of internal threads + args.push_back("1"); + + //Additional options (e.g., -smc_strict for Java), parsed from config + const char* pinOptions = conf->get("sim.pinOptions", ""); + wordexp_t p; + wordexp(pinOptions, &p, 0); + for (uint32_t i = 0; i < p.we_wordc; i++) { + args.push_back(g_string(p.we_wordv[i])); + } + wordfree(&p); + + //Load tool + args.push_back("-t"); + args.push_back(zsimPath); + + //Tool options + if (configFile) { + //Check configFile is an absolute path + //NOTE: We check rather than canonicalizing it ourselves because by the time we're created, we might be in another directory + char* absPath = realpath(configFile, NULL); + if (std::string(configFile) != std::string(absPath)) { + panic("Internal zsim bug, configFile should be absolute"); + } + free(absPath); + + args.push_back("-config"); + args.push_back(configFile); + } + + args.push_back("-outputDir"); + args.push_back(outputDir); + + std::stringstream shmid_ss; + shmid_ss << shmid; + + args.push_back("-shmid"); + args.push_back(shmid_ss.str().c_str()); + + if (conf->get("sim.logToFile", false)) { + args.push_back("-logToFile"); + } + + //Read the per-process params of the processes run directly by the harness + while (true) { + std::stringstream p_ss; + p_ss << "process" << procInfo.size(); + + if (!conf->exists(p_ss.str().c_str())) break; + + const char* cmd = conf->get(p_ss.str() + ".command"); + const char* input = conf->get(p_ss.str() + ".input", ""); + const char* loader = conf->get(p_ss.str() + ".loader", ""); + const char* env = conf->get(p_ss.str() + ".env", ""); + + ProcCmdInfo pi = {g_string(cmd), g_string(input), g_string(loader), g_string(env)}; + procInfo.push_back(pi); + } +} + +g_vector PinCmd::getPinCmdArgs(uint32_t procIdx) { + g_vector res = args; + + std::stringstream procIdx_ss; + procIdx_ss << procIdx; + res.push_back("-procIdx"); + res.push_back(procIdx_ss.str().c_str()); + res.push_back("--"); + return res; +} + +g_vector PinCmd::getFullCmdArgs(uint32_t procIdx, const char** inputFile) { + assert(procIdx < procInfo.size()); //must be one of the topmost processes + g_vector res = getPinCmdArgs(procIdx); + + g_string cmd = procInfo[procIdx].cmd; + + /* Loader injection: Turns out that Pin mingles with the simulated binary, which decides the loader used, + * even when PIN_VM_LIBRARY_PATH is used. 
This kills the invariance on libzsim.so's loaded address, because
+ * loaders in different children have different sizes. So, if specified, we prefix the program with the
+ * given loader. This is optional because it won't work with statically linked binaries.
+ *
+ * BTW, thinking of running pin under a specific loader to fix this instead? Nope, it gets into an infinite loop.
+ */
+    if (procInfo[procIdx].loader != "") {
+        cmd = procInfo[procIdx].loader + " " + cmd;
+        info("Injected loader on process%d, command line: %s", procIdx, cmd.c_str());
+        warn("Loader injection makes Pin unaware of symbol routines, so things like routine patching "
+                "will not work! You can homogenize the loaders instead by editing the .interp ELF section");
+    }
+
+    //Parse command -- use glibc's wordexp to parse things like quotes, handle argument expansion, etc correctly
+    wordexp_t p;
+    wordexp(cmd.c_str(), &p, 0);
+    for (uint32_t i = 0; i < p.we_wordc; i++) {
+        res.push_back(g_string(p.we_wordv[i]));
+    }
+    wordfree(&p);
+
+    //Input redirect
+    *inputFile = (procInfo[procIdx].input == "")? NULL : procInfo[procIdx].input.c_str();
+    return res;
+}
+
+void PinCmd::setEnvVars(uint32_t procIdx) {
+    assert(procIdx < procInfo.size()); //must be one of the topmost processes
+    if (procInfo[procIdx].env != "") {
+        wordexp_t p;
+        wordexp(procInfo[procIdx].env.c_str(), &p, 0);
+        for (uint32_t i = 0; i < p.we_wordc; i++) {
+            char* var = strdup(p.we_wordv[i]); //putenv() does not make copies, and takes non-const char* in
+            if (putenv(var) != 0) {
+                panic("putenv(%s) failed", var);
+            }
+        }
+        wordfree(&p);
+    }
+}
+
diff --git a/src/pin_cmd.h b/src/pin_cmd.h
new file mode 100644
index 00000000..249c491c
--- /dev/null
+++ b/src/pin_cmd.h
@@ -0,0 +1,60 @@
+/** $lic$
+ * Copyright (C) 2012-2014 by Massachusetts Institute of Technology
+ * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University
+ *
+ * This file is part of zsim.
+ *
+ * zsim is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free Software
+ * Foundation, version 2.
+ *
+ * If you use this software in your research, we request that you reference
+ * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of
+ * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the
+ * source of the simulator in any publications that use this software, and that
+ * you send us a citation of your work.
+ *
+ * zsim is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see .
+ */ + +#ifndef PIN_CMD_H_ +#define PIN_CMD_H_ + +/* Interface to get pin command line */ + +#include +#include "g_std/g_string.h" +#include "g_std/g_vector.h" +#include "galloc.h" + +class Config; + +class PinCmd : public GlobAlloc { + private: + g_vector args; + + struct ProcCmdInfo { + g_string cmd; + g_string input; + g_string loader; + g_string env; + }; + + g_vector procInfo; //one entry for each process that the harness launches (not for child procs) + + public: + PinCmd(Config* conf, const char* configFile, const char* outputDir, uint64_t shmid); + g_vector getPinCmdArgs(uint32_t procIdx); + g_vector getFullCmdArgs(uint32_t procIdx, const char** inputFile); + void setEnvVars(uint32_t procIdx); + + uint32_t getNumCmdProcs() {return procInfo.size();} +}; + +#endif // PIN_CMD_H_ diff --git a/src/prefetcher.cpp b/src/prefetcher.cpp new file mode 100644 index 00000000..b99ff03b --- /dev/null +++ b/src/prefetcher.cpp @@ -0,0 +1,190 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "prefetcher.h" +#include "bithacks.h" + +//#define DBG(args...) info(args) +#define DBG(args...) 
+ +void StreamPrefetcher::setParents(uint32_t _childId, const g_vector& parents, Network* network) { + childId = _childId; + if (parents.size() != 1) panic("Must have one parent"); + if (network) panic("Network not handled"); + parent = parents[0]; +} + +void StreamPrefetcher::setChildren(const g_vector& children, Network* network) { + if (children.size() != 1) panic("Must have one children"); + if (network) panic("Network not handled"); + child = children[0]; +} + +void StreamPrefetcher::initStats(AggregateStat* parentStat) { + AggregateStat* s = new AggregateStat(); + s->init(name.c_str(), "Prefetcher stats"); + profAccesses.init("acc", "Accesses"); s->append(&profAccesses); + profPrefetches.init("pf", "Issued prefetches"); s->append(&profPrefetches); + profDoublePrefetches.init("dpf", "Issued double prefetches"); s->append(&profDoublePrefetches); + profPageHits.init("pghit", "Page/entry hit"); s->append(&profPageHits); + profHits.init("hit", "Prefetch buffer hits, short and full"); s->append(&profHits); + profShortHits.init("shortHit", "Prefetch buffer short hits"); s->append(&profShortHits); + profStrideSwitches.init("strideSwitches", "Predicted stride switches"); s->append(&profStrideSwitches); + profLowConfAccs.init("lcAccs", "Low-confidence accesses with no prefetches"); s->append(&profLowConfAccs); + parentStat->append(s); +} + +uint64_t StreamPrefetcher::access(MemReq& req) { + uint32_t origChildId = req.childId; + req.childId = childId; + + if (req.type != GETS) return parent->access(req); //other reqs ignored, including stores + + profAccesses.inc(); + + uint64_t reqCycle = req.cycle; + uint64_t respCycle = parent->access(req); + + Address pageAddr = req.lineAddr >> 6; + uint32_t pos = req.lineAddr & (64-1); + uint32_t idx = 16; + // This loop gets unrolled and there are no control dependences. Way faster than a break (but should watch for the avoidable loop-carried dep) + for (uint32_t i = 0; i < 16; i++) { + bool match = (pageAddr == tag[i]); + idx = match? i : idx; // ccmov, no branch + } + + DBG("%s: 0x%lx page %lx pos %d", name.c_str(), req.lineAddr, pageAddr, pos); + + if (idx == 16) { // entry miss + uint32_t cand = 16; + uint64_t candScore = -1; + //uint64_t candScore = 0; + for (uint32_t i = 0; i < 16; i++) { + if (array[i].lastCycle > reqCycle + 500) continue; // warm prefetches, not even a candidate + /*uint64_t score = (reqCycle - array[i].lastCycle)*(3 - array[i].conf.counter()); + if (score > candScore) { + cand = i; + candScore = score; + }*/ + if (array[i].ts < candScore) { // just LRU + cand = i; + candScore = array[i].ts; + } + } + + if (cand < 16) { + idx = cand; + array[idx].alloc(reqCycle); + array[idx].lastPos = pos; + array[idx].ts = timestamp++; + tag[idx] = pageAddr; + } + DBG("%s: MISS alloc idx %d", name.c_str(), idx); + } else { // entry hit + profPageHits.inc(); + Entry& e = array[idx]; + array[idx].ts = timestamp++; + DBG("%s: PAGE HIT idx %d", name.c_str(), idx); + + // 1. Did we prefetch-hit? + bool shortPrefetch = false; + if (e.valid[pos]) { + uint64_t pfRespCycle = e.times[pos].respCycle; + shortPrefetch = pfRespCycle > respCycle; + e.valid[pos] = false; // close, will help with long-lived transactions + respCycle = MAX(pfRespCycle, respCycle); + e.lastCycle = MAX(respCycle, e.lastCycle); + profHits.inc(); + if (shortPrefetch) profShortHits.inc(); + DBG("%s: pos %d prefetched on %ld, pf resp %ld, demand resp %ld, short %d", name.c_str(), pos, e.times[pos].startCycle, pfRespCycle, respCycle, shortPrefetch); + } + + // 2. 
Update predictors, issue prefetches + int32_t stride = pos - e.lastPos; + DBG("%s: pos %d lastPos %d lastLastPost %d e.stride %d", name.c_str(), pos, e.lastPos, e.lastLastPos, e.stride); + if (e.stride == stride) { + e.conf.inc(); + if (e.conf.pred()) { // do prefetches + int32_t fetchDepth = (e.lastPrefetchPos - e.lastPos)/stride; + uint32_t prefetchPos = e.lastPrefetchPos + stride; + if (fetchDepth < 1) { + prefetchPos = pos + stride; + fetchDepth = 1; + } + DBG("%s: pos %d stride %d conf %d lastPrefetchPos %d prefetchPos %d fetchDepth %d", name.c_str(), pos, stride, e.conf.counter(), e.lastPrefetchPos, prefetchPos, fetchDepth); + + if (prefetchPos < 64 && !e.valid[prefetchPos]) { + MESIState state = I; + MemReq pfReq = {req.lineAddr + prefetchPos - pos, GETS, req.childId, &state, reqCycle, req.childLock, state, req.srcId, MemReq::PREFETCH}; + uint64_t pfRespCycle = parent->access(pfReq); // FIXME, might segfault + e.valid[prefetchPos] = true; + e.times[prefetchPos].fill(reqCycle, pfRespCycle); + profPrefetches.inc(); + + if (shortPrefetch && fetchDepth < 8 && prefetchPos + stride < 64 && !e.valid[prefetchPos + stride]) { + prefetchPos += stride; + pfReq.lineAddr += stride; + pfRespCycle = parent->access(pfReq); + e.valid[prefetchPos] = true; + e.times[prefetchPos].fill(reqCycle, pfRespCycle); + profPrefetches.inc(); + profDoublePrefetches.inc(); + } + e.lastPrefetchPos = prefetchPos; + assert(state == I); // prefetch access should not give us any permissions + } + } else { + profLowConfAccs.inc(); + } + } else { + e.conf.dec(); + // See if we need to switch strides + if (!e.conf.pred()) { + int32_t lastStride = e.lastPos - e.lastLastPos; + + if (stride && stride != e.stride && stride == lastStride) { + e.conf.reset(); + e.stride = stride; + profStrideSwitches.inc(); + } + } + e.lastPrefetchPos = pos; + } + + e.lastLastPos = e.lastPos; + e.lastPos = pos; + } + + req.childId = origChildId; + return respCycle; +} + +// nop for now; do we need to invalidate our own state? +uint64_t StreamPrefetcher::invalidate(Address lineAddr, InvType type, bool* reqWriteback, uint64_t reqCycle, uint32_t srcId) { + return child->invalidate(lineAddr, type, reqWriteback, reqCycle, srcId); +} + + diff --git a/src/prefetcher.h b/src/prefetcher.h new file mode 100644 index 00000000..519b36c0 --- /dev/null +++ b/src/prefetcher.h @@ -0,0 +1,116 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef PREFETCHER_H_ +#define PREFETCHER_H_ + +#include +#include "bithacks.h" +#include "g_std/g_string.h" +#include "memory_hierarchy.h" +#include "stats.h" + +/* Prefetcher models: Basic operation is to interpose between cache levels, issue additional accesses, + * and keep a small table with delays; when the demand access comes, we do it and account for the + * latency as when it was first fetched (to avoid hit latencies on partial latency overlaps). + */ + +template // max value, threshold, initial +class SatCounter { + private: + int32_t count; + public: + SatCounter() : count(I) {} + void reset() { count = I; } + void dec() { count = MAX(count - 1, 0); } + void inc() { count = MIN(count + 1, M); } + bool pred() const { return count >= T; } + uint32_t counter() const { return count; } +}; + +/* This is basically a souped-up version of the DLP L2 prefetcher in Nehalem: 16 stream buffers, + * but (a) no up/down distinction, and (b) strided operation based on dominant stride detection + * to try to subsume as much of the L1 IP/strided prefetcher as possible. + * + * FIXME: For now, mostly hardcoded; 64-line entries (4KB w/64-byte lines), fixed granularities, etc. + * TODO: Adapt to use weave models + */ +class StreamPrefetcher : public BaseCache { + private: + struct Entry { + // Two competing strides; at most one active + int32_t stride; + SatCounter<3, 2, 1> conf; + + struct AccessTimes { + uint64_t startCycle; // FIXME: Dead for now, we should use it for profiling + uint64_t respCycle; + + void fill(uint32_t s, uint64_t r) { startCycle = s; respCycle = r; } + }; + + AccessTimes times[64]; + std::bitset<64> valid; + + uint32_t lastPos; + uint32_t lastLastPos; + uint32_t lastPrefetchPos; + uint64_t lastCycle; // updated on alloc and hit + uint64_t ts; + + void alloc(uint64_t curCycle) { + stride = 1; + lastPos = 0; + lastLastPos = 0; + lastPrefetchPos = 0; + conf.reset(); + valid.reset(); + lastCycle = curCycle; + } + }; + + uint64_t timestamp; // for LRU + Address tag[16]; + Entry array[16]; + + Counter profAccesses, profPrefetches, profDoublePrefetches, profPageHits, profHits, profShortHits, profStrideSwitches, profLowConfAccs; + + MemObject* parent; + BaseCache* child; + uint32_t childId; + g_string name; + + public: + explicit StreamPrefetcher(const g_string& _name) : timestamp(0), name(_name) {} + void initStats(AggregateStat* parentStat); + const char* getName() { return name.c_str();} + void setParents(uint32_t _childId, const g_vector& parents, Network* network); + void setChildren(const g_vector& children, Network* network); + + uint64_t access(MemReq& req); + uint64_t invalidate(Address lineAddr, InvType type, bool* reqWriteback, uint64_t reqCycle, uint32_t srcId); +}; + +#endif // PREFETCHER_H_ diff --git a/src/prio_queue.h b/src/prio_queue.h new file mode 100644 index 00000000..02d935f1 --- /dev/null +++ b/src/prio_queue.h @@ -0,0 +1,154 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef PRIO_QUEUE_H_ +#define PRIO_QUEUE_H_ + +#include "g_std/g_multimap.h" + +template +class PrioQueue { + struct PQBlock { + T* array[64]; + uint64_t occ; // bit i is 1 if array[i] is populated + + PQBlock() { + for (uint32_t i = 0; i < 64; i++) array[i] = NULL; + occ = 0; + } + + inline T* dequeue(uint32_t& offset) { + assert(occ); + uint32_t pos = __builtin_ctzl(occ); + T* res = array[pos]; + T* next = res->next; + array[pos] = next; + if (!next) occ ^= 1L << pos; + assert(res); + offset = pos; + res->next = NULL; + return res; + } + + inline void enqueue(T* obj, uint32_t pos) { + occ |= 1L << pos; + assert(!obj->next); + obj->next = array[pos]; + array[pos] = obj; + } + }; + + PQBlock blocks[B]; + + typedef g_multimap FEMap; //far element map + typedef typename FEMap::iterator FEMapIterator; + + FEMap feMap; + + uint64_t curBlock; + uint64_t elems; + + public: + PrioQueue() { + curBlock = 0; + elems = 0; + } + + void enqueue(T* obj, uint64_t cycle) { + uint64_t absBlock = cycle/64; + assert(absBlock >= curBlock); + + if (absBlock < curBlock + B) { + uint32_t i = absBlock % B; + uint32_t offset = cycle % 64; + blocks[i].enqueue(obj, offset); + } else { + //info("XXX far enq() %ld", cycle); + feMap.insert(std::pair(cycle, obj)); + } + elems++; + } + + T* dequeue(uint64_t& deqCycle) { + assert(elems); + while (!blocks[curBlock % B].occ) { + curBlock++; + if ((curBlock % (B/2)) == 0 && !feMap.empty()) { + uint64_t topCycle = (curBlock + B)*64; + //Move every element with cycle < topCycle to blocks[] + FEMapIterator it = feMap.begin(); + while (it != feMap.end() && it->first < topCycle) { + uint64_t cycle = it->first; + T* obj = it->second; + + uint64_t absBlock = cycle/64; + assert(absBlock >= curBlock); + assert(absBlock < curBlock + B); + uint32_t i = absBlock % B; + uint32_t offset = cycle % 64; + blocks[i].enqueue(obj, offset); + it++; + } + feMap.erase(feMap.begin(), it); + } + } + + //We're now at the first populated block + uint32_t offset; + T* obj = blocks[curBlock % B].dequeue(offset); + elems--; + + deqCycle = curBlock*64 + offset; + return obj; + } + + inline uint64_t size() const { + return elems; + } + + inline uint64_t firstCycle() const { + assert(elems); + for (uint32_t i = 0; i < B/2; i++) { + uint64_t occ = blocks[(curBlock + i) % B].occ; + if (occ) { + uint64_t pos = __builtin_ctzl(occ); + return (curBlock + i)*64 + pos; + } + } + for (uint32_t i = B/2; i < B; i++) { //beyond B/2 blocks, there may be a far element that comes earlier + uint64_t occ = blocks[(curBlock + i) % B].occ; + if (occ) { + uint64_t pos = __builtin_ctzl(occ); + uint64_t cycle = (curBlock + i)*64 + pos; + return feMap.empty()? 
cycle : MIN(cycle, feMap.begin()->first); + } + } + + return feMap.begin()->first; + } +}; + +#endif // PRIO_QUEUE_H_ + diff --git a/src/process_stats.cpp b/src/process_stats.cpp new file mode 100644 index 00000000..aa6b977a --- /dev/null +++ b/src/process_stats.cpp @@ -0,0 +1,95 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "process_stats.h" +#include "process_tree.h" +#include "scheduler.h" +#include "zsim.h" + +ProcessStats::ProcessStats(AggregateStat* parentStat) { + uint32_t maxProcs = zinfo->lineSize; + processCycles.resize(maxProcs, 0); + processInstrs.resize(maxProcs, 0); + lastCoreCycles.resize(zinfo->numCores, 0); + lastCoreInstrs.resize(zinfo->numCores, 0); + lastUpdatePhase = 0; + + auto procCyclesLambda = [this](uint32_t p) { return getProcessCycles(p); }; + auto procCyclesStat = makeLambdaVectorStat(procCyclesLambda, maxProcs); + procCyclesStat->init("procCycles", "Per-process unhalted core cycles"); + + auto procInstrsLambda = [this](uint32_t p) { return getProcessInstrs(p); }; + auto procInstrsStat = makeLambdaVectorStat(procInstrsLambda, maxProcs); + procInstrsStat->init("procInstrs", "Per-process instructions"); + + parentStat->append(procCyclesStat); + parentStat->append(procInstrsStat); +} + +uint64_t ProcessStats::getProcessCycles(uint32_t p) { + if (unlikely(lastUpdatePhase != zinfo->numPhases)) update(); + assert(p < processCycles.size()); + return processCycles[p]; +} + +uint64_t ProcessStats::getProcessInstrs(uint32_t p) { + if (unlikely(lastUpdatePhase != zinfo->numPhases)) update(); + assert(p < processInstrs.size()); + return processInstrs[p]; +} + +void ProcessStats::notifyDeschedule(uint32_t cid, uint32_t outgoingPid) { + assert(cid < lastCoreCycles.size()); + assert(outgoingPid < processCycles.size()); + updateCore(cid, outgoingPid); +} + +/* Private */ + +void ProcessStats::updateCore(uint32_t cid, uint32_t p) { + p = zinfo->procArray[p]->getGroupIdx(); + + uint64_t cCycles = zinfo->cores[cid]->getCycles(); + uint64_t cInstrs = zinfo->cores[cid]->getInstrs(); + + assert(cCycles >= lastCoreCycles[cid] && cInstrs >= lastCoreInstrs[cid]); + processCycles[p] += cCycles - lastCoreCycles[cid]; + processInstrs[p] += cInstrs - lastCoreInstrs[cid]; + + lastCoreCycles[cid] = cCycles; + lastCoreInstrs[cid] = cInstrs; +} + +void ProcessStats::update() { + assert(lastUpdatePhase < zinfo->numPhases); + for (uint32_t cid = 0; cid < lastCoreCycles.size(); cid++) { + uint32_t p = zinfo->sched->getScheduledPid(cid); + if (p 
== (uint32_t)-1) continue; + assert(p < processCycles.size()); + updateCore(cid, p); + } + lastUpdatePhase = zinfo->numPhases; +} + diff --git a/src/process_stats.h b/src/process_stats.h new file mode 100644 index 00000000..2d71c8d4 --- /dev/null +++ b/src/process_stats.h @@ -0,0 +1,57 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef PROCESS_STATS_H_ +#define PROCESS_STATS_H_ + +#include "galloc.h" +#include "stats.h" + +/* Maintains, queries, and transparently updates per-process instruction and cycle counts. + * You'd think it'd make sense to include this in ProcTreeNode, but those are dynamic, and + * stats must be static (and zeros compress great) + */ +class ProcessStats : public GlobAlloc { + private: + g_vector processCycles, processInstrs; + g_vector lastCoreCycles, lastCoreInstrs; + uint64_t lastUpdatePhase; + + public: + explicit ProcessStats(AggregateStat* parentStat); //includes initStats, called post-system init + + // May trigger a global update, should call ONLY when quiesced + uint64_t getProcessCycles(uint32_t p); + uint64_t getProcessInstrs(uint32_t p); + + // Must be called by scheduler when descheduling; core must be quiesced + void notifyDeschedule(uint32_t cid, uint32_t outgoingPid); + + private: + void updateCore(uint32_t cid, uint32_t p); + void update(); //transparent +}; + +#endif // PROCESS_STATS_H_ diff --git a/src/process_tree.cpp b/src/process_tree.cpp new file mode 100644 index 00000000..43ee1792 --- /dev/null +++ b/src/process_tree.cpp @@ -0,0 +1,237 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "process_tree.h" +#include +#include +#include +#include +#include "config.h" +#include "constants.h" +#include "event_queue.h" +#include "process_stats.h" +#include "stats.h" +#include "zsim.h" + +using std::string; +using std::stringstream; + +static string DefaultMaskStr() { + stringstream ss; + ss << "0:" << zinfo->numCores; + return ss.str(); +} + +//Helper +static void DumpEventualStats(uint32_t procIdx, const char* reason) { + uint32_t p = zinfo->procArray[procIdx]->getGroupIdx(); + info("Dumping eventual stats for process GROUP %d (%s)", p, reason); + zinfo->trigger = p; + zinfo->eventualStatsBackend->dump(true /*buffered*/); + zinfo->procEventualDumps++; + if (zinfo->procEventualDumps == zinfo->maxProcEventualDumps) { + info("Terminating, maxProcEventualDumps (%ld) reached", zinfo->maxProcEventualDumps); + zinfo->terminationConditionMet = true; //note this works because it always runs at the end of the phase + } +} + +//Returns true if this is an actual first start, false otherwise (e.g. an exec) +bool ProcessTreeNode::notifyStart() { + if (!started) { //first start + uint32_t oldActiveProcs = __sync_fetch_and_add(&zinfo->globalActiveProcs, 1); + if (procIdx) { + if (oldActiveProcs == 0) { + panic("Race! All processes finished before this one started, so stats have already been dumped and sim state may be partially deleted. " + "You should serialize process creation and termination through the harness to avoid this."); + } + } else { //first start + assert(oldActiveProcs == 0); + } + + //Set FFWD counters -- NOTE we can't call enter FFWD + if (inFastForward) { + if (syncedFastForward) __sync_fetch_and_add(&zinfo->globalSyncedFFProcs, 1); + __sync_fetch_and_add(&zinfo->globalFFProcs, 1); + } + + started = true; + return true; + } else { //already started + return false; + } +} + +bool ProcessTreeNode::notifyEnd() { + if (inFastForward) exitFastForward(); + assert(zinfo->procExited[procIdx] == PROC_RUNNING); + uint32_t remaining; + if (restartsLeft && !zinfo->terminationConditionMet) { + restartsLeft--; + info("Marking procIdx %d for restart, %d restarts left", procIdx, restartsLeft); + zinfo->procExited[procIdx] = PROC_RESTARTME; + return false; + } else { + zinfo->procExited[procIdx] = PROC_EXITED; + remaining = __sync_sub_and_fetch(&zinfo->globalActiveProcs, 1); + return (remaining == 0); + } +} + +void ProcessTreeNode::enterFastForward() { + assert(!inFastForward); + inFastForward = true; + if (syncedFastForward) __sync_fetch_and_add(&zinfo->globalSyncedFFProcs, 1); + __sync_fetch_and_add(&zinfo->globalFFProcs, 1); + __sync_synchronize(); +} + +void ProcessTreeNode::exitFastForward() { + assert(inFastForward); + inFastForward = false; + if (syncedFastForward) __sync_fetch_and_sub(&zinfo->globalSyncedFFProcs, 1); + __sync_fetch_and_sub(&zinfo->globalFFProcs, 1); + __sync_synchronize(); +} + +void ProcessTreeNode::heartbeat() { + uint64_t curBeats = __sync_add_and_fetch(&heartbeats, 1); + zinfo->profHeartbeats->atomicInc(procIdx); + //info("Heartbeat, total %ld", curBeats); + + //trigger stats if we've reached the limit + class EventualStatsDumpEvent : public Event { + private: + uint32_t p; + public: + explicit EventualStatsDumpEvent(uint64_t _p) : Event(0 /*one-shot*/), p(_p) {} + void callback() { DumpEventualStats(p, "heartbeats"); } + }; + + if (curBeats == dumpHeartbeats) { //never 
triggers if dumpHeartbeats == 0 + info("Heartbeat target %ld reached, marking stats dump", curBeats); + zinfo->eventQueue->insert(new EventualStatsDumpEvent(procIdx)); + + if (dumpsResetHeartbeats) { + info("Resetting heartbeat count (for periodic dumps)"); + __sync_sub_and_fetch(&heartbeats, curBeats); + } + } +} + +static void PopulateLevel(Config& config, const std::string& prefix, std::vector& globProcVector, ProcessTreeNode* parent, uint32_t& procIdx, uint32_t& groupIdx) { + uint32_t idx = 0; + std::vector children; + while (true) { + std::stringstream p_ss; + p_ss << prefix << "process" << idx; + + if (!config.exists(p_ss.str().c_str())) { + break; + } + + //Get patch root fs + std::string patchRoot = config.get(p_ss.str() + ".patchRoot", ""); + + const char* gpr = NULL; + if (patchRoot != "") { + //In case this is a relpath, convert it to absolute + char* pathBuf = realpath(patchRoot.c_str(), NULL); //mallocs the buffer + assert(pathBuf); + gpr = gm_strdup(pathBuf); + free(pathBuf); + } + + bool groupWithPrevious = config.get(p_ss.str() + ".groupWithPrevious", false); + if (groupWithPrevious) { + if (procIdx == 0) panic("Can't group process0 with the previous one, there is not previous process"); + assert(groupIdx > 0); + groupIdx--; + } + + + bool startFastForwarded = config.get(p_ss.str() + ".startFastForwarded", false); + bool syncedFastForward = config.get(p_ss.str() + ".syncedFastForward", true); + bool startPaused = config.get(p_ss.str() + ".startPaused", false); + uint32_t clockDomain = config.get(p_ss.str() + ".clockDomain", 0); + uint32_t portDomain = config.get(p_ss.str() + ".portDomain", 0); + uint64_t dumpHeartbeats = config.get(p_ss.str() + ".dumpHeartbeats", 0); + bool dumpsResetHeartbeats = config.get(p_ss.str() + ".dumpsResetHeartbeats", false); + uint64_t dumpInstrs = config.get(p_ss.str() + ".dumpInstrs", 0); + uint32_t restarts = config.get(p_ss.str() + ".restarts", 0); + g_string syscallBlacklistRegex = config.get(p_ss.str() + ".syscallBlacklistRegex", ".*"); + g_vector mask(ParseMask(config.get(p_ss.str() + ".mask", DefaultMaskStr().c_str()), zinfo->numCores)); + g_vector ffiPoints(ParseList(config.get(p_ss.str() + ".ffiPoints", ""))); + + if (dumpInstrs) { + if (dumpHeartbeats) warn("Dumping eventual stats on both heartbeats AND instructions; you won't be able to distinguish both!"); + auto getInstrs = [procIdx]() { return zinfo->processStats->getProcessInstrs(procIdx); }; + auto dumpStats = [procIdx]() { DumpEventualStats(procIdx, "instructions"); }; + zinfo->eventQueue->insert(makeAdaptiveEvent(getInstrs, dumpStats, 0, dumpInstrs, MAX_IPC*zinfo->phaseLength*zinfo->numCores /*all cores can be on*/)); + } //NOTE: trivial to do the same with cycles + + if (clockDomain >= MAX_CLOCK_DOMAINS) panic("Invalid clock domain %d", clockDomain); + if (portDomain >= MAX_PORT_DOMAINS) panic("Invalid port domain %d", portDomain); + + ProcessTreeNode* ptn = new ProcessTreeNode(procIdx, groupIdx, startFastForwarded, startPaused, syncedFastForward, clockDomain, portDomain, dumpHeartbeats, dumpsResetHeartbeats, restarts, mask, ffiPoints, syscallBlacklistRegex, gpr); + //info("Created ProcessTreeNode, procIdx %d", procIdx); + parent->addChild(ptn); + children.push_back(ptn); + + assert(procIdx == globProcVector.size()); + globProcVector.push_back(ptn); + + procIdx++; + groupIdx++; + idx++; + } + + for (uint32_t i = 0; i < children.size(); i++) { + std::stringstream p_ss; + p_ss << prefix << "process" << i << "."; + std::string childPrefix = p_ss.str(); + PopulateLevel(config, 
childPrefix, globProcVector, children[i], procIdx, groupIdx); + } +} + +void CreateProcessTree(Config& config) { + ProcessTreeNode* rootNode = new ProcessTreeNode(-1, -1, false, false, false, 0, 0, 0, false, 0, g_vector {}, g_vector {}, g_string {}, NULL); + uint32_t procIdx = 0; + uint32_t groupIdx = 0; + std::vector globProcVector; + + PopulateLevel(config, std::string(""), globProcVector, rootNode, procIdx, groupIdx); + + if (procIdx > (uint32_t)zinfo->lineSize) panic("Cannot simulate more than sys.lineSize=%d processes (address spaces will get aliased), %d specified", zinfo->lineSize, procIdx); + + zinfo->procTree = rootNode; + zinfo->numProcs = procIdx; + zinfo->numProcGroups = groupIdx; + + zinfo->procArray = gm_calloc(zinfo->lineSize /*max procs*/); //note we can add processes later, so we size it to the maximum + for (uint32_t i = 0; i < procIdx; i++) zinfo->procArray[i] = globProcVector[i]; + + zinfo->procExited = gm_calloc(zinfo->lineSize /*max procs*/); +} + diff --git a/src/process_tree.h b/src/process_tree.h new file mode 100644 index 00000000..d217d1d3 --- /dev/null +++ b/src/process_tree.h @@ -0,0 +1,145 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef PROCESS_TREE_H_ +#define PROCESS_TREE_H_ + +#include "g_std/g_string.h" +#include "g_std/g_vector.h" +#include "galloc.h" +#include "log.h" +#include "zsim.h" + +class Config; + +class ProcessTreeNode : public GlobAlloc { + private: + g_vector children; + const char* patchRoot; //used in syscall patching + uint32_t procIdx; + const uint32_t groupIdx; + volatile uint32_t curChildren; + volatile uint64_t heartbeats; + bool started; + volatile bool inFastForward; + volatile bool inPause; + uint32_t restartsLeft; + const bool syncedFastForward; //if true, make sim wait when fast-forwarding + const uint32_t clockDomain; + const uint32_t portDomain; + const uint64_t dumpHeartbeats; + const bool dumpsResetHeartbeats; + const g_vector mask; + const g_vector ffiPoints; + const g_string syscallBlacklistRegex; + + public: + ProcessTreeNode(uint32_t _procIdx, uint32_t _groupIdx, bool _inFastForward, bool _inPause, bool _syncedFastForward, + uint32_t _clockDomain, uint32_t _portDomain, uint64_t _dumpHeartbeats, bool _dumpsResetHeartbeats, uint32_t _restarts, + const g_vector& _mask, const g_vector& _ffiPoints, const g_string& _syscallBlacklistRegex, const char*_patchRoot) + : patchRoot(_patchRoot), procIdx(_procIdx), groupIdx(_groupIdx), curChildren(0), heartbeats(0), started(false), inFastForward(_inFastForward), + inPause(_inPause), restartsLeft(_restarts), syncedFastForward(_syncedFastForward), clockDomain(_clockDomain), portDomain(_portDomain), dumpHeartbeats(_dumpHeartbeats), dumpsResetHeartbeats(_dumpsResetHeartbeats), mask(_mask), ffiPoints(_ffiPoints), syscallBlacklistRegex(_syscallBlacklistRegex) {} + + void addChild(ProcessTreeNode* child) { + children.push_back(child); + } + + ProcessTreeNode* getNextChild() { + if (curChildren == children.size()) { //allocate a new child + uint32_t childProcIdx = __sync_fetch_and_add(&zinfo->numProcs, 1); + if (childProcIdx >= (uint32_t)zinfo->lineSize) { + panic("Cannot simulate more than sys.lineSize=%d processes (to avoid aliasing), limit reached", zinfo->lineSize); + } + ProcessTreeNode* child = new ProcessTreeNode(*this); + child->procIdx = childProcIdx; + child->started = false; + child->curChildren = 0; + child->heartbeats = 0; + child->children.clear(); + addChild(child); + zinfo->procArray[childProcIdx] = child; + info("Created child process %d on the fly, inheriting %d's config", childProcIdx, procIdx); + } + + assert_msg(curChildren < children.size(), "ProcessTreeNode::getNextChild, procIdx=%d curChildren=%d numChildren=%ld", procIdx, curChildren, children.size()); + return children[curChildren++]; + } + + uint32_t getProcIdx() const {return procIdx;} + uint32_t getGroupIdx() const {return groupIdx;} + + //Returns true if this is an actual first start, false otherwise (e.g. 
an exec) + bool notifyStart(); + + //Returns true if this is the last process to end, false otherwise + bool notifyEnd() __attribute__((warn_unused_result)); + + void heartbeat(); + + const char* getPatchRoot() const { + return patchRoot; + } + + inline bool isInFastForward() const { return inFastForward; } + inline bool isInPause() const { return inPause; } + inline bool getSyncedFastForward() const { return syncedFastForward; } + + //In cpp file, they need to access zinfo + void enterFastForward(); + void exitFastForward(); + + inline uint32_t getClockDomain() const { + return clockDomain; + } + + inline uint32_t getPortDomain() const { + return portDomain; + } + + void exitPause() { + assert(inPause); + inPause = false; + __sync_synchronize(); + } + + const g_vector& getMask() const { + return mask; + } + + const g_vector& getFFIPoints() const { + return ffiPoints; + } + + const g_string& getSyscallBlacklistRegex() const { + return syscallBlacklistRegex; + } + + //Currently there's no API to get back to a paused state; processes can start in a paused state, but once they are unpaused, they are unpaused for good +}; + +void CreateProcessTree(Config& config); + + +#endif // PROCESS_TREE_H_ diff --git a/src/profile_stats.h b/src/profile_stats.h new file mode 100644 index 00000000..61132802 --- /dev/null +++ b/src/profile_stats.h @@ -0,0 +1,115 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef PROFILE_STATS_H_ +#define PROFILE_STATS_H_ + +/* Stats used to profile the simulator */ + +#include +#include "stats.h" + +//Helper function +inline uint64_t getNs() { + struct timespec ts; + //guaranteed synchronized across processors, averages 20ns/call on Ubuntu 12.04... Linux hrtimers have gotten really good! In comparison, rdtsc is 9ns. + clock_gettime(CLOCK_REALTIME, &ts); + return 1000000000L*ts.tv_sec + ts.tv_nsec; +} + +/* Implements a single stopwatch-style cumulative clock. Useful to profile isolated events. + * get() accounts for current interval if clock is running. + */ +class ClockStat : public ScalarStat { + private: + uint64_t startNs; + uint64_t totalNs; + + public: + ClockStat() : ScalarStat(), startNs(0), totalNs(0) {} + + void start() { + assert(!startNs); + startNs = getNs(); + } + + void end() { + assert(startNs); + uint64_t endNs = getNs(); + assert(endNs >= startNs) + totalNs += (endNs - startNs); + startNs = 0; + } + + uint64_t get() const { + return totalNs + (startNs? 
(getNs() - startNs) : 0); + } +}; + +/* Implements multi-state time profiling. Always starts at state 0. + * Using this with an enum will help retain your sanity. Does not stop, + * so just transition to a dummy state if you want to stop profiling. + * count() accounts for partial time in current state; count() is used + * because we extend VectorCounter (TODO: we should have a VectorStat) + */ +class TimeBreakdownStat : public VectorCounter { + private: + uint32_t curState; + uint64_t startNs; + + public: + TimeBreakdownStat() : VectorCounter() {} + + virtual void init(const char* name, const char* desc, uint32_t size) { + VectorCounter::init(name, desc, size); + curState = 0; + startNs = getNs(); + } + + //I need to define this even though it is completely unnecessary, but only if I override init. gcc bug or C++ oddity? + virtual void init(const char* name, const char* desc, uint32_t size, const char** names) { + VectorCounter::init(name, desc, size, names); //will call our init(name, desc, size) + } + + void transition(uint32_t newState) { + assert(curState < size()); + assert(newState < size()); + + uint64_t curNs = getNs(); + assert(curNs >= startNs); + + inc(curState, curNs - startNs); + //info("%d: %ld / %ld", curState, curNs - startNs, VectorCounter::count(curState)); + curState = newState; + startNs = curNs; + } + + inline virtual uint64_t count(uint32_t idx) const { + uint64_t partial = VectorCounter::count(idx); + return partial + ((idx == curState)? (getNs() - startNs) : 0); + } +}; + +#endif // PROFILE_STATS_H_ diff --git a/src/rdtsc.h b/src/rdtsc.h new file mode 100644 index 00000000..aa4a8e44 --- /dev/null +++ b/src/rdtsc.h @@ -0,0 +1,43 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef RDTSC_H_ +#define RDTSC_H_ + +/* Functions to read the timestamp counter */ + +#include + +#if defined(__x86_64__) +static inline uint64_t rdtsc() { + uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)lo) | (((uint64_t)hi) << 32); +} +#else +#error "No rdtsc() available for this arch" +#endif + +#endif // RDTSC_H_ diff --git a/src/repl_policies.h b/src/repl_policies.h new file mode 100644 index 00000000..3e08c892 --- /dev/null +++ b/src/repl_policies.h @@ -0,0 +1,476 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. 
+ * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef REPL_POLICIES_H_ +#define REPL_POLICIES_H_ + +#include +#include "bithacks.h" +#include "cache_arrays.h" +#include "coherence_ctrls.h" +#include "memory_hierarchy.h" +#include "mtrand.h" + +/* Generic replacement policy interface. A replacement policy is initialized by the cache (by calling setTop/BottomCC) and used by the cache array. Usage follows two models: + * - On lookups, update() is called if the replacement policy is to be updated on a hit + * - On each replacement, rank() is called with the req and a list of replacement candidates. + * - When the replacement is done, replaced() is called. (See below for more detail.) + */ +class ReplPolicy : public GlobAlloc { + protected: + CC* cc; //coherence controller, used to figure out whether candidates are valid or number of sharers + + public: + ReplPolicy() : cc(NULL) {} + + virtual void setCC(CC* _cc) {cc = _cc;} + + virtual void update(uint32_t id, const MemReq* req) = 0; + virtual void replaced(uint32_t id) = 0; + + virtual uint32_t rankCands(const MemReq* req, SetAssocCands cands) = 0; + virtual uint32_t rankCands(const MemReq* req, ZCands cands) = 0; + + virtual void initStats(AggregateStat* parent) {} +}; + +/* Add DECL_RANK_BINDINGS to each class that implements the new interface, + * then implement a single, templated rank() function (see below for examples) + * This way, we achieve a simple, single interface that is specialized transparently to each type of array + * (this code is performance-critical) + */ +#define DECL_RANK_BINDING(T) uint32_t rankCands(const MemReq* req, T cands) { return rank(req, cands); } +#define DECL_RANK_BINDINGS DECL_RANK_BINDING(SetAssocCands); DECL_RANK_BINDING(ZCands); + +/* Legacy support. + * - On each replacement, the controller first calls startReplacement(), indicating the line that will be inserted; + * then it calls recordCandidate() for each candidate it finds; finally, it calls getBestCandidate() to get the + * line chosen for eviction. When the replacement is done, replaced() is called. The division of getBestCandidate() + * and replaced() happens because the former is called in preinsert(), and the latter in postinsert(). Note how the + * same restrictions on concurrent insertions extend to this class, i.e. startReplacement()/recordCandidate()/ + * getBestCandidate() will be atomic, but there may be intervening update() calls between getBestCandidate() and + * replaced(). 
+ */ +class LegacyReplPolicy : public virtual ReplPolicy { + protected: + virtual void startReplacement(const MemReq* req) {} //many policies don't need it + virtual void recordCandidate(uint32_t id) = 0; + virtual uint32_t getBestCandidate() = 0; + + public: + template inline uint32_t rank(const MemReq* req, C cands) { + startReplacement(req); + for (auto ci = cands.begin(); ci != cands.end(); ci.inc()) { + recordCandidate(*ci); + } + return getBestCandidate(); + } + + DECL_RANK_BINDINGS; +}; + +/* Plain ol' LRU, though this one is sharers-aware, prioritizing lines that have + * sharers down in the hierarchy vs lines not shared by anyone. + */ +template +class LRUReplPolicy : public ReplPolicy { + protected: + uint64_t timestamp; // incremented on each access + uint64_t* array; + uint32_t numLines; + + public: + explicit LRUReplPolicy(uint32_t _numLines) : timestamp(1), numLines(_numLines) { + array = gm_calloc(numLines); + } + + ~LRUReplPolicy() { + gm_free(array); + } + + void update(uint32_t id, const MemReq* req) { + array[id] = timestamp++; + } + + void replaced(uint32_t id) { + array[id] = 0; + } + + template inline uint32_t rank(const MemReq* req, C cands) { + uint32_t bestCand = -1; + uint64_t bestScore = (uint64_t)-1L; + for (auto ci = cands.begin(); ci != cands.end(); ci.inc()) { + uint32_t s = score(*ci); + bestCand = (s < bestScore)? *ci : bestCand; + bestScore = MIN(s, bestScore); + } + return bestCand; + } + + DECL_RANK_BINDINGS; + + private: + inline uint64_t score(uint32_t id) { //higher is least evictable + //array[id] < timestamp always, so this prioritizes by: + // (1) valid (if not valid, it's 0) + // (2) sharers, and + // (3) timestamp + return (sharersAware? cc->numSharers(id) : 0)*timestamp + array[id]*cc->isValid(id); + } +}; + +//This is VERY inefficient, uses LRU timestamps to do something that in essence requires a few bits. +//If you want to use this frequently, consider a reimplementation +class TreeLRUReplPolicy : public LRUReplPolicy { + private: + uint32_t* candArray; + uint32_t numCands; + uint32_t candIdx; + + public: + TreeLRUReplPolicy(uint32_t _numLines, uint32_t _numCands) : LRUReplPolicy(_numLines), numCands(_numCands), candIdx(0) { + candArray = gm_calloc(numCands); + if (numCands & (numCands-1)) panic("Tree LRU needs a power of 2 candidates, %d given", numCands); + } + + ~TreeLRUReplPolicy() { + gm_free(candArray); + } + + void recordCandidate(uint32_t id) { + candArray[candIdx++] = id; + } + + uint32_t getBestCandidate() { + assert(candIdx == numCands); + uint32_t start = 0; + uint32_t end = numCands; + + while (end - start > 1) { + uint32_t pivot = start + (end - start)/2; + uint64_t t1 = 0; + uint64_t t2 = 0; + for (uint32_t i = start; i < pivot; i++) t1 = MAX(t1, array[candArray[i]]); + for (uint32_t i = pivot; i < end; i++) t2 = MAX(t2, array[candArray[i]]); + if (t1 > t2) start = pivot; + else end = pivot; + } + //for (uint32_t i = 0; i < numCands; i++) printf("%8ld ", array[candArray[i]]); + //info(" res: %d (%d %ld)", start, candArray[start], array[candArray[start]]); + return candArray[start]; + } + + void replaced(uint32_t id) { + candIdx = 0; + array[id] = 0; + } +}; + +//2-bit NRU, see A new Case for Skew-Associativity, A. 
Seznec, 1997 +class NRUReplPolicy : public LegacyReplPolicy { + private: + //read-only + uint32_t* array; + uint32_t* candArray; + uint32_t numLines; + uint32_t numCands; + + //read-write + uint32_t youngLines; + uint32_t candVal; + uint32_t candIdx; + + public: + NRUReplPolicy(uint32_t _numLines, uint32_t _numCands) :numLines(_numLines), numCands(_numCands), youngLines(0), candIdx(0) { + array = gm_calloc(numLines); + candArray = gm_calloc(numCands); + candVal = (1<<20); + } + + ~NRUReplPolicy() { + gm_free(array); + gm_free(candArray); + } + + void update(uint32_t id, const MemReq* req) { + //if (array[id]) info("update PRE %d %d %d", id, array[id], youngLines); + youngLines += 1 - (array[id] >> 1); //+0 if young, +1 if old + array[id] |= 0x2; + + if (youngLines >= numLines/2) { + //info("youngLines = %d, shifting", youngLines); + for (uint32_t i = 0; i < numLines; i++) array[i] >>= 1; + youngLines = 0; + } + //info("update POST %d %d %d", id, array[id], youngLines); + } + + void recordCandidate(uint32_t id) { + uint32_t iVal = array[id]; + if (iVal < candVal) { + candVal = iVal; + candArray[0] = id; + candIdx = 1; + } else if (iVal == candVal) { + candArray[candIdx++] = id; + } + } + + uint32_t getBestCandidate() { + assert(candIdx > 0); + return candArray[youngLines % candIdx]; // youngLines used to sort-of-randomize + } + + void replaced(uint32_t id) { + //info("repl %d val %d cands %d", id, array[id], candIdx); + candVal = (1<<20); + candIdx = 0; + array[id] = 0; + } +}; + +class RandReplPolicy : public LegacyReplPolicy { + private: + //read-only + uint32_t* candArray; + uint32_t numCands; + + //read-write + MTRand rnd; + uint32_t candVal; + uint32_t candIdx; + + public: + explicit RandReplPolicy(uint32_t _numCands) : numCands(_numCands), rnd(0x23A5F + (uint64_t)this), candIdx(0) { + candArray = gm_calloc(numCands); + } + + ~RandReplPolicy() { + gm_free(candArray); + } + + void update(uint32_t id, const MemReq* req) {} + + void recordCandidate(uint32_t id) { + candArray[candIdx++] = id; + } + + uint32_t getBestCandidate() { + assert(candIdx == numCands); + uint32_t idx = rnd.randInt(numCands-1); + return candArray[idx]; + } + + void replaced(uint32_t id) { + candIdx = 0; + } +}; + +class LFUReplPolicy : public LegacyReplPolicy { + private: + uint64_t timestamp; // incremented on each access + int32_t bestCandidate; // id + struct LFUInfo { + uint64_t ts; + uint64_t acc; + }; + LFUInfo* array; + uint32_t numLines; + + //NOTE: Rank code could be shared across Replacement policy implementations + struct Rank { + LFUInfo lfuInfo; + uint32_t sharers; + bool valid; + + void reset() { + valid = false; + sharers = 0; + lfuInfo.ts = 0; + lfuInfo.acc = 0; + } + + inline bool lessThan(const Rank& other, const uint64_t curTs) const { + if (!valid && other.valid) { + return true; + } else if (valid == other.valid) { + if (sharers == 0 && other.sharers > 0) { + return true; + } else if (sharers > 0 && other.sharers == 0) { + return false; + } else { + if (lfuInfo.acc == 0) return true; + if (other.lfuInfo.acc == 0) return false; + uint64_t ownInvFreq = (curTs - lfuInfo.ts)/lfuInfo.acc; //inverse frequency, lower is better + uint64_t otherInvFreq = (curTs - other.lfuInfo.ts)/other.lfuInfo.acc; + return ownInvFreq > otherInvFreq; + } + } + return false; + } + }; + + Rank bestRank; + + public: + explicit LFUReplPolicy(uint32_t _numLines) : timestamp(1), bestCandidate(-1), numLines(_numLines) { + array = gm_calloc(numLines); + bestRank.reset(); + } + + ~LFUReplPolicy() { + gm_free(array); + } + + void 
update(uint32_t id, const MemReq* req) { + //ts is the "center of mass" of all the accesses, i.e. the average timestamp + array[id].ts = (array[id].acc*array[id].ts + timestamp)/(array[id].acc + 1); + array[id].acc++; + timestamp += 1000; //have larger steps to avoid losing too much resolution over successive divisions + } + + void recordCandidate(uint32_t id) { + Rank candRank = {array[id], cc? cc->numSharers(id) : 0, cc->isValid(id)}; + + if (bestCandidate == -1 || candRank.lessThan(bestRank, timestamp)) { + bestRank = candRank; + bestCandidate = id; + } + } + + uint32_t getBestCandidate() { + assert(bestCandidate != -1); + return (uint32_t)bestCandidate; + } + + void replaced(uint32_t id) { + bestCandidate = -1; + bestRank.reset(); + array[id].acc = 0; + } +}; + +//Extends a given replacement policy to profile access ordering violations +template +class ProfViolReplPolicy : public T { + private: + struct AccTimes { + uint64_t read; + uint64_t write; + }; + + AccTimes* accTimes; + + Counter profRAW, profWAR, profRAR, profWAW, profNoViolAcc; + Counter profAAE, profNoViolEv; //access after eviction violation + + uint64_t replCycle; + + public: + //using T::T; //C++11, but can't do in gcc yet + + //Since this is only used with LRU, let's do that... + explicit ProfViolReplPolicy(uint32_t nl) : T(nl) {} + + void init(uint32_t numLines) { + accTimes = gm_calloc(numLines); + replCycle = 0; + } + + void initStats(AggregateStat* parentStat) { + T::initStats(parentStat); + profRAW.init("vRAW", "RAW violations (R simulated before preceding W)"); + profWAR.init("vWAR", "WAR violations (W simulated before preceding R)"); + profRAR.init("vRAR", "RAR violations (R simulated before preceding R)"); + profWAW.init("vWAW", "WAW violations (W simulated before preceding W)"); + profAAE.init("vAAE", "Access simulated before preceding eviction"); + profNoViolAcc.init("noViolAcc", "Accesses without R/WAR/W violations"); + profNoViolEv.init("noViolEv", "Evictions without AAE violations"); + + parentStat->append(&profRAW); + parentStat->append(&profWAR); + parentStat->append(&profRAR); + parentStat->append(&profWAW); + parentStat->append(&profAAE); + parentStat->append(&profNoViolAcc); + parentStat->append(&profNoViolEv); + } + + void update(uint32_t id, const MemReq* req) { + T::update(id, req); + + bool read = (req->type == GETS); + assert(read || req->type == GETX); + uint64_t cycle = req->cycle; + + if (cycle < MAX(accTimes[id].read, accTimes[id].write)) { //violation + //Now have to determine order + bool readViol; + if (cycle < MIN(accTimes[id].read, accTimes[id].write)) { //before both + readViol = (accTimes[id].read < accTimes[id].write); //read is closer + } else if (cycle < accTimes[id].read) { //write, current access, read -> XAR viol + readViol = true; + } else { //read, current access, write -> XAW viol + assert(cycle < accTimes[id].write); + readViol = false; + } + + //Record + read? (readViol? profRAR.inc() : profRAW.inc()) : (readViol? 
profWAR.inc() : profWAW.inc()); + + //info("0x%lx viol read %d readViol %d cycles: %ld | r %ld w %ld", req->lineAddr, read, readViol, cycle, accTimes[id].read, accTimes[id].write); + } else { + profNoViolAcc.inc(); + } + + //Record + if (read) accTimes[id].read = MAX(accTimes[id].read, req->cycle); + else accTimes[id].write = MAX(accTimes[id].write, req->cycle); + + T::update(id, req); + } + + void startReplacement(const MemReq* req) { + T::startReplacement(req); + + replCycle = req->cycle; + } + + void replaced(uint32_t id) { + T::replaced(id); + + if (replCycle < MAX(accTimes[id].read, accTimes[id].write)) { + profAAE.inc(); + } else { + profNoViolEv.inc(); + } + + //Reset --- update() will set correctly + accTimes[id].read = 0; + accTimes[id].write = 0; + } +}; + +#endif // REPL_POLICIES_H_ diff --git a/src/scheduler.cpp b/src/scheduler.cpp new file mode 100644 index 00000000..945c2682 --- /dev/null +++ b/src/scheduler.cpp @@ -0,0 +1,399 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include +#include +#include +#include "config.h" // for ParseList +#include "pin.H" +#include "process_tree.h" +#include "profile_stats.h" +#include "scheduler.h" +#include "str.h" +#include "virt/syscall_name.h" + +//The scheduler class started simple, but at some point having it all in the header is too ridiculous. Migrate non perf-intensive calls here! (all but sync, really) + +#define WATCHDOG_INTERVAL_USEC (50) +#define WATCHDOG_MAX_MULTIPLER (40) //50us-2ms waits +#define WATCHDOG_STALL_THRESHOLD (100) + +//#define DEBUG_FL(args...) info(args) +#define DEBUG_FL(args...) + +//#define DEBUG_FUTEX(args...) info(args) +#define DEBUG_FUTEX(args...) + +// Unlike glibc's sleep functions suck, this ensures guaranteed minimum sleep time +static void TrueSleep(uint32_t usecs) { + struct timespec req; + struct timespec rem; + + req.tv_sec = usecs/1000000; + req.tv_nsec = (usecs*1000) % 1000000000; + + while (req.tv_sec != 0 || req.tv_nsec != 0) { + int res = syscall(SYS_nanosleep, &req, &rem); //we don't call glibc's nanosleep because errno is not thread-safe in pintools. + if (res == 0) break; + req = rem; + if (res != -EINTR && res != 0) panic("nanosleep() returned an unexpected error code %d", res); + //info("nanosleep() interrupted!"); + } +} + +/* Hacky way to figure out if a thread is sleeping on a certain futex. 
+ * + * Uses /proc//task//syscall, which is only set when the process is + * actually sleeping on the syscall, not just in the kernel (see Linux kernel + * docs). This interface has been available since ~2008. + */ +bool IsSleepingInFutex(uint32_t linuxPid, uint32_t linuxTid, uintptr_t futexAddr) { + std::string fname = "/proc/" + Str(linuxPid) + "/task/" + Str(linuxTid) + "/syscall"; + std::ifstream fs(fname); + if (!fs.is_open()) { + warn("Could not open %s", fname.c_str()); + return false; + } + + std::stringstream ss; + ss << fs.rdbuf(); + fs.close(); + + std::vector argList = ParseList(ss.str()); + bool match = argList.size() >= 2 && + strtoul(argList[0].c_str(), NULL, 0) == SYS_futex && + (uintptr_t)strtoul(argList[1].c_str(), NULL, 0) == futexAddr; + //info("%s | %s | SYS_futex = %d futexAddr = 0x%lx | match = %d ", ss.str().c_str(), Str(argList).c_str(), SYS_futex, futexAddr, match); + return match; +} + + +void Scheduler::watchdogThreadFunc() { + info("Started scheduler watchdog thread"); + uint64_t lastPhase = 0; + int multiplier = 1; + uint64_t lastMs = 0; + uint64_t fakeLeaveStalls = 0; + while (true) { + TrueSleep(multiplier*WATCHDOG_INTERVAL_USEC); + + if (zinfo->terminationConditionMet) { + // Synchronize to avoid racing with EndOfPhaseActions code + // (zinfo->terminationConditionMet is set on EndOfPhaseActions, + // which has schedLock held, we must let it finish) + futex_lock(&schedLock); + info("Terminating scheduler watchdog thread"); + futex_unlock(&schedLock); + SimEnd(); + } + + //Fastpath (unlocked, benign read races, only modifies local state) + if (lastPhase != curPhase && pendingPidCleanups.size() == 0) { + lastPhase = curPhase; + fakeLeaveStalls = 0; + if (multiplier < WATCHDOG_MAX_MULTIPLER) multiplier++; + continue; + } + + //if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) info("Mult %d curPhase %ld", multiplier, curPhase); + + futex_lock(&schedLock); + + if (lastPhase == curPhase && !fakeLeaves.empty() && (fakeLeaves.front()->th->futexJoin.action != FJA_WAKE)) { + if (++fakeLeaveStalls >= WATCHDOG_STALL_THRESHOLD) { + info("Detected possible stall due to fake leaves (%ld current)", fakeLeaves.size()); + // Uncomment to print all leaves + FakeLeaveInfo* pfl = fakeLeaves.front(); + while (pfl) { + info(" [%d/%d] %s (%d) @ 0x%lx", getPid(pfl->th->gid), getTid(pfl->th->gid), GetSyscallName(pfl->syscallNumber), pfl->syscallNumber, pfl->pc); + pfl = pfl->next; + } + + // Trigger a leave() on the first process, if the process's blacklist regex allows it + FakeLeaveInfo* fl = fakeLeaves.front(); + ThreadInfo* th = fl->th; + uint32_t pid = getPid(th->gid); + uint32_t tid = getTid(th->gid); + uint32_t cid = th->cid; + + const g_string& sbRegexStr = zinfo->procArray[pid]->getSyscallBlacklistRegex(); + std::regex sbRegex(sbRegexStr.c_str()); + if (std::regex_match(GetSyscallName(fl->syscallNumber), sbRegex)) { + // If this is the last leave we catch, it is the culprit for sure -> blacklist it + // Over time, this will blacklist every blocking syscall + // The root reason for being conservative though is that we don't have a sure-fire + // way to distinguish IO waits from truly blocking syscalls (TODO) + if (fakeLeaves.size() == 1) { + info("Blacklisting from future fake leaves: [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx", pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1); + blockingSyscalls[pid].insert(fl->pc); + } + + finishFakeLeave(th); + + futex_unlock(&schedLock); + leave(pid, tid, cid); + 
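+                        // schedLock was dropped above so leave() could synchronize on the
+                        // scheduler itself; reacquire it before resuming the watchdog scan.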
futex_lock(&schedLock); + } else { + info("Skipping, [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx does not match blacklist regex (%s)", + pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1, sbRegexStr.c_str()); + } + fakeLeaveStalls = 0; + } + } else { + fakeLeaveStalls = 0; + } + + if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) { + //info("Watchdog Thread: Sleep dep detected...") + int64_t wakeupPhase = sleepQueue.front()->wakeupPhase; + int64_t wakeupCycles = (wakeupPhase - curPhase)*zinfo->phaseLength; + int64_t wakeupUsec = (wakeupCycles > 0)? wakeupCycles/zinfo->freqMHz : 0; + + //info("Additional usecs of sleep %ld", wakeupUsec); + if (wakeupUsec > 10*1000*1000) warn("Watchdog sleeping for a long time due to long sleep, %ld secs", wakeupUsec/1000/1000); + + futex_unlock(&schedLock); + TrueSleep(WATCHDOG_INTERVAL_USEC + wakeupUsec); + futex_lock(&schedLock); + + if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) { + ThreadInfo* sth = sleepQueue.front(); + uint64_t curMs = curPhase*zinfo->phaseLength/zinfo->freqMHz/1000; + uint64_t endMs = sth->wakeupPhase*zinfo->phaseLength/zinfo->freqMHz/1000; + (void)curMs; (void)endMs; //make gcc happy + if (curMs > lastMs + 1000) { + info("Watchdog Thread: Driving time forward to avoid deadlock on sleep (%ld -> %ld ms)", curMs, endMs); + lastMs += 1000; + } + while (sth->state == SLEEPING) { + idlePhases.inc(); + callback(); //sth will eventually get woken up + + if (futex_haswaiters(&schedLock)) { + //happens commonly with multiple sleepers and very contended I/O... + //info("Sched: Threads waiting on advance, startPhase %ld curPhase %ld", lastPhase, curPhase); + break; + } + + if (zinfo->terminationConditionMet) { + info("Termination condition met inside watchdog thread loop, exiting"); + break; + } + } + idlePeriods.inc(); + multiplier = 0; + } + } + + if (multiplier < WATCHDOG_MAX_MULTIPLER) { + multiplier++; + } + + lastPhase = curPhase; + + //Lazily clean state of processes that terminated abruptly + //NOTE: For now, we rely on the process explicitly telling us that it's going to terminate. + //We could make this self-checking by periodically checking for liveness of the processes we're supposedly running. + //The bigger problem is that if we get SIGKILL'd, we may not even leave a consistent zsim state behind. 
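+        // Cleanup is deferred until the OS-level process is actually gone: the loop below
+        // polls /proc/<osPid>, and if the entry still exists it retries on a later watchdog
+        // iteration. processCleanup() itself runs with schedLock released, and the lock is
+        // retaken before the next pending pid is examined.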
+ while (pendingPidCleanups.size()) { + std::pair p = pendingPidCleanups.back(); + uint32_t pid = p.first; //the procIdx pid + uint32_t osPid = p.second; + + std::stringstream ss; + ss << "/proc/" << osPid; + struct stat dummy; + if (stat(ss.str().c_str(), &dummy) == 0) { + info("[watchdog] Deferring cleanup of pid %d (%d), not finished yet", pid, osPid); + break; + } + + pendingPidCleanups.pop_back(); //must happen while we have the lock + + futex_unlock(&schedLock); + processCleanup(pid); + futex_lock(&schedLock); + } + + if (terminateWatchdogThread) { + futex_unlock(&schedLock); + break; + } else { + futex_unlock(&schedLock); + } + } + info("Finished scheduler watchdog thread"); +} + +void Scheduler::threadTrampoline(void* arg) { + Scheduler* sched = static_cast(arg); + sched->watchdogThreadFunc(); +} + +void Scheduler::startWatchdogThread() { + PIN_SpawnInternalThread(threadTrampoline, this, 64*1024, NULL); +} + + +// Accurate join-leave implementation +void Scheduler::syscallLeave(uint32_t pid, uint32_t tid, uint32_t cid, uint64_t pc, int syscallNumber, uint64_t arg0, uint64_t arg1) { + futex_lock(&schedLock); + uint32_t gid = getGid(pid, tid); + ThreadInfo* th = contexts[cid].curThread; + assert(th->gid == gid); + assert_msg(th->cid == cid, "%d != %d", th->cid, cid); + assert(th->state == RUNNING); + assert_msg(pid < blockingSyscalls.size(), "%d >= %ld?", pid, blockingSyscalls.size()); + + bool blacklisted = blockingSyscalls[pid].find(pc) != blockingSyscalls[pid].end(); + if (blacklisted || th->markedForSleep) { + DEBUG_FL("%s @ 0x%lx calling leave(), reason: %s", GetSyscallName(syscallNumber), pc, blacklisted? "blacklist" : "sleep"); + futex_unlock(&schedLock); + leave(pid, tid, cid); + } else { + DEBUG_FL("%s @ 0x%lx skipping leave()", GetSyscallName(syscallNumber), pc); + FakeLeaveInfo* si = new FakeLeaveInfo(pc, th, syscallNumber, arg0, arg1); + fakeLeaves.push_back(si); + // FIXME(dsm): zsim.cpp's SyscallEnter may be checking whether we are in a syscall and not calling us. 
+ // If that's the case, this would be stale, which may lead to some false positives/negatives + futex_unlock(&schedLock); + } +} + +/* Wake/wait matching code */ + +// External interface, must be non-blocking +void Scheduler::notifyFutexWakeStart(uint32_t pid, uint32_t tid, uint32_t maxWakes) { + futex_lock(&schedLock); + ThreadInfo* th = gidMap[getGid(pid, tid)]; + DEBUG_FUTEX("[%d/%d] wakeStart max %d", pid, tid, maxWakes); + assert(th->futexJoin.action == FJA_NONE); + + // Programs sometimes call FUTEX_WAIT with maxWakes = UINT_MAX to wake + // everyone waiting on it; we cap to a reasonably high number to avoid + // overflows on maxAllowedFutexWakeups + maxWakes = MIN(maxWakes, 1<<24 /*16M wakes*/); + + maxAllowedFutexWakeups += maxWakes; + th->futexJoin.maxWakes = maxWakes; + futex_unlock(&schedLock); +} + +void Scheduler::notifyFutexWakeEnd(uint32_t pid, uint32_t tid, uint32_t wokenUp) { + futex_lock(&schedLock); + ThreadInfo* th = gidMap[getGid(pid, tid)]; + DEBUG_FUTEX("[%d/%d] wakeEnd woken %d", pid, tid, wokenUp); + th->futexJoin.action = FJA_WAKE; + th->futexJoin.wokenUp = wokenUp; + futex_unlock(&schedLock); +} + +void Scheduler::notifyFutexWaitWoken(uint32_t pid, uint32_t tid) { + futex_lock(&schedLock); + ThreadInfo* th = gidMap[getGid(pid, tid)]; + DEBUG_FUTEX("[%d/%d] waitWoken", pid, tid); + th->futexJoin = {FJA_WAIT, 0, 0}; + futex_unlock(&schedLock); +} + +// Internal, called with schedLock held +void Scheduler::futexWakeJoin(ThreadInfo* th) { // may release schedLock + assert(th->futexJoin.action == FJA_WAKE); + + uint32_t maxWakes = th->futexJoin.maxWakes; + uint32_t wokenUp = th->futexJoin.wokenUp; + + // Adjust allowance + assert(maxWakes <= maxAllowedFutexWakeups); + assert(wokenUp <= maxWakes); + maxAllowedFutexWakeups -= (maxWakes - wokenUp); + + assert(unmatchedFutexWakeups <= maxAllowedFutexWakeups); // should panic... + + DEBUG_FUTEX("Futex wake matching %d %d", unmatchedFutexWakeups, maxAllowedFutexWakeups); + + while (true) { + futex_unlock(&schedLock); + uint64_t startNs = getNs(); + uint32_t iters = 0; + while (wokenUp > unmatchedFutexWakeups) { + TrueSleep(10*(1 + iters)); // linear backoff, start small but avoid overwhelming the OS with short sleeps + iters++; + uint64_t curNs = getNs(); + if (curNs - startNs > (2L<<31L) /* ~2s */) { + futex_lock(&schedLock); + warn("Futex wake matching failed (%d/%d) (external/ff waiters?)", unmatchedFutexWakeups, wokenUp); + unmatchedFutexWakeups = 0; + maxAllowedFutexWakeups -= wokenUp; + return; + } + } + + futex_lock(&schedLock); + + // Recheck after acquire, may have concurrent wakes here + if (wokenUp <= unmatchedFutexWakeups) { + unmatchedFutexWakeups -= wokenUp; + maxAllowedFutexWakeups -= wokenUp; + break; + } + } + + DEBUG_FUTEX("Finished futex wake matching"); +} + +void Scheduler::futexWaitJoin(ThreadInfo* th) { + assert(th->futexJoin.action == FJA_WAIT); + if (unmatchedFutexWakeups >= maxAllowedFutexWakeups) { + warn("External futex wakes? 
(%d/%d)", unmatchedFutexWakeups, maxAllowedFutexWakeups); + } else { + unmatchedFutexWakeups++; + } +} + +void Scheduler::finishFakeLeave(ThreadInfo* th) { + assert(th->fakeLeave); + DEBUG_FL("%s (%d) @ 0x%lx finishFakeLeave()", GetSyscallName(th->fakeLeave->syscallNumber), th->fakeLeave->syscallNumber, th->fakeLeave->pc); + assert_msg(th->state == RUNNING, "gid 0x%x invalid state %d", th->gid, th->state); + FakeLeaveInfo* si = th->fakeLeave; + fakeLeaves.remove(si); + delete si; + assert(th->fakeLeave == NULL); +} + +void Scheduler::waitUntilQueued(ThreadInfo* th) { + uint64_t startNs = getNs(); + uint32_t sleepUs = 1; + while(!IsSleepingInFutex(th->linuxTid, th->linuxTid, (uintptr_t)&schedLock)) { + TrueSleep(sleepUs++); // linear backoff, start small but avoid overwhelming the OS with short sleeps + uint64_t curNs = getNs(); + if (curNs - startNs > (2L<<31L) /* ~2s */) { + warn("waitUntilQueued for pid %d tid %d timed out", getPid(th->gid), getTid(th->gid)); + return; + } + } +} + diff --git a/src/scheduler.h b/src/scheduler.h new file mode 100644 index 00000000..53fea71d --- /dev/null +++ b/src/scheduler.h @@ -0,0 +1,840 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef SCHEDULER_H_ +#define SCHEDULER_H_ + +#include +#include +#include +#include +#include +#include "barrier.h" +#include "constants.h" +#include "core.h" +#include "g_std/g_unordered_map.h" +#include "g_std/g_unordered_set.h" +#include "g_std/g_vector.h" +#include "intrusive_list.h" +#include "process_stats.h" +#include "stats.h" +#include "zsim.h" + +/** + * TODO (dsm): This class is due for a heavy pass or rewrite. Some things are more complex than they should: + * - The OUT state is unnecessary. It is done as a weak link between a thread that left and its context to preserve affinity, but + * there are far easier ways to implement this. + * - Should allow for complete separation of scheduling policies. Done to some degree (schedContext, etc.), but it should be cleaner. + * - wakeup() takes a needsJoin param that is computed per thread, but the barrier operates per core. This discrepancy manifests itself + * in a corner case: if we kill a process, the watchdog reclaims its slots, and the system is overcommitted, sometimes we don't do + * a join when we should. 
+ * - It should be clearer who transisions threads/contexts among states (the thread taking the context, the one giving the context?), + * I think this can potentially lead to races. + */ + + +/* Performs (pid, tid) -> cid translation; round-robin scheduling with no notion of locality or heterogeneity... */ + +class Scheduler : public GlobAlloc, public Callee { + private: + enum ThreadState { + STARTED, //transient state, thread will do a join immediately after + RUNNING, //has cid assigned, managed by the phase barrier + OUT, //in leave() this phase, can rejoin immediately + BLOCKED, //inside a system call, no cid assigned, not in the barrier or the runqueue + SLEEPING, //inside a patched sleep syscall; no cid assigned, in sleepQueue; it is our responsibility to wake this thread up when its deadline arrives + QUEUED //in the runqueue + }; + + enum ContextState { + IDLE, + USED + }; + + void (*atSyncFunc)(void); //executed by syncing thread while others are waiting. Good for non-thread-safe stuff + Barrier bar; + uint32_t numCores; + uint32_t schedQuantum; //in phases + + struct FakeLeaveInfo; + + enum FutexJoinAction {FJA_NONE, FJA_WAKE, FJA_WAIT}; + struct FutexJoinInfo { + FutexJoinAction action; + uint32_t maxWakes; + uint32_t wokenUp; + }; + + struct ThreadInfo : GlobAlloc, InListNode { + const uint32_t gid; + const uint32_t linuxPid; + const uint32_t linuxTid; + + ThreadState state; + uint32_t cid; //only current if RUNNING; otherwise, it's the last one used. + + volatile ThreadInfo* handoffThread; //if at the end of a sync() this is not NULL, we need to transfer our current context to the thread pointed here. + volatile uint32_t futexWord; + volatile bool needsJoin; //after waiting on the scheduler, should we join the barrier, or is our cid good to go already? + + bool markedForSleep; //if true, we will go to sleep on the next leave() + uint64_t wakeupPhase; //if SLEEPING, when do we have to wake up? 
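+        // Summary of the typical ThreadState transitions driven by this class (the
+        // authoritative logic is in join()/leave()/sync()/callback() below):
+        //   STARTED/BLOCKED -> RUNNING    join() finds an eligible context
+        //   STARTED/BLOCKED -> QUEUED     join() finds none; wait in runQueue
+        //   QUEUED   -> RUNNING           schedContext()/schedTick() hand us a context
+        //   RUNNING  -> OUT               leave() with no waiter; context kept lazily
+        //   OUT      -> RUNNING|BLOCKED   rejoin in the same phase, or context stolen
+        //   RUNNING  -> BLOCKED|SLEEPING  leave() hands the context off, or markedForSleep
+        //   RUNNING  -> QUEUED            quantum-end handoff in sync()
+        //   SLEEPING -> BLOCKED           deadline reached, or early notifySleepEnd()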
+ + g_vector mask; + + FakeLeaveInfo* fakeLeave; // for accurate join-leaves, see below + + FutexJoinInfo futexJoin; + + ThreadInfo(uint32_t _gid, uint32_t _linuxPid, uint32_t _linuxTid, const g_vector& _mask) : + InListNode(), gid(_gid), linuxPid(_linuxPid), linuxTid(_linuxTid), mask(_mask) + { + state = STARTED; + cid = 0; + handoffThread = NULL; + futexWord = 0; + markedForSleep = false; + wakeupPhase = 0; + assert(mask.size() == zinfo->numCores); + uint32_t count = 0; + for (auto b : mask) if (b) count++; + if (count == 0) panic("Empty mask on gid %d!", gid); + fakeLeave = NULL; + futexJoin.action = FJA_NONE; + } + }; + + struct ContextInfo : InListNode { + uint32_t cid; + ContextState state; + ThreadInfo* curThread; //only current if used, otherwise NULL + }; + + g_unordered_map gidMap; + g_vector contexts; + + InList freeList; + + InList runQueue; + InList outQueue; + InList sleepQueue; //contains all the sleeping threads, it is ORDERED by wakeup time + + PAD(); + lock_t schedLock; + PAD(); + + uint64_t curPhase; + //uint32_t nextVictim; + MTRand rnd; + + volatile bool terminateWatchdogThread; + + g_vector> pendingPidCleanups; //(pid, osPid) pairs of abruptly terminated processes + + //Stats + Counter threadsCreated, threadsFinished; + Counter scheduleEvents, waitEvents, handoffEvents, sleepEvents; + Counter idlePhases, idlePeriods; + VectorCounter occHist, runQueueHist; + uint32_t scheduledThreads; + + // gid <-> (pid, tid) xlat functions + inline uint32_t getGid(uint32_t pid, uint32_t tid) const {return (pid << 16) | tid;} + inline uint32_t getPid(uint32_t gid) const {return gid >> 16;} + inline uint32_t getTid(uint32_t gid) const {return gid & 0x0FFFF;} + + public: + Scheduler(void (*_atSyncFunc)(void), uint32_t _parallelThreads, uint32_t _numCores, uint32_t _schedQuantum) : + atSyncFunc(_atSyncFunc), bar(_parallelThreads, this), numCores(_numCores), schedQuantum(_schedQuantum), rnd(0x5C73D9134) + { + contexts.resize(numCores); + for (uint32_t i = 0; i < numCores; i++) { + contexts[i].cid = i; + contexts[i].state = IDLE; + contexts[i].curThread = NULL; + freeList.push_back(&contexts[i]); + } + schedLock = 0; + //nextVictim = 0; //only used when freeList is empty. + curPhase = 0; + scheduledThreads = 0; + + maxAllowedFutexWakeups = 0; + unmatchedFutexWakeups = 0; + + blockingSyscalls.resize(MAX_THREADS /* TODO: max # procs */); + + info("Started RR scheduler, quantum=%d phases", schedQuantum); + terminateWatchdogThread = false; + startWatchdogThread(); + } + + ~Scheduler() {} + + void initStats(AggregateStat* parentStat) { + AggregateStat* schedStats = new AggregateStat(); + schedStats->init("sched", "Scheduler stats"); + threadsCreated.init("thCr", "Threads created"); schedStats->append(&threadsCreated); + threadsFinished.init("thFn", "Threads finished"); schedStats->append(&threadsFinished); + scheduleEvents.init("schedEvs", "Schedule events"); schedStats->append(&scheduleEvents); + waitEvents.init("waitEvs", "Wait events"); schedStats->append(&waitEvents); + handoffEvents.init("handoffEvs", "Handoff events"); schedStats->append(&handoffEvents); + sleepEvents.init("sleepEvs", "Sleep events"); schedStats->append(&sleepEvents); + idlePhases.init("idlePhases", "Phases with no thread active"); schedStats->append(&idlePhases); + idlePeriods.init("idlePeriods", "Periods with no thread active"); schedStats->append(&idlePeriods); + occHist.init("occHist", "Occupancy histogram", numCores+1); schedStats->append(&occHist); + uint32_t runQueueHistSize = ((numCores > 16)? 
numCores : 16) + 1; + runQueueHist.init("rqSzHist", "Run queue size histogram", runQueueHistSize); schedStats->append(&runQueueHist); + parentStat->append(schedStats); + } + + void start(uint32_t pid, uint32_t tid, const g_vector& mask) { + futex_lock(&schedLock); + uint32_t gid = getGid(pid, tid); + //info("[G %d] Start", gid); + assert(gidMap.find(gid) == gidMap.end()); + // Get pid and tid straight from the OS + // - SYS_gettid because glibc does not implement gettid() + // - SYS_getpid because after a fork (where zsim calls ThreadStart), + // getpid() returns the parent's pid (getpid() caches, and I'm + // guessing it hasn't flushed its cached pid at this point) + gidMap[gid] = new ThreadInfo(gid, syscall(SYS_getpid), syscall(SYS_gettid), mask); + threadsCreated.inc(); + futex_unlock(&schedLock); + } + + void finish(uint32_t pid, uint32_t tid) { + futex_lock(&schedLock); + uint32_t gid = getGid(pid, tid); + //info("[G %d] Finish", gid); + assert(gidMap.find(gid) != gidMap.end()); + ThreadInfo* th = gidMap[gid]; + gidMap.erase(gid); + + // Check for suppressed syscall leave(), execute it + if (th->fakeLeave) { + finishFakeLeave(th); + futex_unlock(&schedLock); + leave(pid, tid, th->cid); + futex_lock(&schedLock); + } + + //dsm: Added this check; the normal sequence is leave, finish, but with fastFwd you never know + if (th->state == RUNNING) { + warn("RUNNING thread %d (cid %d) called finish(), trying leave() first", tid, th->cid); + futex_unlock(&schedLock); //FIXME: May be racey... + leave(pid, tid, th->cid); + futex_lock(&schedLock); + } + + assert_msg(th->state == STARTED /*might be started but in fastFwd*/ ||th->state == OUT || th->state == BLOCKED || th->state == QUEUED, "gid %d finish with state %d", gid, th->state); + if (th->state == QUEUED) { + assert(th->owner == &runQueue); + runQueue.remove(th); + } else if (th->owner) { + assert(th->owner == &outQueue); + outQueue.remove(th); + ContextInfo* ctx = &contexts[th->cid]; + deschedule(th, ctx, BLOCKED); + freeList.push_back(ctx); + //no need to try to schedule anything; this context was already being considered while in outQueue + //assert(runQueue.empty()); need not be the case with masks + //info("[G %d] Removed from outQueue and descheduled", gid); + } + //At this point noone holds pointer to th, it's out from all queues, and either on OUT or BLOCKED means it's not pending a handoff + delete th; + threadsFinished.inc(); + futex_unlock(&schedLock); + } + + uint32_t join(uint32_t pid, uint32_t tid) { + futex_lock(&schedLock); + //If leave was in this phase, call bar.join() + //Otherwise, try to grab a free context; if all are taken, queue up + uint32_t gid = getGid(pid, tid); + ThreadInfo* th = gidMap[gid]; + + //dsm 25 Oct 2012: Failed this assertion right after a fork when trying to simulate gedit. Very weird, cannot replicate. + //dsm 10 Apr 2013: I think I got it. We were calling sched->finish() too early when following exec. 
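+        // The futexJoin handling below pairs join() with the notifyFutex* hooks that the
+        // futex syscall instrumentation is expected to call (sketch of the intended
+        // ordering; the actual call sites live outside this file):
+        //   waker:  notifyFutexWakeStart(pid, tid, maxWakes) -> [real FUTEX_WAKE]
+        //           -> notifyFutexWakeEnd(pid, tid, wokenUp) -> join() -> futexWakeJoin()
+        //   waiter: [returns from FUTEX_WAIT] -> notifyFutexWaitWoken(pid, tid)
+        //           -> join() -> futexWaitJoin()
+        // so that wakes and woken waiters are matched before either side advances a phase.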
+ assert_msg(th, "gid not found %d pid %d tid %d", gid, pid, tid); + + if (unlikely(th->futexJoin.action != FJA_NONE)) { + if (th->futexJoin.action == FJA_WAIT) futexWaitJoin(th); + else futexWakeJoin(th); // may release and grab schedLock to delay our join, this is fine at this point + th->futexJoin.action = FJA_NONE; + } + + // If we're in a fake leave, no need to do anything + if (th->fakeLeave) { + finishFakeLeave(th); + uint32_t cid = th->cid; + futex_unlock(&schedLock); + return cid; + } + + assert(!th->markedForSleep); + + if (th->state == SLEEPING) { + /*panic(*/ warn("[%d] called join() while SLEEPING, early sleep termination, moving to BLOCKED", tid); + sleepQueue.remove(th); + th->state = BLOCKED; + } + + if (th->state == OUT) { + th->state = RUNNING; + outQueue.remove(th); + zinfo->cores[th->cid]->join(); + bar.join(th->cid, &schedLock); //releases lock + } else { + assert(th->state == BLOCKED || th->state == STARTED); + + ContextInfo* ctx = schedThread(th); + if (ctx) { + schedule(th, ctx); + zinfo->cores[th->cid]->join(); + bar.join(th->cid, &schedLock); //releases lock + } else { + th->state = QUEUED; + runQueue.push_back(th); + waitForContext(th); //releases lock, might join + } + } + + return th->cid; + } + + void leave(uint32_t pid, uint32_t tid, uint32_t cid) { + futex_lock(&schedLock); + //Just call bar.leave + uint32_t gid = getGid(pid, tid); + ThreadInfo* th = contexts[cid].curThread; + assert(th->gid == gid); + assert(th->state == RUNNING); + zinfo->cores[cid]->leave(); + + if (th->markedForSleep) { //transition to SLEEPING, eagerly deschedule + trace(Sched, "Sched: %d going to SLEEP, wakeup on phase %ld", gid, th->wakeupPhase); + th->markedForSleep = false; + ContextInfo* ctx = &contexts[cid]; + deschedule(th, ctx, SLEEPING); + + //Ordered insert into sleepQueue + if (sleepQueue.empty() || sleepQueue.front()->wakeupPhase > th->wakeupPhase) { + sleepQueue.push_front(th); + } else { + ThreadInfo* cur = sleepQueue.front(); + while (cur->next && cur->next->wakeupPhase <= th->wakeupPhase) { + cur = cur->next; + } + trace(Sched, "Put %d in sleepQueue (deadline %ld), after %d (deadline %ld)", gid, th->wakeupPhase, cur->gid, cur->wakeupPhase); + sleepQueue.insertAfter(cur, th); + } + sleepEvents.inc(); + + ThreadInfo* inTh = schedContext(ctx); + if (inTh) { + schedule(inTh, ctx); + zinfo->cores[ctx->cid]->join(); //inTh does not do a sched->join, so we need to notify the core since we just called leave() on it + wakeup(inTh, false /*no join, we did not leave*/); + } else { + freeList.push_back(ctx); + bar.leave(cid); //may trigger end of phase + } + } else { //lazily transition to OUT, where we retain our context + ContextInfo* ctx = &contexts[cid]; + ThreadInfo* inTh = schedContext(ctx); + if (inTh) { //transition to BLOCKED, sched inTh + deschedule(th, ctx, BLOCKED); + schedule(inTh, ctx); + zinfo->cores[ctx->cid]->join(); //inTh does not do a sched->join, so we need to notify the core since we just called leave() on it + wakeup(inTh, false /*no join, we did not leave*/); + } else { //lazily transition to OUT, where we retain our context + th->state = OUT; + outQueue.push_back(th); + bar.leave(cid); //may trigger end of phase + } + } + + futex_unlock(&schedLock); + } + + uint32_t sync(uint32_t pid, uint32_t tid, uint32_t cid) { + futex_lock(&schedLock); + ThreadInfo* th = contexts[cid].curThread; + assert(!th->markedForSleep); + bar.sync(cid, &schedLock); //releases lock, may trigger end of phase, may block us + + //No locks at this point; we need to check whether we need 
to hand off our context + if (th->handoffThread) { + futex_lock(&schedLock); // this can be made lock-free, but it's not worth the effort + ThreadInfo* dst = const_cast(th->handoffThread); // de-volatilize + th->handoffThread = NULL; + ContextInfo* ctx = &contexts[th->cid]; + deschedule(th, ctx, QUEUED); + schedule(dst, ctx); + wakeup(dst, false /*no join needed*/); + handoffEvents.inc(); + //info("%d starting handoff cid %d to gid %d", th->gid, ctx->cid, dst->gid); + + //We're descheduled and have completed the handoff. Now we need to see if we can be scheduled somewhere else. + ctx = schedThread(th); + if (ctx) { + //TODO: This should only arise in very weird cases (e.g., partially overlapping process masks), and has not been tested + warn("Sched: untested code path, check with Daniel if you see this"); + schedule(th, ctx); + //We need to do a join, because dst will not join + zinfo->cores[ctx->cid]->join(); + bar.join(ctx->cid, &schedLock); //releases lock + } else { + runQueue.push_back(th); + waitForContext(th); //releases lock, might join + } + } + + assert(th->state == RUNNING); + return th->cid; + } + + // This is called with schedLock held, and must not release it! + virtual void callback() { + //End of phase stats + assert(scheduledThreads <= numCores); + occHist.inc(scheduledThreads); + uint32_t rqPos = (runQueue.size() < (runQueueHist.size()-1))? runQueue.size() : (runQueueHist.size()-1); + runQueueHist.inc(rqPos); + + if (atSyncFunc) atSyncFunc(); //call the simulator-defined actions external to the scheduler + + /* End of phase accounting */ + zinfo->numPhases++; + zinfo->globPhaseCycles += zinfo->phaseLength; + curPhase++; + + assert(curPhase == zinfo->numPhases); //check they don't skew + + //Wake up all sleeping threads where deadline is met + if (!sleepQueue.empty()) { + ThreadInfo* th = sleepQueue.front(); + while (th && th->wakeupPhase <= curPhase) { + assert(th->wakeupPhase == curPhase); + trace(Sched, "%d SLEEPING -> BLOCKED, waking up from timeout syscall (curPhase %ld, wakeupPhase %ld)", th->gid, curPhase, th->wakeupPhase); + + // Try to deschedule ourselves + th->state = BLOCKED; + wakeup(th, false /*no join, this is sleeping out of the scheduler*/); + + sleepQueue.pop_front(); + th = sleepQueue.front(); + } + } + + //Handle rescheduling + if (runQueue.empty()) return; + + if ((curPhase % schedQuantum) == 0) { + schedTick(); + } + } + + volatile uint32_t* markForSleep(uint32_t pid, uint32_t tid, uint64_t wakeupPhase) { + futex_lock(&schedLock); + uint32_t gid = getGid(pid, tid); + trace(Sched, "%d marking for sleep", gid); + ThreadInfo* th = gidMap[gid]; + assert(!th->markedForSleep); + th->markedForSleep = true; + th->wakeupPhase = wakeupPhase; + th->futexWord = 1; //to avoid races, this must be set here. 
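+        // How the returned futexWord is meant to be used by the sleep-patching caller
+        // (illustrative sketch; the real caller lives in the syscall virtualization code):
+        //   volatile uint32_t* fw = sched->markForSleep(pid, tid, wakeupPhase);
+        //   sched->leave(pid, tid, cid);  // markedForSleep => thread transitions to SLEEPING
+        //   while (*fw == 1) syscall(SYS_futex, fw, FUTEX_WAIT, 1, NULL, NULL, 0);
+        //   sched->join(pid, tid);        // the scheduler already moved us SLEEPING -> BLOCKED
+        // (notifySleepEnd() covers the early-termination case, when the real syscall returns
+        //  before the virtual deadline and the thread is still SLEEPING)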
+ futex_unlock(&schedLock); + return &(th->futexWord); + } + + bool isSleeping(uint32_t pid, uint32_t tid) { + futex_lock(&schedLock); + uint32_t gid = getGid(pid, tid); + ThreadInfo* th = gidMap[gid]; + bool res = th->state == SLEEPING; + futex_unlock(&schedLock); + return res; + } + + void notifySleepEnd(uint32_t pid, uint32_t tid) { + futex_lock(&schedLock); + uint32_t gid = getGid(pid, tid); + ThreadInfo* th = gidMap[gid]; + assert(th->markedForSleep == false); + //Move to BLOCKED; thread will join pretty much immediately + assert(th->state == SLEEPING || th->state == BLOCKED); + if (th->state == BLOCKED) { + warn("Scheduler:notifySleepEnd: Benign race on SLEEPING->BLOCKED transition, thread is already blocked"); + } else { + sleepQueue.remove(th); + th->state = BLOCKED; + } + futex_unlock(&schedLock); + } + + void printThreadState(uint32_t pid, uint32_t tid) { + futex_lock(&schedLock); + uint32_t gid = getGid(pid, tid); + ThreadInfo* th = gidMap[gid]; + info("[%d] is in scheduling state %d", tid, th->state); + futex_unlock(&schedLock); + } + + void notifyTermination() { + /* dsm 2013-06-15: Traced a deadlock at termination down here... looks like with MT apps this lock is held at SimEnd. + * Leaving the lock off is safe now, but if this function gets more complex, we may have to rethink this. + */ + //futex_lock(&schedLock); + terminateWatchdogThread = true; + //futex_unlock(&schedLock); + } + + //Should be called when a process is terminated abruptly (e.g., through a signal). + //Walks the gidMap and calls leave/finish on all threads of the process. Not quite race-free, + //we could have private unlocked versions of leave, finifh, etc, but the key problem is that + //if you call this and any other thread in the process is still alive, then there is a + //much bigger problem. + void processCleanup(uint32_t pid) { + futex_lock(&schedLock); + std::vector doomedTids; + g_unordered_map::iterator it; + for (it = gidMap.begin(); it != gidMap.end(); it++) { + uint32_t gid = it->first; + if (getPid(gid) == pid) doomedTids.push_back(getTid(gid)); + } + futex_unlock(&schedLock); + + if (doomedTids.size()) { + for (uint32_t tid : doomedTids) { + if (isSleeping(pid, tid)) { + notifySleepEnd(pid, tid); + } + finish(pid, tid); + } + info("[sched] Cleaned up pid %d, %ld tids", pid, doomedTids.size()); + } + } + + //Calling doProcessCleanup on multithreaded processes leads to races, + //so we'll just have the watchdog thread to it once we're gone + void queueProcessCleanup(uint32_t pid, uint32_t osPid) { + futex_lock(&schedLock); + pendingPidCleanups.push_back(std::make_pair(pid, osPid)); + futex_unlock(&schedLock); + } + + uint32_t getScheduledPid(uint32_t cid) const { return (contexts[cid].state == USED)? 
getPid(contexts[cid].curThread->gid) : (uint32_t)-1; } + + private: + void schedule(ThreadInfo* th, ContextInfo* ctx) { + assert(th->state == STARTED || th->state == BLOCKED || th->state == QUEUED); + assert(ctx->state == IDLE); + assert(ctx->curThread == NULL); + th->state = RUNNING; + th->cid = ctx->cid; + ctx->state = USED; + ctx->curThread = th; + scheduleEvents.inc(); + scheduledThreads++; + //info("Scheduled %d <-> %d", th->gid, ctx->cid); + zinfo->cores[ctx->cid]->contextSwitch(th->gid); + } + + void deschedule(ThreadInfo* th, ContextInfo* ctx, ThreadState targetState) { + assert(th->state == RUNNING || th->state == OUT); + assert(ctx->state == USED); + assert(ctx->cid == th->cid); + assert(ctx->curThread == th); + assert(targetState == BLOCKED || targetState == QUEUED || targetState == SLEEPING); + th->state = targetState; + ctx->state = IDLE; + ctx->curThread = NULL; + scheduledThreads--; + //Notify core of context-switch eagerly. + //TODO: we may need more callbacks in the cores, e.g. in schedule(). Revise interface as needed... + zinfo->cores[ctx->cid]->contextSwitch(-1); + zinfo->processStats->notifyDeschedule(ctx->cid, getPid(th->gid)); + //info("Descheduled %d <-> %d", th->gid, ctx->cid); + } + + void waitForContext(ThreadInfo* th) { + th->futexWord = 1; + waitEvents.inc(); + //info("%d waiting to be scheduled", th->gid); + //printState(); + futex_unlock(&schedLock); + while (true) { + int futex_res = syscall(SYS_futex, &th->futexWord, FUTEX_WAIT, 1 /*a racing thread waking us up will change value to 0, and we won't block*/, NULL, NULL, 0); + if (futex_res == 0 || th->futexWord != 1) break; + } + //info("%d out of sched wait, got cid = %d, needsJoin = %d", th->gid, th->cid, th->needsJoin); + if (th->needsJoin) { + futex_lock(&schedLock); + assert(th->needsJoin); //re-check after the lock + zinfo->cores[th->cid]->join(); + bar.join(th->cid, &schedLock); + //info("%d join done", th->gid); + } + } + + void wakeup(ThreadInfo* th, bool needsJoin) { + th->needsJoin = needsJoin; + bool succ = __sync_bool_compare_and_swap(&th->futexWord, 1, 0); + if (!succ) panic("Wakeup race in barrier?"); + syscall(SYS_futex, &th->futexWord, FUTEX_WAKE, 1, NULL, NULL, 0); + waitUntilQueued(th); + } + + void printState() { + std::stringstream ss; + for (uint32_t c = 0; c < numCores; c++) { + if (contexts[c].state == IDLE) { + ss << " " << "___"; + } else { + ss << " " << std::setw(2) << contexts[c].curThread->gid; + if (contexts[c].curThread->state == RUNNING) ss << "r"; + else if (contexts[c].curThread->state == OUT) ss << "o"; + else panic("Invalid state cid=%d, threadState=%d", c, contexts[c].curThread->state); + } + } + info(" State: %s", ss.str().c_str()); + } + + + //Core scheduling functions + /* This is actually the interface that an abstract OS scheduler would have, and implements the scheduling policy: + * - schedThread(): Here's a thread that just became available; return either a ContextInfo* where to schedule it, or NULL if none are available + * - schedContext(): Here's a context that just became available; return either a ThreadInfo* to schedule on it, or NULL if none are available + * - schedTick(): Current quantum is over, hand off contexts to other threads as you see fit + * These functions can REMOVE from runQueue, outQueue, and freeList, but do not INSERT. These are filled in elsewhere. They also have minimal concerns + * for thread and context states. Those state machines are implemented and handled elsewhere, except where strictly necessary. 
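+     * Example with the current policy: a thread whose mask only allows core 3 gets NULL from
+     * schedThread() whenever context 3 is USED by a RUNNING thread, even if other contexts are
+     * free (they fail the mask check), so it is QUEUED until schedContext() or schedTick()
+     * hands context 3 over at a later leave() or quantum boundary.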
+ */ + ContextInfo* schedThread(ThreadInfo* th) { + ContextInfo* ctx = NULL; + + //First, try to get scheduled in the last context we were running at + assert(th->cid < numCores); //though old, it should be in a valid range + if (contexts[th->cid].state == IDLE && th->mask[th->cid]) { + ctx = &contexts[th->cid]; + freeList.remove(ctx); + } + + //Second, check the freeList + if (!ctx && !freeList.empty()) { + ContextInfo* c = freeList.front(); + while (c) { + if (th->mask[c->cid]) { + ctx = c; + freeList.remove(ctx); + break; + } else { + c = c->next; + } + } + } + + //Third, try to steal from the outQueue (block a thread, take its cid) + if (!ctx && !outQueue.empty()) { + ThreadInfo* outTh = outQueue.front(); + while (outTh) { + if (th->mask[outTh->cid]) { + ctx = &contexts[outTh->cid]; + outQueue.remove(outTh); + deschedule(outTh, ctx, BLOCKED); + break; + } else { + outTh = outTh->next; + } + } + } + + if (ctx) assert(th->mask[ctx->cid]); + + //info("schedThread done, gid %d, success %d", th->gid, ctx != NULL); + //printState(); + return ctx; + } + + ThreadInfo* schedContext(ContextInfo* ctx) { + ThreadInfo* th = NULL; + ThreadInfo* blockedTh = runQueue.front(); //NULL if empty + while (blockedTh) { + if (blockedTh->mask[ctx->cid]) { + th = blockedTh; + runQueue.remove(blockedTh); + break; + } else { + blockedTh = blockedTh->next; + } + } + + //info("schedContext done, cid %d, success %d (gid %d)", ctx->cid, th != NULL, th? th->gid : 0); + //printState(); + return th; + } + + void schedTick() { + std::vector availVec; + availVec.resize(zinfo->numCores); + for (uint32_t i = 0; i < zinfo->numCores; i++) availVec[i] = i; + + //Random shuffle (Fisher-Yates) + for (uint32_t i = zinfo->numCores - 1; i > 0; i--) { + uint32_t j = rnd.randInt(i); //j is in 0,...,i + std::swap(availVec[i], availVec[j]); + } + + std::list avail(availVec.begin(), availVec.end()); + + /* NOTE: avail has all cores, including those in freeList, which may not be empty. + * But we will never match anything in the freeList, because schedContext and + * schedThread would have matched them out. So, no need to prioritize the freeList. + */ + + uint32_t contextSwitches = 0; + + ThreadInfo* th = runQueue.front(); + while (th && !avail.empty()) { + bool scheduled = false; + for (std::list::iterator it = avail.begin(); it != avail.end(); it++) { + uint32_t cid = *it; + if (th->mask[cid]) { + ContextInfo* ctx = &contexts[cid]; + ThreadInfo* victimTh = ctx->curThread; + assert(victimTh); + victimTh->handoffThread = th; + contextSwitches++; + + scheduled = true; + avail.erase(it); + break; + } + } + + ThreadInfo* pth = th; + th = th->next; + if (scheduled) runQueue.remove(pth); + } + + info("Time slice ended, context-switched %d threads, runQueue size %ld, available %ld", contextSwitches, runQueue.size(), avail.size()); + printState(); + } + + //Watchdog thread functions + /* With sleeping threads, we have to drive time forward if no thread is scheduled and some threads are sleeping; otherwise, we can deadlock. + * This initially was the responsibility of the last leaving thread, but led to horribly long syscalls being simulated. For example, if you + * have 2 threads, 1 is sleeping and the other one goes on a syscall, it had to drive time fwd to wake the first thread up, on the off-chance + * that the impending syscall was blocking, to avoid deadlock. + * Instead, we have an auxiliary thread check for this condition periodically, and if all threads are sleeping or blocked, we just drive time + * forward. 
+ */ + void startWatchdogThread(); + void watchdogThreadFunc(); + + static void threadTrampoline(void* arg); + + /* Accurate and adaptive join-leave + * + * Threads leave() on a syscall enter and join() when they return, which desyncs them from the simulation to prevent deadlock through syscalls. + * In practice this is often not an issue because most syscalls are short enough that they finish before the phase changes. However, with highly + * overcommitted systems and system-intensive apps, we've started seeing some timeing leakage. The old syscall_funcs reduced this problem by avoiding + * a leave on safe syscalls, but that solution was quite restrictive: there are many syscalls that could theoretically block, but never do. Additionally, + * futexes and sleeps, which are blocking but for which we can accurately infer their join phase, may suffer from inaccurate joins. + * + * To this end, the following interface supports an adaptive join-leave implementation that avoids most desyncs: + * - Threads should call syscallLeave() and syscallJoin(), passing their PC and a small syscall descriptor for a few syscalls of interest. + * - The scheduler adaptively decides whether we should wait for a syscall to join or to start the next phase. It avoids deadlock by having + * the watchdog detect potential deadlocks, and desyncing the threads. To avoid frequent desyncs, it blacklists syscalls + * - When the scheduler wakes up a sleeping thread (e.g., in a timeout syscall), it ensures the phase does not slip by. + * - When the scheduler sees a FUTEX_WAKE, it ensures we wait for the woken-up thread(s). + * + * TODO: This code is currently written to be as independent as possible from the other sched and barrier code. + * If it works well, the code should be reorganized and simplified. + */ + private: + // All structures protected by schedLock + + // Per-process per-PC blacklist + g_vector< g_unordered_set > blockingSyscalls; + + struct FakeLeaveInfo : GlobAlloc, InListNode { + const uint64_t pc; + ThreadInfo* const th; + const int syscallNumber; + const uint64_t arg0; // kept for reference + const uint64_t arg1; // kept for reference + + FakeLeaveInfo(uint64_t _pc, ThreadInfo* _th, int _syscallNumber, uint64_t _arg0, uint64_t _arg1) : + pc(_pc), th(_th), syscallNumber(_syscallNumber), arg0(_arg0), arg1(_arg1) + { + assert(th->fakeLeave == NULL); + th->fakeLeave = this; + } + + ~FakeLeaveInfo() { + assert(th->fakeLeave == this); + th->fakeLeave = NULL; + } + }; + + // All active syscalls that are still in the simulator (no leave()) have an entry here + InList fakeLeaves; + + // TODO: Futex wait/wake matching code + + public: + // Externally, has the exact same behavior as leave(); internally, may choose to not actually leave; + // join() and finish() handle this state + void syscallLeave(uint32_t pid, uint32_t tid, uint32_t cid, uint64_t pc, int syscallNumber, uint64_t arg0, uint64_t arg1); + + // Futex wake/wait matching interface + void notifyFutexWakeStart(uint32_t pid, uint32_t tid, uint32_t maxWakes); + void notifyFutexWakeEnd(uint32_t pid, uint32_t tid, uint32_t wokenUp); + void notifyFutexWaitWoken(uint32_t pid, uint32_t tid); + + private: + volatile uint32_t maxAllowedFutexWakeups; + volatile uint32_t unmatchedFutexWakeups; + + // Called with schedLock held, at the start of a join + void futexWakeJoin(ThreadInfo* th); // may release and re-acquire schedLock + void futexWaitJoin(ThreadInfo* th); + + + void finishFakeLeave(ThreadInfo* th); + + /* Must be called with schedLock held. 
Waits until the given thread is + * queued in schedLock. Used for accurate wakeups, by calling here we + * ensure that the waking thread won't skip a phase. May cause deadlock + * if used incorrectly. + */ + void waitUntilQueued(ThreadInfo* th); +}; + + +#endif // SCHEDULER_H_ diff --git a/src/simple_core.cpp b/src/simple_core.cpp new file mode 100644 index 00000000..65a7be1c --- /dev/null +++ b/src/simple_core.cpp @@ -0,0 +1,127 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "simple_core.h" +#include "filter_cache.h" +#include "zsim.h" + +SimpleCore::SimpleCore(FilterCache* _l1i, FilterCache* _l1d, g_string& _name) : Core(_name), l1i(_l1i), l1d(_l1d), instrs(0), curCycle(0), haltedCycles(0) { +} + +void SimpleCore::initStats(AggregateStat* parentStat) { + AggregateStat* coreStat = new AggregateStat(); + coreStat->init(name.c_str(), "Core stats"); + auto x = [this]() -> uint64_t { assert(curCycle >= haltedCycles); return curCycle - haltedCycles; }; + auto cyclesStat = makeLambdaStat(x); + cyclesStat->init("cycles", "Simulated cycles"); + ProxyStat* instrsStat = new ProxyStat(); + instrsStat->init("instrs", "Simulated instructions", &instrs); + coreStat->append(cyclesStat); + coreStat->append(instrsStat); + parentStat->append(coreStat); +} + +uint64_t SimpleCore::getPhaseCycles() const { + return curCycle % zinfo->phaseLength; +} + +void SimpleCore::load(Address addr) { + curCycle = l1d->load(addr, curCycle); +} + +void SimpleCore::store(Address addr) { + curCycle = l1d->store(addr, curCycle); +} + +void SimpleCore::bbl(Address bblAddr, BblInfo* bblInfo) { + //info("BBL %s %p", name.c_str(), bblInfo); + //info("%d %d", bblInfo->instrs, bblInfo->bytes); + instrs += bblInfo->instrs; + curCycle += bblInfo->instrs; + + Address endBblAddr = bblAddr + bblInfo->bytes; + for (Address fetchAddr = bblAddr; fetchAddr < endBblAddr; fetchAddr+=(1 << lineBits)) { + curCycle = l1i->load(fetchAddr, curCycle); + } +} + +void SimpleCore::contextSwitch(int32_t gid) { + if (gid == -1) { + l1i->contextSwitch(); + l1d->contextSwitch(); + } +} + +void SimpleCore::join() { + //info("[%s] Joining, curCycle %ld phaseEnd %ld haltedCycles %ld", name.c_str(), curCycle, phaseEndCycle, haltedCycles); + if (curCycle < zinfo->globPhaseCycles) { //carry up to the beginning of the phase + haltedCycles += (zinfo->globPhaseCycles - curCycle); + curCycle = zinfo->globPhaseCycles; + } + phaseEndCycle = zinfo->globPhaseCycles + zinfo->phaseLength; + //note that with long events, 
curCycle can be arbitrarily larger than phaseEndCycle; however, it must be aligned in current phase + //info("[%s] Joined, curCycle %ld phaseEnd %ld haltedCycles %ld", name.c_str(), curCycle, phaseEndCycle, haltedCycles); +} + + +//Static class functions: Function pointers and trampolines + +InstrFuncPtrs SimpleCore::GetFuncPtrs() { + return {LoadFunc, StoreFunc, BblFunc, BranchFunc, PredLoadFunc, PredStoreFunc, FPTR_ANALYSIS, {0}}; +} + +void SimpleCore::LoadFunc(THREADID tid, ADDRINT addr) { + static_cast(cores[tid])->load(addr); +} + +void SimpleCore::StoreFunc(THREADID tid, ADDRINT addr) { + static_cast(cores[tid])->store(addr); +} + +void SimpleCore::PredLoadFunc(THREADID tid, ADDRINT addr, BOOL pred) { + if (pred) static_cast(cores[tid])->load(addr); +} + +void SimpleCore::PredStoreFunc(THREADID tid, ADDRINT addr, BOOL pred) { + if (pred) static_cast(cores[tid])->store(addr); +} + +void SimpleCore::BblFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + SimpleCore* core = static_cast(cores[tid]); + core->bbl(bblAddr, bblInfo); + + while (core->curCycle > core->phaseEndCycle) { + assert(core->phaseEndCycle == zinfo->globPhaseCycles + zinfo->phaseLength); + core->phaseEndCycle += zinfo->phaseLength; + + uint32_t cid = getCid(tid); + //NOTE: TakeBarrier may take ownership of the core, and so it will be used by some other thread. If TakeBarrier context-switches us, + //the *only* safe option is to return inmmediately after we detect this, or we can race and corrupt core state. If newCid == cid, + //we're not at risk of racing, even if we were switched out and then switched in. + uint32_t newCid = TakeBarrier(tid, cid); + if (newCid != cid) break; /*context-switch*/ + } +} + diff --git a/src/simple_core.h b/src/simple_core.h new file mode 100644 index 00000000..13a57b30 --- /dev/null +++ b/src/simple_core.h @@ -0,0 +1,76 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef SIMPLE_CORE_H_ +#define SIMPLE_CORE_H_ + +//A simple core model with IPC=1 except on memory accesses + +#include "core.h" +#include "memory_hierarchy.h" +#include "pad.h" + +class FilterCache; + +class SimpleCore : public Core { + protected: + FilterCache* l1i; + FilterCache* l1d; + + uint64_t instrs; + uint64_t curCycle; + uint64_t phaseEndCycle; //next stopping point + uint64_t haltedCycles; + + public: + SimpleCore(FilterCache* _l1i, FilterCache* _l1d, g_string& _name); + void initStats(AggregateStat* parentStat); + + uint64_t getInstrs() const {return instrs;} + uint64_t getPhaseCycles() const; + uint64_t getCycles() const {return curCycle - haltedCycles;} + + void contextSwitch(int32_t gid); + virtual void join(); + + InstrFuncPtrs GetFuncPtrs(); + + protected: + //Simulation functions + inline void load(Address addr); + inline void store(Address addr); + inline void bbl(Address bblAddr, BblInfo* bblInstrs); + + static void LoadFunc(THREADID tid, ADDRINT addr); + static void StoreFunc(THREADID tid, ADDRINT addr); + static void BblFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo); + static void PredLoadFunc(THREADID tid, ADDRINT addr, BOOL pred); + static void PredStoreFunc(THREADID tid, ADDRINT addr, BOOL pred); + + static void BranchFunc(THREADID, ADDRINT, BOOL, ADDRINT, ADDRINT) {} +} ATTR_LINE_ALIGNED; //This needs to take up a whole cache line, or false sharing will be extremely frequent + +#endif // SIMPLE_CORE_H_ + diff --git a/src/stats.h b/src/stats.h new file mode 100644 index 00000000..99c8f328 --- /dev/null +++ b/src/stats.h @@ -0,0 +1,423 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +/* Statistics facilities + * Author: Daniel Sanchez + * Date: Aug 2010 + * + * There are four basic types of stats: + * - Counter: A plain single counter. + * - VectorCounter: A fixed-size vector of logically related counters. Each + * vector element may be unnamed or named (useful when enum-indexed vectors). + * - Histogram: A GEMS-style histogram, intended to profile a distribution. + * It has a fixed amount of buckets, and buckets are resized as samples + * are added, making profiling increasingly coarser but keeping storage + * space constant. Unlike GEMS-style stats, though, at some configurable + * point part of the array starts growing logarithmically, to capture + * outliers without hurting accuracy of most samples. 
+ * - ProxyStat takes a function pointer uint64_t(*)(void) at initialization, + * and calls it to get its value. It is used for cases where a stat can't + * be stored as a counter (e.g. aggregates, RDTSC, performance counters,...) + * or where we have values we want to output just as stats, but would not + * like to treat as raw counters because e.g. they have a different type, + * or for efficiency reasons (e.g. the per-thread phase cycles count is + * updated on every BBL, and may be an uint32_t) + * + * Groups of stats are contained in aggregates (AggregateStat), representing + * a collection of stats. At initialization time, all stats are registered + * with an aggregate, forming a tree of stats. After all stats are + * initialized, the tree of stats is made immutable; no new stats can be + * created and output at runtime. + * + * These facilities are created with three goals in mind: + * 1) Allow stats to be independent of stats output: Simulator code is only + * concerned with creating, naming, describing and updating a hierarchy of + * stats. We can then use a variety of *stats backends* to traverse and + * output the stats, either periodically or at specific events. + * 2) High-performance stats: Updating counters should be as fast as updating raw + * integers. Counters are objects though, so they entail some space overhead. + * 3) Allow fixed-size stats output: The stat types supported are all fixed-size, + * and stats cannot be created after initialization. This allows fixed-size records, + * making periodic stats much easier to parse and **iterate over** (e.g. we can + * parse 1% of the samples for a high-level graph without bringing the whole stats + * file from disk, then zoom in on a specific portion, etc.). + * + * This design was definitely influenced by the M5 stats facilities, however, + * it is significantly simpler, doesn't use templates or has formula support, + * and has an emphasis on fixed-size records for periodic stats. + */ + +#ifndef STATS_H_ +#define STATS_H_ + +/* TODO: I want these to be POD types, but polymorphism (needed by dynamic_cast) probably disables it. Dang. */ + +#include +#include +#include "g_std/g_vector.h" +#include "log.h" + +class Stat : public GlobAlloc { + protected: + const char* _name; + const char* _desc; + + public: + Stat() : _name(NULL), _desc(NULL) {} + + virtual ~Stat() {} + + const char* name() const { + assert(_name); + return _name; + } + + const char* desc() const { + assert(_desc); + return _desc; + } + + protected: + virtual void initStat(const char* name, const char* desc) { + assert(name); + assert(desc); + assert(!_name); + assert(!_desc); + _name = name; + _desc = desc; + } +}; + +class AggregateStat : public Stat { + private: + g_vector _children; + bool _isMutable; + bool _isRegular; + + public: + /* An aggregate stat is regular if all its children are 1) aggregate and 2) of the same type (e.g. all the threads). + * This lets us express all the subtypes of instances of a common datatype, and this collection as an array. It is + * useful with HDF5, where we would otherwise be forced to have huge compund datatypes, which HDF5 can't do after some + * point. + */ + explicit AggregateStat(bool isRegular = false) : Stat(), _isMutable(true), _isRegular(isRegular) {} + + void init(const char* name, const char* desc) { + assert(_isMutable); + initStat(name, desc); + } + + //Returns true if it is a non-empty type, false otherwise. Empty types are culled by the parent. 
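+    // Typical lifecycle of a stats tree (illustrative sketch; the real wiring lives in the
+    // simulator init code, and the names below are made up for the example):
+    //   AggregateStat* root = new AggregateStat();
+    //   root->init("root", "Root stats");
+    //   scheduler->initStats(root);        // components append their own sub-aggregates
+    //   root->makeImmutable();             // freeze the tree before handing it to backends
+    //   StatsBackend* txt = new TextBackend("zsim.out", root);
+    //   txt->dump(false);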
+ bool makeImmutable() { + assert(_isMutable); + assert(_name != NULL); //Should have been initialized + _isMutable = false; + g_vector::iterator it; + g_vector newChildren; + for (it = _children.begin(); it != _children.end(); it++) { + Stat* s = *it; + AggregateStat* as = dynamic_cast(s); + if (as) { + bool emptyChild = as->makeImmutable(); + if (!emptyChild) newChildren.push_back(s); + } else { + newChildren.push_back(s); + } + } + _children = newChildren; + return _children.size() == 0; + } + + void append(Stat* child) { + assert(_isMutable); + _children.push_back(child); + } + + uint32_t size() const { + assert(!_isMutable); + return _children.size(); + } + + bool isRegular() const { + return _isRegular; + } + + Stat* get(uint32_t idx) const { + assert(!_isMutable); + return _children[idx]; + } +}; + +/* + * General scalar class + * FIXME: All other scalar stats should derive from this + */ +class ScalarStat : public Stat { + public: + ScalarStat() : Stat() {} + + virtual void init(const char* name, const char* desc) { + initStat(name, desc); + } + + virtual uint64_t get() const = 0; +}; + +class VectorStat : public Stat { + protected: + const char** _counterNames; + + public: + VectorStat() : _counterNames(NULL) {} + + virtual uint64_t count(uint32_t idx) const = 0; + virtual uint32_t size() const = 0; + + inline bool hasCounterNames() { + return (_counterNames != NULL); + } + + inline const char* counterName(uint32_t idx) const { + return (_counterNames == NULL)? NULL : _counterNames[idx]; + } + + virtual void init(const char* name, const char* desc) { + initStat(name, desc); + } +}; + + +class Counter : public Stat { + private: + uint64_t _count; + + public: + Counter() : Stat(), _count(0) {} + + void init(const char* name, const char* desc) { + initStat(name, desc); + _count = 0; + } + + inline void inc(uint64_t delta) { + _count += delta; + } + + inline void inc() { + _count++; + } + + inline void atomicInc(uint64_t delta) { + __sync_fetch_and_add(&_count, delta); + } + + inline void atomicInc() { + __sync_fetch_and_add(&_count, 1); + } + + inline uint64_t count() const { + return _count; + } + + inline void set(uint64_t data) { + _count = data; + } +}; + +class VectorCounter : public VectorStat { + private: + g_vector _counters; + + public: + VectorCounter() : VectorStat() {} + + /* Without counter names */ + virtual void init(const char* name, const char* desc, uint32_t size) { + initStat(name, desc); + assert(size > 0); + _counters.resize(size); + for (uint32_t i = 0; i < size; i++) _counters[i] = 0; + _counterNames = NULL; + } + + /* With counter names */ + virtual void init(const char* name, const char* desc, uint32_t size, const char** counterNames) { + init(name, desc, size); + assert(counterNames); + _counterNames = gm_dup(counterNames, size); + } + + inline void inc(uint32_t idx, uint64_t value) { + _counters[idx] += value; + } + + inline void inc(uint32_t idx) { + _counters[idx]++; + } + + inline void atomicInc(uint32_t idx, uint64_t delta) { + __sync_fetch_and_add(&_counters[idx], delta); + } + + inline void atomicInc(uint32_t idx) { + __sync_fetch_and_add(&_counters[idx], 1); + } + + inline virtual uint64_t count(uint32_t idx) const { + return _counters[idx]; + } + + inline uint32_t size() const { + return _counters.size(); + } +/* + inline bool hasCounterNames() { + return (_counterNames != NULL); + } + + inline const char* counterName(uint32_t idx) const { + return (_counterNames == NULL)? 
NULL : _counterNames[idx]; + }*/ +}; + +/* +class Histogram : public Stat { + //TBD +}; +*/ + +class ProxyStat : public Stat { + private: + uint64_t* _statPtr; + + public: + ProxyStat() : Stat(), _statPtr(NULL) {} + + void init(const char* name, const char* desc, uint64_t* ptr) { + initStat(name, desc); + _statPtr = ptr; + } + + inline uint64_t stat() const { + assert(_statPtr); //TBD we may want to make this work only with volatiles... + return *_statPtr; + } +}; + + +class ProxyFuncStat : public Stat { + private: + uint64_t (*_func)(); + + public: + ProxyFuncStat() : Stat(), _func(NULL) {} + + void init(const char* name, const char* desc, uint64_t (*func)()) { + initStat(name, desc); + _func = func; + } + + //Hmmm, this is a const function but the function pointer we use is not necessarily const. Oh well, it works. + inline uint64_t stat() const { + assert(_func); + return _func(); + } +}; + +/* + * Generic lambda stats + * If your stat depends on a formula, this lets you encode it compactly using C++11 lambdas + * + * Usage example: + * auto x = [this]() { return curCycle - haltedCycles; }; //declare the lambda function that computes the stat; note this is captured because these values are class members + * LambdaStat* cyclesStat = new LambdaStat(x); //instantiate the templated stat. Each lambda has a unique type, which you get with decltype + * cyclesStat->init("cycles", "Simulated cycles"); //etc. Use as an usual stat! + */ +template +class LambdaStat : public ScalarStat { + private: + F f; + + public: + explicit LambdaStat(F _f) : f(_f) {} //copy the lambda + uint64_t get() const {return f();} +}; + +template +class LambdaVectorStat : public VectorStat { + private: + F f; + uint32_t s; + + public: + LambdaVectorStat(F _f, uint32_t _s) : VectorStat(), f(_f), s(_s) {} + uint32_t size() const { return s; } + uint64_t count(uint32_t idx) const { //dsm: Interestingly, this compiles even if f() is not const. gcc may catch this eventually... + assert(idx < s); + return f(idx); + } +}; + +// Convenience creation functions +template +LambdaStat* makeLambdaStat(F f) { return new LambdaStat(f); } + +template +LambdaVectorStat* makeLambdaVectorStat(F f, uint32_t size) { return new LambdaVectorStat(f, size); } + +//Stat Backends declarations. + +class StatsBackend : public GlobAlloc { + public: + StatsBackend() {} + virtual ~StatsBackend() {} + virtual void dump(bool buffered)=0; +}; + + +class TextBackendImpl; + +class TextBackend : public StatsBackend { + private: + TextBackendImpl* backend; + + public: + TextBackend(const char* filename, AggregateStat* rootStat); + virtual void dump(bool buffered); +}; + + +class HDF5BackendImpl; + +class HDF5Backend : public StatsBackend { + private: + HDF5BackendImpl* backend; + + public: + HDF5Backend(const char* filename, AggregateStat* rootStat, size_t bytesPerWrite, bool skipVectors, bool sumRegularAggregates); + virtual void dump(bool buffered); +}; + +#endif // STATS_H_ diff --git a/src/stats_filter.cpp b/src/stats_filter.cpp new file mode 100644 index 00000000..91fc1dbd --- /dev/null +++ b/src/stats_filter.cpp @@ -0,0 +1,65 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "stats_filter.h" +#include +#include +#include + +using std::regex; using std::regex_match; using std::string; using std::vector; + +// FilterStats operates recursively, building up a filtered hierarchy of aggregates + +AggregateStat* FilterStatsLevel(const AggregateStat* src, const regex& filter, const char* prefix) { + string base = prefix? (string(prefix) + src->name() + ".") : ""; //if NULL prefix, omit our name (we're root) + vector children; + for (uint32_t i = 0; i < src->size(); i++) { + Stat* child = src->get(i); + if (AggregateStat* as = dynamic_cast(child)) { + AggregateStat* fs = FilterStatsLevel(as, filter, base.c_str()); + if (fs) children.push_back(fs); + } else { + string name = base + child->name(); + if (regex_match(name, filter)) children.push_back(child); + } + } + + if (children.size()) { + AggregateStat* res = new AggregateStat(src->isRegular()); + res->init(src->name(), src->desc()); + for (Stat* c : children) res->append(c); + return res; + } else { + return NULL; + } +} + +AggregateStat* FilterStats(const AggregateStat* rootStat, const char* regexStr) { + regex filter(regexStr); + AggregateStat* res = FilterStatsLevel(rootStat, filter, NULL /*root*/); + if (res) res->makeImmutable(); + return res; +} + diff --git a/src/stats_filter.h b/src/stats_filter.h new file mode 100644 index 00000000..6712453d --- /dev/null +++ b/src/stats_filter.h @@ -0,0 +1,38 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef STATS_FILTER_H_ +#define STATS_FILTER_H_ + +#include "stats.h" + +/* Produces a filtered stats tree, where only the base stats whose names match the regex are retained. + * Base stats are NOT copied, they are either kept or ommitted. Aggregate stats are created as needed. 
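+ * For example (illustrative), FilterStats(rootStat, "sched\\..*") keeps just the scheduler
+ * subtree, since leaves are matched by their dot-joined path (e.g. "sched.thCr"), with the
+ * root aggregate's own name omitted from the path.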
+ * The returned tree can be passed to any backend to produce filtered dumps. Returns NULL if nothing + * matches the regex. + */ +AggregateStat* FilterStats(const AggregateStat* srcStat, const char* regex); + +#endif // STATS_FILTER_H_ diff --git a/src/str.h b/src/str.h new file mode 100644 index 00000000..23d12915 --- /dev/null +++ b/src/str.h @@ -0,0 +1,49 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef STR_H_ +#define STR_H_ + +/* Turn anything stringstream can grok into a string */ + +#include +#include +#include + +template std::string Str(T v) { + std::stringstream ss; + ss << v; + return ss.str(); +} + +template std::string Str(const std::vector& v) { + std::stringstream ss; + ss << "["; + for (auto& x : v) ss << " " << x; + ss << "]"; + return ss.str(); +} + +#endif // STR_H_ diff --git a/src/text_stats.cpp b/src/text_stats.cpp new file mode 100644 index 00000000..198227d6 --- /dev/null +++ b/src/text_stats.cpp @@ -0,0 +1,94 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#include +#include +#include "galloc.h" +#include "log.h" +#include "stats.h" +#include "zsim.h" + +using std::endl; + +class TextBackendImpl : public GlobAlloc { + private: + const char* filename; + AggregateStat* rootStat; + + void dumpStat(Stat* s, uint32_t level, std::ofstream* out) { + for (uint32_t i = 0; i < level; i++) *out << " "; + *out << s->name() << ": "; + if (AggregateStat* as = dynamic_cast(s)) { + *out << "# " << as->desc() << endl; + for (uint32_t i = 0; i < as->size(); i++) { + dumpStat(as->get(i), level+1, out); + } + } else if (Counter* cs = dynamic_cast(s)) { + *out << cs->count() << " # " << cs->desc() << endl; + } else if (ScalarStat* ss = dynamic_cast(s)) { + *out << ss->get() << " # " << ss->desc() << endl; + } else if (VectorStat* vs = dynamic_cast(s)) { + *out << "# " << vs->desc() << endl; + for (uint32_t i = 0; i < vs->size(); i++) { + for (uint32_t j = 0; j < level+1; j++) *out << " "; + if (vs->hasCounterNames()) { + *out << vs->counterName(i) << ": " << vs->count(i) << endl; + } else { + *out << i << ": " << vs->count(i) << endl; + } + } + } else if (ProxyStat* ps = dynamic_cast(s)) { + *out << ps->stat() << " # " << ps->desc() << endl; + } else if (ProxyFuncStat* pfs = dynamic_cast(s)) { + *out << pfs->stat() << " # " << pfs->desc() << endl; + } else { + panic("Unrecognized stat type"); + } + } + + public: + TextBackendImpl(const char* _filename, AggregateStat* _rootStat) : + filename(_filename), rootStat(_rootStat) + { + std::ofstream out(filename, std::ios_base::out); + out << "# zsim stats" << endl; + out << "===" << endl; + } + + void dump(bool buffered) { + std::ofstream out(filename, std::ios_base::app); + dumpStat(rootStat, 0, &out); + out << "===" << endl; + } +}; + +TextBackend::TextBackend(const char* filename, AggregateStat* rootStat) { + backend = new TextBackendImpl(filename, rootStat); +} + +void TextBackend::dump(bool buffered) { + backend->dump(buffered); +} + diff --git a/src/tick_event.h b/src/tick_event.h new file mode 100644 index 00000000..0131d43c --- /dev/null +++ b/src/tick_event.h @@ -0,0 +1,68 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef TICK_EVENT_H_ +#define TICK_EVENT_H_ + +#include "contention_sim.h" +#include "timing_event.h" +#include "zsim.h" + +//FIXME: Rearchitect this SENSIBLY +template +class TickEvent : public TimingEvent, public GlobAlloc { //this one should be allocated from glob mem + private: + T* obj; + bool active; + + public: + TickEvent(T* _obj, int32_t domain) : TimingEvent(0, 0, domain), obj(_obj), active(false) { + setMinStartCycle(0); + } + + void parentDone(uint64_t startCycle) { + panic("This is queued directly"); + } + + void queue(uint64_t startCycle) { + assert(!active); + active = true; + zinfo->contentionSim->enqueueSynced(this, startCycle); + } + + void simulate(uint64_t startCycle) { + uint32_t delay = obj->tick(startCycle); + if (delay) { + requeue(startCycle+delay); + } else { + active = false; + } + } + + using GlobAlloc::operator new; //grrrrrrrrr + using GlobAlloc::operator delete; +}; + +#endif // TICK_EVENT_H_ diff --git a/src/timing_cache.cpp b/src/timing_cache.cpp new file mode 100644 index 00000000..cdc16daf --- /dev/null +++ b/src/timing_cache.cpp @@ -0,0 +1,376 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#include "timing_cache.h" +#include "event_recorder.h" +#include "timing_event.h" +#include "zsim.h" + +// Events +class HitEvent : public TimingEvent { + private: + TimingCache* cache; + + public: + HitEvent(TimingCache* _cache, uint32_t postDelay, int32_t domain) : TimingEvent(0, postDelay, domain), cache(_cache) {} + + void simulate(uint64_t startCycle) { + cache->simulateHit(this, startCycle); + } +}; + + +class MissStartEvent : public TimingEvent { + private: + TimingCache* cache; + public: + uint64_t startCycle; //for profiling purposes + MissStartEvent(TimingCache* _cache, uint32_t postDelay, int32_t domain) : TimingEvent(0, postDelay, domain), cache(_cache) {} + void simulate(uint64_t startCycle) {cache->simulateMissStart(this, startCycle);} +}; + +class MissResponseEvent : public TimingEvent { + private: + TimingCache* cache; + MissStartEvent* mse; + public: + MissResponseEvent(TimingCache* _cache, MissStartEvent* _mse, int32_t domain) : TimingEvent(0, 0, domain), cache(_cache), mse(_mse) {} + void simulate(uint64_t startCycle) {cache->simulateMissResponse(this, startCycle, mse);} +}; + +class MissWritebackEvent : public TimingEvent { + private: + TimingCache* cache; + MissStartEvent* mse; + public: + MissWritebackEvent(TimingCache* _cache, MissStartEvent* _mse, uint32_t postDelay, int32_t domain) : TimingEvent(0, postDelay, domain), cache(_cache), mse(_mse) {} + void simulate(uint64_t startCycle) {cache->simulateMissWriteback(this, startCycle, mse);} +}; + +class ReplAccessEvent : public TimingEvent { + private: + TimingCache* cache; + public: + uint32_t accsLeft; + ReplAccessEvent(TimingCache* _cache, uint32_t _accsLeft, uint32_t preDelay, uint32_t postDelay, int32_t domain) : TimingEvent(preDelay, postDelay, domain), cache(_cache), accsLeft(_accsLeft) {} + void simulate(uint64_t startCycle) {cache->simulateReplAccess(this, startCycle);} +}; + +TimingCache::TimingCache(uint32_t _numLines, CC* _cc, CacheArray* _array, ReplPolicy* _rp, + uint32_t _accLat, uint32_t _invLat, uint32_t mshrs, uint32_t _tagLat, uint32_t _ways, uint32_t _cands, uint32_t _domain, const g_string& _name) + : Cache(_numLines, _cc, _array, _rp, _accLat, _invLat, _name), numMSHRs(mshrs), tagLat(_tagLat), ways(_ways), cands(_cands) +{ + lastFreeCycle = 0; + lastAccCycle = 0; + assert(numMSHRs > 0); + activeMisses = 0; + domain = _domain; + info("%s: mshrs %d domain %d", name.c_str(), numMSHRs, domain); +} + +void TimingCache::initStats(AggregateStat* parentStat) { + AggregateStat* cacheStat = new AggregateStat(); + cacheStat->init(name.c_str(), "Timing cache stats"); + initCacheStats(cacheStat); + + //Stats specific to timing cache + profOccHist.init("occHist", "Occupancy MSHR cycle histogram", numMSHRs+1); + cacheStat->append(&profOccHist); + + profHitLat.init("latHit", "Cumulative latency accesses that hit (demand and non-demand)"); + profMissRespLat.init("latMissResp", "Cumulative latency for miss start to response"); + profMissLat.init("latMiss", "Cumulative latency for miss start to finish (free MSHR)"); + + cacheStat->append(&profHitLat); + cacheStat->append(&profMissRespLat); + cacheStat->append(&profMissLat); + + parentStat->append(cacheStat); +} + +// TODO(dsm): This is copied verbatim from Cache. We should split Cache into different methods, then call those. 
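+// Rough sketch of the weave-phase event graph that access() records below:
+//  - Hit:  a single HitEvent whose postDelay is the full hit latency.
+//  - Miss: MissStartEvent -> (get path: the record from the next level, or a DelayEvent)
+//          -> MissResponseEvent -> MissWritebackEvent, plus an optional eviction path
+//          (writeback record) and, for zcache-like arrays (cands > ways), a chain of
+//          ReplAccessEvents; both of these also end at the MissWritebackEvent.
+//          The TimingRecord's endEvent is the response (MissResponseEvent), not the
+//          writeback; the MSHR is released when the writeback event completes
+//          (see simulateMissWriteback).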
+uint64_t TimingCache::access(MemReq& req) { + EventRecorder* evRec = zinfo->eventRecorders[req.srcId]; + assert_msg(evRec, "TimingCache is not connected to TimingCore"); + uint32_t initialRecords = evRec->numRecords(); + + bool hasWritebackRecord = false; + TimingRecord writebackRecord; + bool hasAccessRecord = false; + TimingRecord accessRecord; + uint64_t evDoneCycle = 0; + + uint64_t respCycle = req.cycle; + bool skipAccess = cc->startAccess(req); //may need to skip access due to races (NOTE: may change req.type!) + if (likely(!skipAccess)) { + bool updateReplacement = (req.type == GETS) || (req.type == GETX); + int32_t lineId = array->lookup(req.lineAddr, &req, updateReplacement); + respCycle += accLat; + + if (lineId == -1 /*&& cc->shouldAllocate(req)*/) { + assert(cc->shouldAllocate(req)); //dsm: for now, we don't deal with non-inclusion in TimingCache + + //Make space for new line + Address wbLineAddr; + lineId = array->preinsert(req.lineAddr, &req, &wbLineAddr); //find the lineId to replace + trace(Cache, "[%s] Evicting 0x%lx", name.c_str(), wbLineAddr); + + //Evictions are not in the critical path in any sane implementation -- we do not include their delays + //NOTE: We might be "evicting" an invalid line for all we know. Coherence controllers will know what to do + evDoneCycle = cc->processEviction(req, wbLineAddr, lineId, respCycle); //if needed, send invalidates/downgrades to lower level, and wb to upper level + + array->postinsert(req.lineAddr, &req, lineId); //do the actual insertion. NOTE: Now we must split insert into a 2-phase thing because cc unlocks us. + + if (evRec->numRecords() > initialRecords) { + assert_msg(evRec->numRecords() == initialRecords + 1, "evRec records on eviction %ld", evRec->numRecords()); + writebackRecord = evRec->getRecord(initialRecords); + hasWritebackRecord = true; + evRec->popRecord(); + } + } + + uint64_t getDoneCycle = respCycle; + respCycle = cc->processAccess(req, lineId, respCycle, &getDoneCycle); + + if (evRec->numRecords() > initialRecords) { + assert_msg(evRec->numRecords() == initialRecords + 1, "evRec records %ld", evRec->numRecords()); + accessRecord = evRec->getRecord(initialRecords); + hasAccessRecord = true; + evRec->popRecord(); + } + + // At this point we have all the info we need to hammer out the timing record + TimingRecord tr = {req.lineAddr << lineBits, req.cycle, respCycle, req.type, NULL, NULL}; //note the end event is the response, not the wback + + if (getDoneCycle - req.cycle == accLat) { + // Hit + assert(!hasWritebackRecord); + assert(!hasAccessRecord); + uint64_t hitLat = respCycle - req.cycle; // accLat + invLat + HitEvent* ev = new (evRec) HitEvent(this, hitLat, domain); + ev->setMinStartCycle(req.cycle); + tr.startEvent = tr.endEvent = ev; + } else { + assert_msg(getDoneCycle == respCycle, "gdc %ld rc %ld", getDoneCycle, respCycle); + + // Miss events: + // MissStart (does high-prio lookup) -> getEvent || evictionEvent || replEvent (if needed) -> MissWriteback + + MissStartEvent* mse = new (evRec) MissStartEvent(this, accLat, domain); + MissResponseEvent* mre = new (evRec) MissResponseEvent(this, mse, domain); + MissWritebackEvent* mwe = new (evRec) MissWritebackEvent(this, mse, accLat, domain); + + mse->setMinStartCycle(req.cycle); + mre->setMinStartCycle(getDoneCycle); + mwe->setMinStartCycle(MAX(evDoneCycle, getDoneCycle)); + + // Tie two events to an optional timing record + // TODO: Promote to evRec if this is more generally useful + auto connect = [evRec](const TimingRecord* r, TimingEvent* startEv, 
TimingEvent* endEv, uint64_t startCycle, uint64_t endCycle) { + assert_msg(startCycle <= endCycle, "start > end? %ld %ld", startCycle, endCycle); + if (r) { + assert_msg(startCycle <= r->reqCycle, "%ld / %ld", startCycle, r->reqCycle); + assert_msg(r->respCycle <= endCycle, "%ld %ld %ld %ld", startCycle, r->reqCycle, r->respCycle, endCycle); + uint64_t upLat = r->reqCycle - startCycle; + uint64_t downLat = endCycle - r->respCycle; + + if (upLat) { + DelayEvent* dUp = new (evRec) DelayEvent(upLat); + dUp->setMinStartCycle(startCycle); + startEv->addChild(dUp, evRec)->addChild(r->startEvent, evRec); + } else { + startEv->addChild(r->startEvent, evRec); + } + + if (downLat) { + DelayEvent* dDown = new (evRec) DelayEvent(downLat); + dDown->setMinStartCycle(r->respCycle); + r->endEvent->addChild(dDown, evRec)->addChild(endEv, evRec); + } else { + r->endEvent->addChild(endEv, evRec); + } + } else { + if (startCycle == endCycle) { + startEv->addChild(endEv, evRec); + } else { + DelayEvent* dEv = new (evRec) DelayEvent(endCycle - startCycle); + dEv->setMinStartCycle(startCycle); + startEv->addChild(dEv, evRec)->addChild(endEv, evRec); + } + } + }; + + // Get path + connect(hasAccessRecord? &accessRecord : NULL, mse, mre, req.cycle + accLat, getDoneCycle); + mre->addChild(mwe, evRec); + + // Eviction path + if (evDoneCycle) { + connect(hasWritebackRecord? &writebackRecord : NULL, mse, mwe, req.cycle + accLat, evDoneCycle); + } + + // Replacement path + if (evDoneCycle && cands > ways) { + uint32_t replLookups = (cands + (ways-1))/ways - 1; // e.g., with 4 ways, 5-8 -> 1, 9-12 -> 2, etc. + assert(replLookups); + + uint32_t fringeAccs = ways - 1; + uint32_t accsSoFar = 0; + + TimingEvent* p = mse; + + // Candidate lookup events + while (accsSoFar < replLookups) { + uint32_t preDelay = accsSoFar? 0 : tagLat; + uint32_t postDelay = tagLat - MIN(tagLat - 1, fringeAccs); + uint32_t accs = MIN(fringeAccs, replLookups - accsSoFar); + //info("ReplAccessEvent rl %d fa %d preD %d postD %d accs %d", replLookups, fringeAccs, preDelay, postDelay, accs); + ReplAccessEvent* raEv = new (evRec) ReplAccessEvent(this, accs, preDelay, postDelay, domain); + raEv->setMinStartCycle(req.cycle /*lax...*/); + accsSoFar += accs; + p->addChild(raEv, evRec); + p = raEv; + fringeAccs *= ways - 1; + } + + // Swap events -- typically, one read and one write work for 1-2 swaps. Exact number depends on layout. + ReplAccessEvent* rdEv = new (evRec) ReplAccessEvent(this, 1, tagLat, tagLat, domain); + rdEv->setMinStartCycle(req.cycle /*lax...*/); + ReplAccessEvent* wrEv = new (evRec) ReplAccessEvent(this, 1, 0, 0, domain); + wrEv->setMinStartCycle(req.cycle /*lax...*/); + + p->addChild(rdEv, evRec)->addChild(wrEv, evRec)->addChild(mwe, evRec); + } + + + tr.startEvent = mse; + tr.endEvent = mre; // note the end event is the response, not the wback + } + evRec->pushRecord(tr); + } + + cc->endAccess(req); + + assert_msg(respCycle >= req.cycle, "[%s] resp < req? 0x%lx type %s childState %s, respCycle %ld reqCycle %ld", + name.c_str(), req.lineAddr, AccessTypeName(req.type), MESIStateName(*req.state), respCycle, req.cycle); + return respCycle; +} + + +uint64_t TimingCache::highPrioAccess(uint64_t cycle) { + assert(cycle >= lastFreeCycle); + uint64_t lookupCycle = MAX(cycle, lastAccCycle+1); + if (lastAccCycle < cycle-1) lastFreeCycle = cycle-1; //record last free run + lastAccCycle = lookupCycle; + return lookupCycle; +} + +/* The simple things you see here are complicated, + * I look pretty young but I'm just back-dated... 
+ * + * To make this efficient, we do not want to keep priority queues. Instead, a + * low-priority access is granted if there was a free slot on the *previous* + * cycle. This means that low-prio accesses should be post-dated by 1 cycle. + * This is fine to do, since these accesses are writebacks and non critical + * path accesses. Essentially, we're modeling that we know those accesses one + * cycle in advance. + */ +uint64_t TimingCache::tryLowPrioAccess(uint64_t cycle) { + if (lastAccCycle < cycle-1 || lastFreeCycle == cycle-1) { + lastFreeCycle = 0; + lastAccCycle = MAX(cycle-1, lastAccCycle); + return cycle; + } else { + return 0; + } +} + +void TimingCache::simulateHit(HitEvent* ev, uint64_t cycle) { + if (activeMisses < numMSHRs) { + uint64_t lookupCycle = highPrioAccess(cycle); + profHitLat.inc(lookupCycle-cycle); + ev->done(lookupCycle); // postDelay includes accLat + invalLat + } else { + // queue + ev->hold(); + pendingQueue.push_back(ev); + } +} + +void TimingCache::simulateMissStart(MissStartEvent* ev, uint64_t cycle) { + if (activeMisses < numMSHRs) { + activeMisses++; + profOccHist.transition(activeMisses, cycle); + + ev->startCycle = cycle; + uint64_t lookupCycle = highPrioAccess(cycle); + ev->done(lookupCycle); + } else { + //info("Miss, all MSHRs used, queuing"); + ev->hold(); + pendingQueue.push_back(ev); + } +} + +void TimingCache::simulateMissResponse(MissResponseEvent* ev, uint64_t cycle, MissStartEvent* mse) { + profMissRespLat.inc(cycle - mse->startCycle); + ev->done(cycle); +} + +void TimingCache::simulateMissWriteback(MissWritebackEvent* ev, uint64_t cycle, MissStartEvent* mse) { + uint64_t lookupCycle = tryLowPrioAccess(cycle); + if (lookupCycle) { //success, release MSHR + assert(activeMisses); + profMissLat.inc(cycle - mse->startCycle); + activeMisses--; + profOccHist.transition(activeMisses, lookupCycle); + if (!pendingQueue.empty()) { + //info("XXX %ld elems in pending queue", pendingQueue.size()); + for (TimingEvent* qev : pendingQueue) { + qev->requeue(cycle+1); + } + pendingQueue.clear(); + } + ev->done(cycle); + } else { + ev->requeue(cycle+1); + } +} + +void TimingCache::simulateReplAccess(ReplAccessEvent* ev, uint64_t cycle) { + assert(ev->accsLeft); + uint64_t lookupCycle = tryLowPrioAccess(cycle); + if (lookupCycle) { + ev->accsLeft--; + if (!ev->accsLeft) { + ev->done(cycle); + } else { + ev->requeue(cycle+1); + } + } else { + ev->requeue(cycle+1); + } +} + diff --git a/src/timing_cache.h b/src/timing_cache.h new file mode 100644 index 00000000..5cbdc859 --- /dev/null +++ b/src/timing_cache.h @@ -0,0 +1,76 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef TIMING_CACHE_H_ +#define TIMING_CACHE_H_ + +#include "breakdown_stats.h" +#include "cache.h" + +class HitEvent; +class MissStartEvent; +class MissResponseEvent; +class MissWritebackEvent; +class ReplAccessEvent; +class TimingEvent; + +class TimingCache : public Cache { + private: + uint64_t lastAccCycle, lastFreeCycle; + uint32_t numMSHRs, activeMisses; + g_vector pendingQueue; + + // Stats + CycleBreakdownStat profOccHist; + Counter profHitLat, profMissRespLat, profMissLat; + + uint32_t domain; + + // For zcache replacement simulation (pessimistic, assumes we walk the whole tree) + uint32_t tagLat, ways, cands; + + PAD(); + lock_t topLock; + PAD(); + + public: + TimingCache(uint32_t _numLines, CC* _cc, CacheArray* _array, ReplPolicy* _rp, uint32_t _accLat, uint32_t _invLat, uint32_t mshrs, + uint32_t tagLat, uint32_t ways, uint32_t cands, uint32_t _domain, const g_string& _name); + void initStats(AggregateStat* parentStat); + + uint64_t access(MemReq& req); + + void simulateHit(HitEvent* ev, uint64_t cycle); + void simulateMissStart(MissStartEvent* ev, uint64_t cycle); + void simulateMissResponse(MissResponseEvent* ev, uint64_t cycle, MissStartEvent* mse); + void simulateMissWriteback(MissWritebackEvent* ev, uint64_t cycle, MissStartEvent* mse); + void simulateReplAccess(ReplAccessEvent* ev, uint64_t cycle); + + private: + uint64_t highPrioAccess(uint64_t cycle); + uint64_t tryLowPrioAccess(uint64_t cycle); +}; + +#endif // TIMING_CACHE_H_ diff --git a/src/timing_core.cpp b/src/timing_core.cpp new file mode 100644 index 00000000..3da11168 --- /dev/null +++ b/src/timing_core.cpp @@ -0,0 +1,136 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "timing_core.h" +#include "filter_cache.h" +#include "zsim.h" + +#define DEBUG_MSG(args...) +//#define DEBUG_MSG(args...) 
info(args) + +TimingCore::TimingCore(FilterCache* _l1i, FilterCache* _l1d, uint32_t _domain, g_string& _name) + : Core(_name), l1i(_l1i), l1d(_l1d), instrs(0), curCycle(0), cRec(_domain, _name) {} + +uint64_t TimingCore::getPhaseCycles() const { + return curCycle % zinfo->phaseLength; +} + +void TimingCore::initStats(AggregateStat* parentStat) { + AggregateStat* coreStat = new AggregateStat(); + coreStat->init(name.c_str(), "Core stats"); + + auto x = [this]() { return cRec.getUnhaltedCycles(curCycle); }; + LambdaStat* cyclesStat = new LambdaStat(x); + cyclesStat->init("cycles", "Simulated unhalted cycles"); + coreStat->append(cyclesStat); + + auto y = [this]() { return cRec.getContentionCycles(); }; + LambdaStat* cCyclesStat = new LambdaStat(y); + cCyclesStat->init("cCycles", "Cycles due to contention stalls"); + coreStat->append(cCyclesStat); + + ProxyStat* instrsStat = new ProxyStat(); + instrsStat->init("instrs", "Simulated instructions", &instrs); + coreStat->append(instrsStat); + + parentStat->append(coreStat); +} + + +void TimingCore::contextSwitch(int32_t gid) { + if (gid == -1) { + l1i->contextSwitch(); + l1d->contextSwitch(); + } +} + +void TimingCore::join() { + DEBUG_MSG("[%s] Joining, curCycle %ld phaseEnd %ld", name.c_str(), curCycle, phaseEndCycle); + curCycle = cRec.notifyJoin(curCycle); + phaseEndCycle = zinfo->globPhaseCycles + zinfo->phaseLength; + DEBUG_MSG("[%s] Joined, curCycle %ld phaseEnd %ld", name.c_str(), curCycle, phaseEndCycle); +} + +void TimingCore::leave() { + cRec.notifyLeave(curCycle); +} + +void TimingCore::loadAndRecord(Address addr) { + uint64_t startCycle = curCycle; + curCycle = l1d->load(addr, curCycle); + cRec.record(startCycle); +} + +void TimingCore::storeAndRecord(Address addr) { + uint64_t startCycle = curCycle; + curCycle = l1d->store(addr, curCycle); + cRec.record(startCycle); +} + +void TimingCore::bblAndRecord(Address bblAddr, BblInfo* bblInfo) { + instrs += bblInfo->instrs; + curCycle += bblInfo->instrs; + + Address endBblAddr = bblAddr + bblInfo->bytes; + for (Address fetchAddr = bblAddr; fetchAddr < endBblAddr; fetchAddr+=(1 << lineBits)) { + uint64_t startCycle = curCycle; + curCycle = l1i->load(fetchAddr, curCycle); + cRec.record(startCycle); + } +} + + +InstrFuncPtrs TimingCore::GetFuncPtrs() { + return {LoadAndRecordFunc, StoreAndRecordFunc, BblAndRecordFunc, BranchFunc, PredLoadAndRecordFunc, PredStoreAndRecordFunc, FPTR_ANALYSIS, {0}}; +} + +void TimingCore::LoadAndRecordFunc(THREADID tid, ADDRINT addr) { + static_cast(cores[tid])->loadAndRecord(addr); +} + +void TimingCore::StoreAndRecordFunc(THREADID tid, ADDRINT addr) { + static_cast(cores[tid])->storeAndRecord(addr); +} + +void TimingCore::BblAndRecordFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + TimingCore* core = static_cast(cores[tid]); + core->bblAndRecord(bblAddr, bblInfo); + + while (core->curCycle > core->phaseEndCycle) { + core->phaseEndCycle += zinfo->phaseLength; + uint32_t cid = getCid(tid); + uint32_t newCid = TakeBarrier(tid, cid); + if (newCid != cid) break; /*context-switch*/ + } +} + +void TimingCore::PredLoadAndRecordFunc(THREADID tid, ADDRINT addr, BOOL pred) { + if (pred) static_cast(cores[tid])->loadAndRecord(addr); +} + +void TimingCore::PredStoreAndRecordFunc(THREADID tid, ADDRINT addr, BOOL pred) { + if (pred) static_cast(cores[tid])->storeAndRecord(addr); +} + diff --git a/src/timing_core.h b/src/timing_core.h new file mode 100644 index 00000000..6641cb37 --- /dev/null +++ b/src/timing_core.h @@ -0,0 +1,83 @@ +/** $lic$ + * Copyright (C) 
2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef TIMING_CORE_H_ +#define TIMING_CORE_H_ + +#include "core.h" +#include "core_recorder.h" +#include "event_recorder.h" +#include "memory_hierarchy.h" +#include "pad.h" + +class FilterCache; + +class TimingCore : public Core { + private: + FilterCache* l1i; + FilterCache* l1d; + + uint64_t instrs; + + uint64_t curCycle; //phase 1 clock + uint64_t phaseEndCycle; //phase 1 end clock + + CoreRecorder cRec; + + public: + TimingCore(FilterCache* _l1i, FilterCache* _l1d, uint32_t domain, g_string& _name); + void initStats(AggregateStat* parentStat); + + uint64_t getInstrs() const {return instrs;} + uint64_t getPhaseCycles() const; + uint64_t getCycles() const {return cRec.getUnhaltedCycles(curCycle);} + + void contextSwitch(int32_t gid); + virtual void join(); + virtual void leave(); + + InstrFuncPtrs GetFuncPtrs(); + + //Contention simulation interface + inline EventRecorder* getEventRecorder() {return cRec.getEventRecorder();} + void cSimStart() {curCycle = cRec.cSimStart(curCycle);} + void cSimEnd() {curCycle = cRec.cSimEnd(curCycle);} + + private: + inline void loadAndRecord(Address addr); + inline void storeAndRecord(Address addr); + inline void bblAndRecord(Address bblAddr, BblInfo* bblInstrs); + inline void record(uint64_t startCycle); + + static void LoadAndRecordFunc(THREADID tid, ADDRINT addr); + static void StoreAndRecordFunc(THREADID tid, ADDRINT addr); + static void BblAndRecordFunc(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo); + static void PredLoadAndRecordFunc(THREADID tid, ADDRINT addr, BOOL pred); + static void PredStoreAndRecordFunc(THREADID tid, ADDRINT addr, BOOL pred); + + static void BranchFunc(THREADID, ADDRINT, BOOL, ADDRINT, ADDRINT) {} +} ATTR_LINE_ALIGNED; + +#endif // TIMING_CORE_H_ diff --git a/src/timing_event.cpp b/src/timing_event.cpp new file mode 100644 index 00000000..2ebdb653 --- /dev/null +++ b/src/timing_event.cpp @@ -0,0 +1,175 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "timing_event.h" +#include +#include +#include "contention_sim.h" +#include "zsim.h" + +/* TimingEvent */ + +void TimingEvent::parentDone(uint64_t startCycle) { + cycle = MAX(cycle, startCycle); + assert(numParents); + numParents--; + if (!numParents) { + assert(state == EV_NONE); + state = EV_QUEUED; + zinfo->contentionSim->enqueue(this, cycle+preDelay); + } +} + +void TimingEvent::queue(uint64_t nextCycle) { + assert(state == EV_NONE && numParents == 0); + state = EV_QUEUED; + zinfo->contentionSim->enqueueSynced(this, nextCycle); +} + +void TimingEvent::requeue(uint64_t nextCycle) { + assert(numParents == 0); + assert(state == EV_RUNNING || state == EV_HELD); + state = EV_QUEUED; + zinfo->contentionSim->enqueue(this, nextCycle); +} + +void TimingEvent::produceCrossings(EventRecorder* evRec) { + assert(domain != -1); + //assert(dynamic_cast(this) == NULL); //careful, expensive... + auto pcLambda = [this, evRec](TimingEvent** childPtr) { + TimingEvent* c = *childPtr; + if (c->domain != domain) *childPtr = handleCrossing(c, evRec, true); + c->produceCrossings(evRec); + }; + visitChildren< decltype(pcLambda) > (pcLambda); +} + +TimingEvent* TimingEvent::handleCrossing(TimingEvent* childEv, EventRecorder* evRec, bool unlinkChild) { + if (unlinkChild) { + assert_msg(childEv->numParents, "child has %d parents, nonzero expected", childEv->numParents); + childEv->numParents--; + } + assert_msg(minStartCycle != ((uint64_t)-1L), "Crossing domain (%d -> %d), but parent's minStartCycle is not set (my class: %s)", + domain, childEv->domain, typeid(*this).name()); //we can only handle a crossing if this has been set + CrossingEvent* xe = new (evRec) CrossingEvent(this, childEv, minStartCycle+postDelay, evRec); + return xe->getSrcDomainEvent(); +} + +void TimingEvent::checkDomain(TimingEvent* ch) { + //dynamic_cast takes a while, so let's just punt on this now that it's correct + //assert(domain == ch->domain || dynamic_cast(ch)); +} + + +/* CrossingEvent */ + +CrossingEvent::CrossingEvent(TimingEvent* parent, TimingEvent* child, uint64_t _minStartCycle, EventRecorder* _evRec) + : TimingEvent(0, 0, child->domain), cpe(this, parent->domain) +{ + assert(parent->domain != child->domain); + parentEv = parent; + evRec = _evRec; + srcDomain = parent->domain; + assert(srcDomain >= 0); + simCount = 0; + called = false; + addChild(child, evRec); + doneCycle = 0; + + //Delay stealing + preSlack = parent->postDelay; + postSlack = child->preDelay; + + //assert(preSlack > 0); + if (preSlack == 0) { + //warn("%ld: No preSlack", _minStartCycle); + preSlack = 1; + _minStartCycle++; + } + + minStartCycle = _minStartCycle; + origStartCycle = minStartCycle - evRec->getGapCycles(); + //queue(MAX(zinfo->contentionSim->getLastLimit(), minStartCycle)); //this initial queue always works --- 0 parents + 
//childCrossing = NULL; + zinfo->contentionSim->enqueueCrossing(this, MAX(zinfo->contentionSim->getLastLimit(), minStartCycle), evRec->getSourceId(), srcDomain, child->domain, evRec); +} + +void CrossingEvent::markSrcEventDone(uint64_t cycle) { + assert(!called); + //Sanity check + srcDomainCycleAtDone = zinfo->contentionSim->getCurCycle(srcDomain); + assert(cycle >= srcDomainCycleAtDone); + //NOTE: No fencing needed; TSO ensures writes to doneCycle and callled happen in order. + doneCycle = cycle; + called = true; + //Also, no fencing needed after. +} + +void CrossingEvent::parentDone(uint64_t startCycle) { + //We don't pad chained crossings with delays; just make sure we don't enqueue ourselves before minStartCycle + uint64_t cycle = MAX(startCycle, minStartCycle); + if (called) { + if (doneCycle < cycle) { + //warn("Crossing enqueued too late, doneCycle %ld startCycle %ld minStartCycle %ld cycle %ld", doneCycle, startCycle, minStartCycle, cycle); + doneCycle = cycle; + } + //assert_msg(doneCycle >= cycle, "Crossing enqueued too late, doneCycle %ld startCycle %ld minStartCycle %ld cycle %ld", doneCycle, startCycle, minStartCycle, cycle); + } + TimingEvent::parentDone(cycle); +} + +void CrossingEvent::simulate(uint64_t simCycle) { + if (!called) { + uint64_t curSrcCycle = zinfo->contentionSim->getCurCycle(srcDomain) + preSlack + postSlack; + //uint64_t coreRelCycle = 0; //evRec->getSlack(origStartCycle) + postSlack; //note we do not add preDelay, because minStartCycle already has it + uint64_t coreRelCycle = evRec->getSlack(origStartCycle) + postSlack; //note we do not add preDelay, because minStartCycle already has it + uint64_t nextCycle = MAX(coreRelCycle, MAX(curSrcCycle, simCycle)); + + __sync_synchronize(); //not needed --- these are all volatile, and by TSO, if we see a cycle > doneCycle, by force we must see doneCycle set + if (!called) { //have to check again, AFTER reading the cycles! Otherwise, we have a race + zinfo->contentionSim->setPrio(domain, (nextCycle == simCycle)? 1 : 2); + +#if PROFILE_CROSSINGS + simCount++; +#endif + numParents = 0; //HACK + requeue(nextCycle); + return; + } + } + + //Runs if called + //assert_msg(simCycle <= doneCycle+preSlack+postSlack+1, "simCycle %ld doneCycle %ld, preSlack %d postSlack %d simCount %ld child %s", simCycle, doneCycle, preSlack, postSlack, simCount, typeid(*child).name()); + zinfo->contentionSim->setPrio(domain, 0); + +#if PROFILE_CROSSINGS + zinfo->contentionSim->profileCrossing(srcDomain, domain, simCount); +#endif + + uint64_t dCycle = MAX(simCycle, doneCycle); + //info("Crossing %d->%d done %ld", srcDomain, domain, dCycle); + done(dCycle); +} + diff --git a/src/timing_event.h b/src/timing_event.h new file mode 100644 index 00000000..5ba4a52e --- /dev/null +++ b/src/timing_event.h @@ -0,0 +1,341 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. 
+ * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef TIMING_EVENT_H_ +#define TIMING_EVENT_H_ + +#include +#include +#include +#include "bithacks.h" +#include "event_recorder.h" +#include "galloc.h" + +#define TIMING_BLOCK_EVENTS 3 +struct TimingEventBlock { + TimingEvent* events[TIMING_BLOCK_EVENTS]; + TimingEventBlock* next; + + TimingEventBlock() { + for (uint32_t i = 0; i < TIMING_BLOCK_EVENTS; i++) events[i] = NULL; + next = NULL; + } + + void* operator new (size_t sz, EventRecorder* evRec) { + return evRec->alloc(sz); + } + + void operator delete(void*, size_t) { + panic("TimingEventBlock::delete should never be called"); + } + + //Placement delete... make ICC happy. This would only fire on an exception + void operator delete (void* p, EventRecorder* evRec) { + panic("TimingEventBlock::delete PLACEMENT delete called"); + } + + private: + void* operator new (size_t); +}; + +enum EventState {EV_NONE, EV_QUEUED, EV_RUNNING, EV_HELD, EV_DONE}; + +class CrossingEvent; + +class TimingEvent { + private: + uint64_t privCycle; //only touched by ContentionSim + + public: + TimingEvent* next; //used by PrioQueue --- PRIVATE + + private: + EventState state; + uint64_t cycle; + + uint64_t minStartCycle; + union { + TimingEvent* child; + TimingEventBlock* children; + }; + int32_t domain; //-1 if none; if none, it acquires it from the parent. Cannot be a starting event (no parents at enqueue time) and get -1 as domain + uint32_t numChildren; + uint32_t numParents; + uint32_t preDelay; + uint32_t postDelay; //we could get by with one delay, but pre/post makes it easier to code + + public: + TimingEvent(uint32_t _preDelay, uint32_t _postDelay, int32_t _domain = -1) : next(NULL), state(EV_NONE), cycle(0), minStartCycle(-1L), child(NULL), + domain(_domain), numChildren(0), numParents(0), preDelay(_preDelay), postDelay(_postDelay) {} + explicit TimingEvent(int32_t _domain = -1) : next(NULL), state(EV_NONE), minStartCycle(-1L), child(NULL), + domain(_domain), numChildren(0), numParents(0), preDelay(0), postDelay(0) {} //no delegating constructors until gcc 4.7... 
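+    // Typical usage (see TimingCache::access for a full example): events are allocated with
+    // placement new on an EventRecorder, wired into a DAG with addChild(), and given a lower
+    // bound on their start cycle with setMinStartCycle(). addChild() returns the child, so
+    // chains can be written in one expression, e.g. (illustrative names only):
+    //   startEv->addChild(new (evRec) DelayEvent(lat), evRec)->addChild(endEv, evRec);
+    // Subclasses implement simulate(), which must leave the event done (done()), re-queued
+    // (requeue()), or held (hold()) --- see run() below.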
+ + inline uint32_t getDomain() const {return domain;} + inline uint32_t getNumChildren() const {return numChildren;} + inline uint32_t getPreDelay() const {return preDelay;} + inline uint32_t getPostDelay() const {return postDelay;} + + inline void setPreDelay(uint32_t d) {preDelay = d;} + inline void setPostDelay(uint32_t d) {postDelay = d;} + + inline uint64_t getMinStartCycle() const {return minStartCycle;} + inline void setMinStartCycle(uint64_t c) {minStartCycle = c;} + + TimingEvent* addChild(TimingEvent* childEv, EventRecorder* evRec) { + assert_msg(state == EV_NONE || state == EV_QUEUED, "adding child in invalid state %d %s -> %s", state, typeid(*this).name(), typeid(*childEv).name()); //either not scheduled or not executed yet + assert(childEv->state == EV_NONE); + + TimingEvent* res = childEv; + + if (numChildren == 0) { + numChildren = 1; + child = childEv; + } else if (numChildren == 1) { + TimingEvent* firstChild = child; + children = new (evRec) TimingEventBlock(); + children->events[0] = firstChild; + children->events[1] = childEv; + numChildren = 2; + } else { + uint32_t idx = numChildren % TIMING_BLOCK_EVENTS; + if (idx == 0) { + TimingEventBlock* tmp = children; + children = new (evRec) TimingEventBlock(); + children->next = tmp; + } + children->events[idx] = childEv; + numChildren++; + } + + if (domain != -1 && childEv->domain == -1) { + childEv->propagateDomain(domain); + } + + childEv->numParents++; + return res; //useful for chaining + } + + TimingEvent* addChild(TimingEvent* childEv, EventRecorder& evRec) { + return addChild(childEv, &evRec); + } + + virtual void parentDone(uint64_t startCycle); // see cpp + + //queue for the first time + //always happens on PHASE 1 (bound), and is synchronized + void queue(uint64_t qCycle); //see cpp + + //mark an already-dequeued event for reexecution (simulate will be called again at the specified cycle) + //always happens on PHASE 2 (weave), and is unsynchronized + void requeue(uint64_t cycle); //see cpp + + virtual void simulate(uint64_t startCycle) = 0; + + inline void run(uint64_t startCycle) { + assert(this); + assert_msg(state == EV_NONE || state == EV_QUEUED, "state %d expected %d (%s)", state, EV_QUEUED, typeid(*this).name()); + state = EV_RUNNING; + assert_msg(startCycle >= minStartCycle, "startCycle %ld < minStartCycle %ld (%s), preDelay %d postDelay %d numChildren %d str %s", + startCycle, minStartCycle, typeid(*this).name(), preDelay, postDelay, numChildren, str().c_str()); + simulate(startCycle); + assert_msg(state == EV_DONE || state == EV_QUEUED || state == EV_HELD, "post-sim state %d (%s)", state, typeid(*this).name()); + } + + // Used when an external, event-driven object takes control of the object --- it becomes queued, but externally + inline void hold() { + assert_msg(state == EV_RUNNING, "called hold() with state %d", state); + state = EV_HELD; + } + + inline void release() { + assert_msg(state == EV_HELD, "state should be %d, %d instead", EV_HELD, state); + state = EV_RUNNING; + } + + void done(uint64_t doneCycle) { + assert(state == EV_RUNNING); //ContentionSim sets it when calling simulate() + state = EV_DONE; + auto vLambda = [this, doneCycle](TimingEvent** childPtr) { + checkDomain(*childPtr); + (*childPtr)->parentDone(doneCycle+postDelay); + }; + visitChildren< decltype(vLambda) >(vLambda); + } + + void produceCrossings(EventRecorder* evRec); + + void* operator new (size_t sz, EventRecorder* evRec) { + return evRec->alloc(sz); + } + + void* operator new (size_t sz, EventRecorder& evRec) { + return 
evRec.alloc(sz); + } + + void operator delete(void*, size_t) { + panic("TimingEvent::delete should never be called"); + } + + //Placement deletes... make ICC happy. This would only fire on an exception + void operator delete (void* p, EventRecorder* evRec) { + panic("TimingEvent::delete PLACEMENT delete called"); + } + void operator delete (void* p, EventRecorder& evRec) { + panic("TimingEvent::delete PLACEMENT delete called"); + } + + //Describe yourself, useful for debugging + virtual std::string str() { std::string res; return res; } + + private: + void* operator new (size_t); + + void propagateDomain(int32_t dom) { + assert(domain == -1); + domain = dom; + auto vLambda = [this](TimingEvent** childPtr) { + TimingEvent* child = *childPtr; + if (child->domain == -1) child->propagateDomain(domain); + }; + visitChildren< decltype(vLambda) >(vLambda); + } + + template //F has to be decltype(f) + inline void visitChildren(F f) { + if (numChildren == 0) return; + //info("visit %p nc %d", this, numChildren); + if (numChildren == 1) { + f(&child); + } else { + TimingEventBlock* curBlock = children; + while (curBlock) { + for (uint32_t i = 0; i < TIMING_BLOCK_EVENTS; i++) { + //info("visit %p i %d %p", this, i, curBlock->events[i]); + if (!curBlock->events[i]) {break;} + //info("visit %p i %d %p PASS", this, i, curBlock->events[i]); + f(&(curBlock->events[i])); + } + curBlock = curBlock->next; + } + //info("visit %p multi done", this); + } + } + + TimingEvent* handleCrossing(TimingEvent* child, EventRecorder* evRec, bool unlinkChild); + + void checkDomain(TimingEvent* ch); + + protected: + + // If an event is externally handled, and has no parents or children, + // it can call this at initialization to always be between RUNNING and + // QUEUED (through requeue()) + void setRunning() { + assert(state == EV_NONE); + state = EV_RUNNING; + } + + + friend class ContentionSim; + friend class DelayEvent; //DelayEvent is, for now, the only child of TimingEvent that should do anything other than implement simulate + friend class CrossingEvent; +}; + +class DelayEvent : public TimingEvent { + public: + explicit DelayEvent(uint32_t delay) : TimingEvent(delay, 0) {} + + virtual void parentDone(uint64_t startCycle) { + cycle = MAX(cycle, startCycle); + numParents--; + if (!numParents) { + uint64_t doneCycle = cycle + preDelay; + state = EV_RUNNING; + done(doneCycle); + } + } + + virtual void simulate(uint64_t simCycle) { + panic("DelayEvent::simulate() was called --- DelayEvent wakes its children directly"); + } +}; + +class CrossingEvent : public TimingEvent { + private: + uint32_t srcDomain; + volatile bool called; + volatile uint64_t doneCycle; + volatile uint64_t srcDomainCycleAtDone; + EventRecorder* evRec; + uint64_t origStartCycle; + uint64_t simCount; + TimingEvent* parentEv; //stored exclusively for resp-req xing chaining + + uint32_t preSlack, postSlack; + + class CrossingSrcEvent : public TimingEvent { + private: + CrossingEvent* ce; + public: + CrossingSrcEvent(CrossingEvent* _ce, uint32_t dom) : TimingEvent(0, 0, dom), ce(_ce) { + //These are never connected to anything, but substitute an existing event; so, this never gets + //numParents incremented, but we set it to 1 to maintain semantics in case we have a walk + assert(numParents == 0); + numParents = 1; + } + + virtual void parentDone(uint64_t startCycle) { + assert_msg(numParents == 1, "CSE: numParents %d", numParents); + numParents = 0; + assert(numChildren == 0); + ce->markSrcEventDone(startCycle); + assert(state == EV_NONE); + state = 
EV_DONE; + } + + virtual void simulate(uint64_t simCycle) { + panic("DelayEvent::simulate() called"); + } + }; + + CrossingSrcEvent cpe; + + public: + CrossingEvent(TimingEvent* parent, TimingEvent* child, uint64_t _minStartCycle, EventRecorder* _evRec); + + TimingEvent* getSrcDomainEvent() {return &cpe;} + + virtual void parentDone(uint64_t startCycle); + + virtual void simulate(uint64_t simCycle); + + private: + void markSrcEventDone(uint64_t cycle); + + friend class ContentionSim; +}; + + +#endif // TIMING_EVENT_H_ diff --git a/src/utility_monitor.cpp b/src/utility_monitor.cpp new file mode 100644 index 00000000..0ec5b302 --- /dev/null +++ b/src/utility_monitor.cpp @@ -0,0 +1,144 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "utility_monitor.h" +#include "hash.h" + +#define DEBUG_UMON 0 +//#define DEBUG_UMON 1 + +UMon::UMon(uint32_t _bankLines, uint32_t _umonLines, uint32_t _buckets) { + umonLines = _umonLines; + buckets = _buckets; + samplingFactor = _bankLines/umonLines; + sets = umonLines/buckets; + + heads = gm_calloc(sets); + array = gm_calloc(sets); + for (uint32_t i = 0; i < sets; i++) { + array[i] = gm_calloc(buckets); + heads[i] = &array[i][0]; + for (uint32_t j = 0; j < buckets-1; j++) { + array[i][j].next = &array[i][j+1]; + } + } + + curWayHits = gm_calloc(buckets); + curMisses = 0; + + hf = new H3HashFamily(2, 32, 0xF000BAAD); + + samplingFactorBits = 0; + uint32_t tmp = samplingFactor; + while (tmp >>= 1) samplingFactorBits++; + + setsBits = 0; + tmp = sets; + while (tmp >>= 1) setsBits++; +} + +void UMon::initStats(AggregateStat* parentStat) { + profWayHits.init("hits", "Sampled hits per bucket", buckets); parentStat->append(&profWayHits); + profMisses.init("misses", "Sampled misses"); parentStat->append(&profMisses); +} + + +void UMon::access(Address lineAddr) { + //1. Hash to decide if it should go in the cache + uint64_t sampleMask = ~(((uint64_t)-1LL) << samplingFactorBits); + uint64_t sampleSel = (hf->hash(0, lineAddr)) & sampleMask; + + //info("0x%lx 0x%lx", sampleMask, sampleSel); + + if (sampleSel != 0) { + return; + } + + //2. Insert; hit or miss? 
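+    // Like sampleMask above, setMask keeps the low setsBits bits of the hash. For example
+    // (illustrative sizing), with sets == 16 the constructor computes setsBits == 4, so
+    // setMask == 0xF and 'set' below is hash(1, lineAddr) & 0xF, i.e., one of 16 monitor sets.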
+ uint64_t setMask = ~(((uint64_t)-1LL) << setsBits); + uint64_t set = (hf->hash(1, lineAddr)) & setMask; + + // Check hit + Node* prev = NULL; + Node* cur = heads[set]; + bool hit = false; + for (uint32_t b = 0; b < buckets; b++) { + if (cur->addr == lineAddr) { //Hit at position b, profile + //profHits.inc(); + //profWayHits.inc(b); + curWayHits[b]++; + hit = true; + break; + } else if (b < buckets-1) { + prev = cur; + cur = cur->next; + } + } + + //Profile miss, kick cur out, put lineAddr in + if (!hit) { + curMisses++; + //profMisses.inc(); + assert(cur->next == NULL); + cur->addr = lineAddr; + } + + //Move cur to MRU (happens regardless of whether this is a hit or a miss) + if (prev) { + prev->next = cur->next; + cur->next = heads[set]; + heads[set] = cur; + } +} + +uint64_t UMon::getNumAccesses() const { + uint64_t total = curMisses; + for (uint32_t i = 0; i < buckets; i++) { + total += curWayHits[buckets - i - 1]; + } + return total; +} + +void UMon::getMisses(uint64_t* misses) { + uint64_t total = curMisses; + for (uint32_t i = 0; i < buckets; i++) { + misses[buckets - i] = total; + total += curWayHits[buckets - i - 1]; + } + misses[0] = total; +#if DEBUG_UMON + info("UMON miss utility curve:"); + for (uint32_t i = 0; i <= buckets; i++) info(" misses[%d] = %ld", i, misses[i]); +#endif +} + + +void UMon::startNextInterval() { +curMisses = 0; + for (uint32_t b = 0; b < buckets; b++) { + curWayHits[b] = 0; + } +} + diff --git a/src/utility_monitor.h b/src/utility_monitor.h new file mode 100644 index 00000000..158dedde --- /dev/null +++ b/src/utility_monitor.h @@ -0,0 +1,81 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef UTILITY_MONITOR_H_ +#define UTILITY_MONITOR_H_ + +#include "galloc.h" +#include "memory_hierarchy.h" +#include "stats.h" + +//Print some information regarding utility monitors and partitioning +#define UMON_INFO 0 +//#define UMON_INFO 1 + +class HashFamily; + +class UMon : public GlobAlloc { + private: + uint32_t umonLines; + uint32_t samplingFactor; //Size of sampled cache (lines)/size of umon. Should be power of 2 + uint32_t buckets; //umon ways + uint32_t sets; //umon sets. Should be power of 2. 
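+        // Example sizing (illustrative only): a bank with 32K lines monitored with
+        // umonLines == 1024 and buckets == 16 gives samplingFactor == 32 (1 in 32 lines
+        // sampled) and sets == 64; both are powers of 2, as required by the masks below.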
+ + //Used in masks for set indices and sampling factor descisions + uint64_t samplingFactorBits; + uint64_t setsBits; + + uint64_t* curWayHits; + uint64_t curMisses; + + Counter profHits; + Counter profMisses; + VectorCounter profWayHits; + + //Even for high associativity/number of buckets, performance of this is not important because we downsample so much (so this is a LL) + struct Node { + Address addr; + struct Node* next; + }; + Node** array; + Node** heads; + + HashFamily* hf; + + public: + UMon(uint32_t _bankLines, uint32_t _umonLines, uint32_t _buckets); + void initStats(AggregateStat* parentStat); + + void access(Address lineAddr); + + uint64_t getNumAccesses() const; + void getMisses(uint64_t* misses); + void startNextInterval(); + + uint32_t getBuckets() const { return buckets; } +}; + +#endif // UTILITY_MONITOR_H_ + diff --git a/src/virt/common.h b/src/virt/common.h new file mode 100644 index 00000000..d94eb9e9 --- /dev/null +++ b/src/virt/common.h @@ -0,0 +1,67 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef VIRT_COMMON_H_ +#define VIRT_COMMON_H_ + +// Typedefs and common functions for Virt implementation +// This is internal to virt, and should only be included withing virt/ files + +#include +#include "log.h" +#include "pin.H" +#include "virt/virt.h" + +struct PrePatchArgs { + uint32_t tid; + CONTEXT* ctxt; + SYSCALL_STANDARD std; + const char* patchRoot; + bool isNopThread; +}; + +struct PostPatchArgs { + uint32_t tid; + CONTEXT* ctxt; + SYSCALL_STANDARD std; +}; + +typedef std::function PostPatchFn; +typedef PostPatchFn (*PrePatchFn)(PrePatchArgs); + +extern const PostPatchFn NullPostPatch; // defined in virt.cpp + +// PIN_SafeCopy wrapper. We expect the default thing to be correct access +template +static inline bool safeCopy(const T* src, T* dst, const char* file = __FILE__, int line = __LINE__) { + size_t copiedBytes = PIN_SafeCopy(dst, src, sizeof(T)); + if (copiedBytes != sizeof(T)) { + warn("[%d] %s:%d Failed app<->tool copy (%ld/%ld bytes copied)", PIN_ThreadId(), file, line, copiedBytes, sizeof(T)); + return false; + } + return true; +} + +#endif // VIRT_COMMON_H_ diff --git a/src/virt/cpu.cpp b/src/virt/cpu.cpp new file mode 100644 index 00000000..79f9aa43 --- /dev/null +++ b/src/virt/cpu.cpp @@ -0,0 +1,95 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. 
+ * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "cpuenum.h" +#include "log.h" +#include "virt/common.h" + +// SYS_getcpu + +// Call without CPU from vdso, with CPU from syscall version +void VirtGetcpu(uint32_t tid, uint32_t cpu, ADDRINT arg0, ADDRINT arg1) { + unsigned resCpu; + unsigned resNode = 0; + if (!arg0) { + info("getcpu() called with NULL cpu arg"); + } + if (!safeCopy((unsigned*)arg0, &resCpu)) { + info("getcpu() called with invalid cpu arg"); + return; + } + if (arg1 && !safeCopy((unsigned*)arg1, &resNode)) { + info("getcpu() called with invalid node arg"); + return; + } + + trace(TimeVirt, "Patching getcpu()"); + trace(TimeVirt, "Orig cpu %d, node %d, patching core %d / node 0", resCpu, resNode, cpu); + resCpu = cpu; + resNode = 0; + + safeCopy(&resCpu, (unsigned*)arg0); + if (arg1) safeCopy(&resNode, (unsigned*)arg1); +} + +PostPatchFn PatchGetcpu(PrePatchArgs args) { + uint32_t cpu = cpuenumCpu(procIdx, getCid(args.tid)); // still valid, may become invalid when we leave() + assert(cpu != (uint32_t)-1); + return [cpu](PostPatchArgs args) { + trace(TimeVirt, "[%d] Post-patching SYS_getcpu", tid); + ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); + ADDRINT arg1 = PIN_GetSyscallArgument(args.ctxt, args.std, 1); + VirtGetcpu(args.tid, cpu, arg0, arg1); + return PPA_NOTHING; + }; +} + +// Scheduler affinity + +PostPatchFn PatchSchedGetaffinity(PrePatchArgs args) { + return [](PostPatchArgs args) { + uint32_t size = PIN_GetSyscallArgument(args.ctxt, args.std, 1); + cpu_set_t* set = (cpu_set_t*)PIN_GetSyscallArgument(args.ctxt, args.std, 2); + if (set) { //TODO: use SafeCopy, this can still segfault + CPU_ZERO_S(size, set); + std::vector cpumask = cpuenumMask(procIdx); + for (uint32_t i = 0; i < MIN(cpumask.size(), size*8 /*size is in bytes, supports 1 cpu/bit*/); i++) { + if (cpumask[i]) CPU_SET_S(i, (size_t)size, set); + } + } + info("[%d] Post-patching SYS_sched_getaffinity size %d cpuset %p", args.tid, size, set); + return PPA_NOTHING; + }; +} + +PostPatchFn PatchSchedSetaffinity(PrePatchArgs args) { + PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT) SYS_getpid); // squash + return [](PostPatchArgs args) { + PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EPERM); // make it a proper failure + return PPA_NOTHING; + }; +} + diff --git a/src/virt/fs.cpp b/src/virt/fs.cpp new file mode 100644 index 00000000..8e5c5eb6 --- /dev/null +++ b/src/virt/fs.cpp @@ -0,0 +1,141 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. 
+ * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include +#include +#include "process_tree.h" +#include "str.h" +#include "virt/common.h" + +static const char* fakedPaths[] = {"/proc/cpuinfo", "/proc/stat", "/sys"}; + +// SYS_open and SYS_openat; these are ALWAYS patched +PostPatchFn PatchOpen(PrePatchArgs args) { + CONTEXT* ctxt = args.ctxt; + SYSCALL_STANDARD std = args.std; + const char* patchRoot = args.patchRoot; + + uint32_t syscall = PIN_GetSyscallNumber(ctxt, std); + assert(syscall == SYS_open || syscall == SYS_openat); + + if (!patchRoot) return NullPostPatch; // process does not want patched system... + + string fileName; + int pathReg = (syscall == SYS_open)? 0 : 1; + ADDRINT pathArg = PIN_GetSyscallArgument(ctxt, std, pathReg); + if (pathArg) fileName = (const char*) pathArg; // TODO(dsm): SafeCopy + if (syscall == SYS_openat) { + // Get path relative to dirfd's path; if AT_CWDFD, readlink() should fail + int dirfd = PIN_GetSyscallArgument(ctxt, std, 0); + char buf[PATH_MAX+1]; + string fd = "/proc/self/fd/" + Str(dirfd); + int res = readlink(fd.c_str(), buf, PATH_MAX); + if (res > 0) { + buf[res] = '\0'; // argh... readlink does not null-terminate strings! + // Double-check deref'd symlink is valid + char* rp = realpath(buf, NULL); + if (rp) { + fileName = string(buf) + "/" + fileName; + free(rp); + } else { + panic("Not a valid path, but readlink() succeeded! 
%s fd %d res %d", buf, dirfd, res); + } + } + } + + // Canonicalize as much as you can, even if the file does not exist + vector bases; + string cur = fileName; + string absPath; + + while (true) { + char* rp = realpath(cur.c_str(), NULL); + if (rp) { + absPath = rp; // copies + free(rp); + while (bases.size()) { + absPath += "/" + bases.back(); + bases.pop_back(); + } + break; // success + } else { + if (!cur.size()) break; // failed + char* dirc = strdup(cur.c_str()); + char* basec = strdup(cur.c_str()); + char* dname = dirname(dirc); + char* bname = basename(basec); + bases.push_back(bname); + cur = dname; // copies + free(dirc); + free(basec); + } + } + + //info("Canonicalized %s -> %s", fileName.c_str(), absPath.c_str()); + + if (absPath.size()) { + bool match = false; + for (uint32_t i = 0; i < sizeof(fakedPaths)/sizeof(const char*); i++) { + uint32_t diff = strncmp(absPath.c_str(), fakedPaths[i], strlen(fakedPaths[i])); + if (!diff) { + match = true; + break; + } + } + + if (match) { + std::string patchPath = patchRoot; + patchPath += absPath; + + bool patch = true; + //Try to open the patched file to see if it exists + //NOTE: We now rely on always patching; uncomment to do selectively, but this leaks info + //FILE * patchedFd = fopen(patchPath.c_str(), "r"); + //if (patchedFd) fclose(patchedFd); else patch = false; + if (patch) { + char* patchPathMem = strdup(patchPath.c_str()); // in heap + info("Patched SYS_open, original %s, patched %s", fileName.c_str(), patchPathMem); + PIN_SetSyscallArgument(ctxt, std, pathReg, (ADDRINT) patchPathMem); + + // Restore old path on syscall exit + return [pathReg, pathArg, patchPathMem](PostPatchArgs args) { + PIN_SetSyscallArgument(args.ctxt, args.std, pathReg, pathArg); + free(patchPathMem); + return PPA_NOTHING; + }; + } else { + info("Patched SYS_open to match %s, left unpatched (no patch)", fileName.c_str()); + } + } else { + //info("Non-matching SYS_open/at, path %s (canonical %s)", fileName.c_str(), absPath.c_str()); + } + } else { + //info("Non-realpath file %s (%s)", fileName.c_str(), pathArg); + } + + return NullPostPatch; +} + diff --git a/src/virt/patchdefs.h b/src/virt/patchdefs.h new file mode 100644 index 00000000..2e8a171c --- /dev/null +++ b/src/virt/patchdefs.h @@ -0,0 +1,60 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +// Definitions of which patch functions handle which syscalls +// Uses macros, assumes you'll include this from somewhere else + +// Unconditional patches + +// File system -- fs.cpp +PF(SYS_open, PatchOpen); +PF(SYS_openat, PatchOpen); + +// Port virtualization -- ports.cpp +PF(SYS_bind, PatchBind); +PF(SYS_getsockname, PatchGetsockname); +PF(SYS_connect, PatchConnect); + +// CPU virtualization -- cpu.cpp +PF(SYS_getcpu, PatchGetcpu); +PF(SYS_sched_getaffinity, PatchSchedGetaffinity); +PF(SYS_sched_setaffinity, PatchSchedSetaffinity); + + +// Conditional patches, only when not fast-forwarded + +// Time virtualization -- time.cpp +PF(SYS_gettimeofday, PatchGettimeofday); +PF(SYS_time, PatchTime); +PF(SYS_clock_gettime, PatchClockGettime); +PF(SYS_nanosleep, PatchNanosleep); +PF(SYS_clock_nanosleep, PatchNanosleep); + +// Timeout virtualization -- timeout.cpp +PF(SYS_futex, PatchTimeoutSyscall); +PF(SYS_epoll_wait, PatchTimeoutSyscall); +PF(SYS_epoll_pwait, PatchTimeoutSyscall); +PF(SYS_poll, PatchTimeoutSyscall); + diff --git a/src/virt/port_virtualizer.h b/src/virt/port_virtualizer.h new file mode 100644 index 00000000..fd0ca36b --- /dev/null +++ b/src/virt/port_virtualizer.h @@ -0,0 +1,73 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef VIRT_PORT_VIRTUALIZER_H_ +#define VIRT_PORT_VIRTUALIZER_H_ + +/* Simple class to keep tabs on virtualized ports */ + +#include "g_std/g_unordered_map.h" +#include "galloc.h" +#include "locks.h" + +class PortVirtualizer : public GlobAlloc { + private: + g_unordered_map realToVirt; + g_unordered_map virtToReal; + + lock_t pvLock; + + public: + PortVirtualizer() { + futex_init(&pvLock); + } + + //Must always lock before any operation, and unlock after! + //lock() unlock() are external because bind() spans multiple methods + void lock() { futex_lock(&pvLock); } + void unlock() { futex_unlock(&pvLock); } + + //Note there's no error checking for a bind that binds on a previous one. + //If someone previous bound to that port, the virtualization code should just go ahead with that mapping and + //either let bind() fail (if the previous bind is stil active) or succeed (if the previous bind ended) + void registerBind(int virt, int real) { + realToVirt[real] = virt; + virtToReal[virt] = real; + } + + //Returns -1 if not in map. For connect() and bind() + int lookupReal(int virt) { + g_unordered_map::iterator it = virtToReal.find(virt); + return (it == virtToReal.end())? 
-1 : it->second; + } + + //Returns -1 if not in map. For getsockname(), where the OS returns real and we need virt + int lookupVirt(int real) { + g_unordered_map::iterator it = realToVirt.find(real); + return (it == realToVirt.end())? -1 : it->second; + } +}; + +#endif // VIRT_PORT_VIRTUALIZER_H_ diff --git a/src/virt/ports.cpp b/src/virt/ports.cpp new file mode 100644 index 00000000..cd94723a --- /dev/null +++ b/src/virt/ports.cpp @@ -0,0 +1,167 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include +#include +#include +#include "process_tree.h" +#include "virt/common.h" +#include "virt/port_virtualizer.h" +#include "zsim.h" + +// Helper function +static struct sockaddr_in* GetSockAddr(ADDRINT guestAddr, size_t guestSize) { + if (guestSize != sizeof(struct sockaddr_in)) return NULL; + struct sockaddr_in* res = (struct sockaddr_in*) malloc(sizeof(struct sockaddr_in)); + if (!safeCopy((struct sockaddr_in*) guestAddr, res) || res->sin_family != AF_INET) { + free(res); + return NULL; + } + return res; +} + +// Patch functions + +PostPatchFn PatchBind(PrePatchArgs args) { + CONTEXT* ctxt = args.ctxt; + SYSCALL_STANDARD std = args.std; + + ADDRINT sAddrPtr = PIN_GetSyscallArgument(ctxt, std, 1); + ADDRINT sLen = PIN_GetSyscallArgument(ctxt, std, 2); + struct sockaddr_in* servAddr = GetSockAddr(sAddrPtr, sLen); + if (!servAddr) return NullPostPatch; // invalid input or socketaddr family + + int port = ntohs(servAddr->sin_port); + if (port != 0) { // if port is 0, we don't need to virtualize, OS will assign a free one + uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); + info("Virtualizing bind() to port %d (domain %d)", port, portDomain); + zinfo->portVirt[portDomain]->lock(); //unlocked either on write failure below, or after the syscall + int prevPort = zinfo->portVirt[portDomain]->lookupReal(port); + if (prevPort == -1) { + // No previous bind(), request whatever + servAddr->sin_port = htons(0); + } else { + // There was a previous bind() on this port, so we reuse the translation + // This should work in MOST cases, but may fail if the port is reused by something else and we conflict. 
Should be quite rare, since Linux tries to space out anonymous reassigns to the same port + warn("bind() to port %d, this port already has a translation %d, using it --- in rare cases this may fail when the unvirtualized case should succeed", port, prevPort); + servAddr->sin_port = htons(prevPort); + } + PIN_SetSyscallArgument(ctxt, std, 1, (ADDRINT) servAddr); + + auto postFn = [sAddrPtr](PostPatchArgs args) { + struct sockaddr_in* servAddr = (struct sockaddr_in*) PIN_GetSyscallArgument(args.ctxt, args.std, 1); + int virtPort = ntohs(((struct sockaddr_in*)sAddrPtr)->sin_port); + + uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); + REG out = (REG) PIN_GetSyscallNumber(args.ctxt, args.std); + if (out == 0) { + int sockfd = PIN_GetSyscallArgument(args.ctxt, args.std, 0); + struct sockaddr_in sockName; //NOTE: sockaddr_in to sockaddr casts are fine + socklen_t sockLen = sizeof(sockName); + if (getsockname(sockfd, (struct sockaddr*)&sockName, &sockLen) != 0) { + panic("bind() succeeded, but getsockname() failed..."); + } + int realPort = ntohs(sockName.sin_port); + + info("Virtualized bind(), v: %d r: %d (domain %d)", virtPort, realPort, portDomain); + zinfo->portVirt[portDomain]->registerBind(virtPort, realPort); + } else { + info("bind(): tried to virtualize port, but bind() failed, not registering (domain %d)", portDomain); + } + zinfo->portVirt[portDomain]->unlock(); // note lock was in prepatch + + // Restore original descriptor, free alloc + PIN_SetSyscallArgument(args.ctxt, args.std, 1, sAddrPtr); + free(servAddr); + return PPA_NOTHING; + }; + return postFn; + } else { + free(servAddr); + return NullPostPatch; + } +} + +PostPatchFn PatchGetsockname(PrePatchArgs args) { + return [](PostPatchArgs args) { + CONTEXT* ctxt = args.ctxt; + SYSCALL_STANDARD std = args.std; + + REG out = (REG) PIN_GetSyscallNumber(ctxt, std); + if (out == 0) { + ADDRINT sockAddrPtr = PIN_GetSyscallArgument(ctxt, std, 1); + struct sockaddr_in sockAddr; + //safecopy may fail here and that's OK, it's just not a sockaddr_in, so not IPv4 + if (safeCopy((struct sockaddr_in*) sockAddrPtr, &sockAddr) && sockAddr.sin_family == AF_INET) { + int realPort = ntohs(sockAddr.sin_port); + uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); + zinfo->portVirt[portDomain]->lock(); + int virtPort = zinfo->portVirt[portDomain]->lookupVirt(realPort); + zinfo->portVirt[portDomain]->unlock(); + if (virtPort != -1) { + info("Virtualizing getsockname() on previously bound port, r: %d, v: %d (domain %d)", realPort, virtPort, portDomain); + sockAddr.sin_port = htons(virtPort); + if (!safeCopy(&sockAddr, (struct sockaddr_in*) sockAddrPtr)) { + panic("getsockname() virt fail"); + } + } + } + } //else this failed, no need to virtualize + return PPA_NOTHING; + }; +} + +PostPatchFn PatchConnect(PrePatchArgs args) { + CONTEXT* ctxt = args.ctxt; + SYSCALL_STANDARD std = args.std; + + ADDRINT sAddrPtr = PIN_GetSyscallArgument(ctxt, std, 1); + ADDRINT sLen = PIN_GetSyscallArgument(ctxt, std, 2); + struct sockaddr_in* servAddr = GetSockAddr(sAddrPtr, sLen); + if (!servAddr) return NullPostPatch; // invalid input or socketaddr family + + int virtPort = ntohs(servAddr->sin_port); + uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); + zinfo->portVirt[portDomain]->lock(); + int realPort = zinfo->portVirt[portDomain]->lookupReal(virtPort); + zinfo->portVirt[portDomain]->unlock(); + if (realPort != -1) { + info("Virtualizing connect(), v: %d r: %d (domain %d)", virtPort, realPort, portDomain); + 
servAddr->sin_port = htons(realPort); + PIN_SetSyscallArgument(ctxt, std, 1, (ADDRINT) servAddr); + + auto postFn = [sAddrPtr, servAddr](PostPatchArgs args) { + //Restore original (virt) port (NOTE: regardless of whether connect() succeeded or not) + PIN_SetSyscallArgument(args.ctxt, args.std, 1, sAddrPtr); + free(servAddr); + return PPA_NOTHING; + }; + return postFn; + } else { + free(servAddr); + return NullPostPatch; + } +} + diff --git a/src/virt/syscall_name.cpp b/src/virt/syscall_name.cpp new file mode 100644 index 00000000..81f93e0f --- /dev/null +++ b/src/virt/syscall_name.cpp @@ -0,0 +1,42 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +// Put this before any includes +#define QUOTED_(x) #x +#define QUOTED(x) QUOTED_(x) + +#define __SYSCALL(a, b) QUOTED(b), + +static const char* syscallNames[] = { +#include +"INVALID" +}; + +#include + +const char* GetSyscallName(uint32_t syscall) { + return (syscall >= sizeof(syscallNames)/sizeof(syscallNames[0]))? "INVALID" : syscallNames[syscall]; +} + diff --git a/src/virt/syscall_name.h b/src/virt/syscall_name.h new file mode 100644 index 00000000..d35fa09e --- /dev/null +++ b/src/virt/syscall_name.h @@ -0,0 +1,31 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef VIRT_SYSCALL_NAME_H_ +#define VIRT_SYSCALL_NAME_H_ + +const char* GetSyscallName(uint32_t syscall); + +#endif // VIRT_SYSCALL_NAME_H_ diff --git a/src/virt/time.cpp b/src/virt/time.cpp new file mode 100644 index 00000000..9a498783 --- /dev/null +++ b/src/virt/time.cpp @@ -0,0 +1,309 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include +#include "log.h" +#include "process_tree.h" +#include "rdtsc.h" +#include "scheduler.h" +#include "virt/common.h" +#include "virt/time_conv.h" +#include "zsim.h" + +static bool SkipTimeVirt(PrePatchArgs args) { + // having both conditions ensures that we don't virtualize in the interim of toggling ff ON + return args.isNopThread || zinfo->procArray[procIdx]->isInFastForward(); +} + +// General virtualization functions, used for both syscall and vsyscall/vdso virtualization + +void VirtGettimeofday(uint32_t tid, ADDRINT arg0) { + trace(TimeVirt, "[%d] Post-patching gettimeofday", tid); + if (arg0) { + struct timeval tv; + if (!safeCopy((struct timeval*) arg0, &tv)) { + info("Failed read of gettimeofday() input"); + return; + } + trace(TimeVirt, "Orig %ld sec, %ld usec", tv.tv_sec, tv.tv_usec); + uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); + uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); + tv = nsToTimeval(zinfo->clockDomainInfo[domain].realtimeOffsetNs + simNs); + + trace(TimeVirt, " Patched %ld sec, %ld usec", tv.tv_sec, tv.tv_usec); + if (!safeCopy(&tv, (struct timeval*) arg0)) { + info("Failed write of gettimeofday() output"); + } + } +} + +void VirtTime(uint32_t tid, REG* out, ADDRINT arg0) { + time_t origRes = (time_t)out; + if (origRes == ((time_t)-1) || origRes == ((time_t)-EFAULT)) { //glibc will return -1; raw syscall will return -EFAULT + info("[%d] post-patch time(), returned error or EFAULT (%ld)", tid, origRes); + return; + } + + uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); + uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); + time_t tm = (zinfo->clockDomainInfo[domain].realtimeOffsetNs + simNs)/NSPS; + + trace(TimeVirt, "[%d] Post-patching time(), orig %ld, new %ld", tid, (time_t)*out, tm); + *out = (REG)tm; + if (arg0) { + if (!safeCopy(&tm, (time_t*) arg0)) { + info("Failed write of time() output"); + } + } +} + +void VirtClockGettime(uint32_t tid, ADDRINT arg0, ADDRINT arg1) { + uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); + ClockDomainInfo& dom = zinfo->clockDomainInfo[domain]; + + 
//arg0 indicates clock type + uint64_t offset = 0; + switch (arg0) { + case CLOCK_MONOTONIC: + offset = dom.monotonicOffsetNs; + break; + case CLOCK_REALTIME: + offset = dom.realtimeOffsetNs; + break; + case CLOCK_PROCESS_CPUTIME_ID: + offset = dom.processOffsetNs; + break; + case CLOCK_THREAD_CPUTIME_ID: + offset = dom.processOffsetNs; + warn("clock_gettime() called with CLOCK_THREAD_CPUTIME_ID, faking with CLOCK_PROCESS_CPUTIME_ID"); + break; + } //with others, the result does not matter --- actual clock_gettime has returned -1 and EINVAL + + if (arg1) { + struct timespec ts; + if (!safeCopy((struct timespec*) arg1, &ts)) { + info("Failed read of clock_gettime() input"); + return; + } + + trace(TimeVirt, "Patching clock_gettime()"); + trace(TimeVirt, "Orig %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &ts); + trace(TimeVirt, "MONOTONIC %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); + clock_gettime(CLOCK_REALTIME, &ts); + trace(TimeVirt, "REALTIME %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + trace(TimeVirt, "PROCESS_CPUTIME_ID %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + trace(TimeVirt, "THREAD_CPUTIME_ID %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); + + uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); + ts = nsToTimespec(offset + simNs); + trace(TimeVirt, "Patched %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); + + if (!safeCopy(&ts, (struct timespec*) arg1)) { + info("Failed write of gettimeofday() output"); + } + } +} + +// Syscall patch wrappers + +PostPatchFn PatchGettimeofday(PrePatchArgs args) { + if (SkipTimeVirt(args)) return NullPostPatch; + return [](PostPatchArgs args) { + trace(TimeVirt, "[%d] Post-patching SYS_gettimeofday", args.tid); + ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); + VirtGettimeofday(args.tid, arg0); + return PPA_NOTHING; + }; +} + +PostPatchFn PatchTime(PrePatchArgs args) { + if (SkipTimeVirt(args)) return NullPostPatch; + return [](PostPatchArgs args) { + trace(TimeVirt, "[%d] Post-patching SYS_time", args.tid); + ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); + REG out = (REG)PIN_GetSyscallNumber(args.ctxt, args.std); + VirtTime(args.tid, &out, arg0); + PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT) out); // hack, we have no way of setting the result, this changes rax just as well + return PPA_NOTHING; + }; +} + +PostPatchFn PatchClockGettime(PrePatchArgs args) { + if (SkipTimeVirt(args)) return NullPostPatch; + return [](PostPatchArgs args) { + trace(TimeVirt, "[%d] Post-patching SYS_clock_gettime", args.tid); + ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); + ADDRINT arg1 = PIN_GetSyscallArgument(args.ctxt, args.std, 1); + VirtClockGettime(args.tid, arg0, arg1); + return PPA_NOTHING; + }; +} + +// SYS_nanosleep & SYS_clock_nanosleep + +PostPatchFn PatchNanosleep(PrePatchArgs args) { + if (SkipTimeVirt(args)) return NullPostPatch; + + CONTEXT* ctxt = args.ctxt; + SYSCALL_STANDARD std = args.std; + uint32_t syscall = PIN_GetSyscallNumber(ctxt, std); + bool isClock = (syscall == SYS_clock_nanosleep); + assert(isClock || syscall == SYS_nanosleep); + + struct timespec* ts; + uint64_t offsetNsec = 0; + if (isClock) { + trace(TimeVirt, "[%d] Pre-patching SYS_clock_nanosleep", tid); + int flags = (int) PIN_GetSyscallArgument(ctxt, std, 1); + ts = (struct timespec*) PIN_GetSyscallArgument(ctxt, std, 2); + if (flags == TIMER_ABSTIME) { + trace(TimeVirt, "[%d] SYS_clock_nanosleep 
requests TIMER_ABSTIME, offsetting", tid); + uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); + uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); + offsetNsec = simNs + zinfo->clockDomainInfo[domain].realtimeOffsetNs; + } + } else { + trace(TimeVirt, "[%d] Pre-patching SYS_nanosleep", tid); + ts = (struct timespec*) PIN_GetSyscallArgument(ctxt, std, 0); + } + + // Check preconditions + // FIXME, shouldn't this use safeCopy?? + if (!ts) return NullPostPatch; // kernel will return EFAULT + if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec > 999999999) return false; // kernel will return EINVAL + + uint64_t waitNsec = timespecToNs(*ts); + if (waitNsec >= offsetNsec) waitNsec -= offsetNsec; + else waitNsec = 0; + + uint64_t waitCycles = nsToCycles(waitNsec); + uint64_t waitPhases = waitCycles/zinfo->phaseLength + 1; //wait at least 1 phase + uint64_t wakeupPhase = zinfo->numPhases + waitPhases; + + volatile uint32_t* futexWord = zinfo->sched->markForSleep(procIdx, args.tid, wakeupPhase); + + // Save args + ADDRINT arg0 = PIN_GetSyscallArgument(ctxt, std, 0); + ADDRINT arg1 = PIN_GetSyscallArgument(ctxt, std, 1); + ADDRINT arg2 = PIN_GetSyscallArgument(ctxt, std, 2); + ADDRINT arg3 = PIN_GetSyscallArgument(ctxt, std, 3); + struct timespec* rem = (struct timespec*) PIN_GetSyscallArgument(ctxt, std, isClock? 3 : 1); + + // Turn this into a non-timed FUTEX_WAIT syscall + PIN_SetSyscallNumber(ctxt, std, SYS_futex); + PIN_SetSyscallArgument(ctxt, std, 0, (ADDRINT)futexWord); + PIN_SetSyscallArgument(ctxt, std, 1, (ADDRINT)FUTEX_WAIT); + PIN_SetSyscallArgument(ctxt, std, 2, (ADDRINT)1 /*by convention, see sched code*/); + PIN_SetSyscallArgument(ctxt, std, 3, (ADDRINT)NULL); + + return [isClock, wakeupPhase, arg0, arg1, arg2, arg3, rem](PostPatchArgs args) { + CONTEXT* ctxt = args.ctxt; + SYSCALL_STANDARD std = args.std; + + if (isClock) { + trace(TimeVirt, "[%d] Post-patching SYS_clock_nanosleep", tid); + } else { + trace(TimeVirt, "[%d] Post-patching SYS_nanosleep", tid); + } + + int res = (int)(-PIN_GetSyscallNumber(ctxt, std)); + if (res == EWOULDBLOCK) { + trace(TimeVirt, "Fixing EWOULDBLOCK --> 0"); + PIN_SetSyscallNumber(ctxt, std, 0); // this is fine, you just called a very very short sleep + } else if (res == EINTR) { + PIN_SetSyscallNumber(ctxt, std, -EINTR); // we got an interrupt + } else { + trace(TimeVirt, "%d", res); + assert(res == 0); + } + + // Restore pre-call args + PIN_SetSyscallArgument(ctxt, std, 0, arg0); + PIN_SetSyscallArgument(ctxt, std, 1, arg1); + PIN_SetSyscallArgument(ctxt, std, 2, arg2); + PIN_SetSyscallArgument(ctxt, std, 3, arg3); + + // Handle remaining time stuff + if (rem) { + if (res == EINTR) { + assert(wakeupPhase >= zinfo->numPhases); // o/w why is this EINTR... 
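+                    // On EINTR, report the unslept simulated time back to the app
+                    // through rem, as the unvirtualized syscall would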
+ uint64_t remainingCycles = wakeupPhase - zinfo->numPhases; + uint64_t remainingNsecs = remainingCycles*1000/zinfo->freqMHz; + rem->tv_sec = remainingNsecs/1000000000; + rem->tv_nsec = remainingNsecs % 1000000000; + } else { + rem->tv_sec = 0; + rem->tv_nsec = 0; + } + } + + return PPA_NOTHING; + }; +} + +// Clock domain query functions + +void VirtCaptureClocks(bool isDeffwd) { + uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); + ClockDomainInfo& dom = zinfo->clockDomainInfo[domain]; + futex_lock(&dom.lock); + if (isDeffwd || dom.realtimeOffsetNs == 0) { + info("[%d] Adjusting clocks, domain %d, de-ffwd %d", procIdx, domain, isDeffwd); + + struct timespec realtime; + struct timespec monotonic; + struct timespec process; + clock_gettime(CLOCK_REALTIME, &realtime); + clock_gettime(CLOCK_MONOTONIC, &monotonic); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &process); + uint64_t realRdtsc = rdtsc(); + + uint64_t curCycles = zinfo->globPhaseCycles; + uint64_t curNs = cyclesToNs(curCycles); + + uint64_t realtimeNs = timespecToNs(realtime); + uint64_t monotonicNs = timespecToNs(monotonic); + uint64_t processNs = timespecToNs(process); + + dom.realtimeOffsetNs = realtimeNs - curNs; + dom.monotonicOffsetNs = monotonicNs - curNs; + dom.processOffsetNs = processNs - curNs; + dom.rdtscOffset = realRdtsc - curCycles; + + //info("Offsets: %ld %ld %ld %ld", dom.realtimeOffsetNs, dom.monotonicOffsetNs, dom.processOffsetNs, dom.rdtscOffset) + } + futex_unlock(&dom.lock); +} + +uint64_t VirtGetPhaseRDTSC() { + uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); + return zinfo->clockDomainInfo[domain].rdtscOffset + zinfo->globPhaseCycles; +} + diff --git a/src/virt/time_conv.h b/src/virt/time_conv.h new file mode 100644 index 00000000..b6c01cf4 --- /dev/null +++ b/src/virt/time_conv.h @@ -0,0 +1,67 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef VIRT_TIME_CONV_H_ +#define VIRT_TIME_CONV_H_ + +#include + +// Helper functions to translate between ns, timespec/timeval, and cycles + +// ns per s :) +#define NSPS (1000*1000*1000L) + +static inline uint64_t timevalToNs(struct timeval tv) { + return tv.tv_sec*NSPS + tv.tv_usec*1000L; +} + +static inline uint64_t timespecToNs(struct timespec ts) { + return ts.tv_sec*NSPS + ts.tv_nsec; +} + +static inline struct timeval nsToTimeval(uint64_t ns) { + struct timeval res; + res.tv_sec = ns/NSPS; + res.tv_usec = (ns % NSPS)/1000; + return res; +} + +static inline struct timespec nsToTimespec(uint64_t ns) { + struct timespec res; + res.tv_sec = ns/NSPS; + res.tv_nsec = (ns % NSPS); + return res; +} + +static inline uint64_t cyclesToNs(uint64_t cycles) { + return cycles*1000/zinfo->freqMHz; +} + +static inline uint64_t nsToCycles(uint64_t cycles) { + return cycles*zinfo->freqMHz/1000; +} + +#endif // VIRT_TIME_CONV_H_ diff --git a/src/virt/timeout.cpp b/src/virt/timeout.cpp new file mode 100644 index 00000000..0824f822 --- /dev/null +++ b/src/virt/timeout.cpp @@ -0,0 +1,251 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#include "constants.h" +#include "log.h" +#include "scheduler.h" +#include "process_tree.h" +#include "virt/common.h" +#include "virt/syscall_name.h" +#include "virt/time_conv.h" +#include "zsim.h" + +static struct timespec fakeTimeouts[MAX_THREADS]; //for syscalls that use timespec to indicate a timeout +static bool inFakeTimeoutMode[MAX_THREADS]; + +static bool SkipTimeoutVirt(PrePatchArgs args) { + // having both conditions ensures that we don't virtualize in the interim of toggling ff ON + return args.isNopThread || zinfo->procArray[procIdx]->isInFastForward(); +} + +// Helper function, see /usr/include/linux/futex.h +static bool isFutexWaitOp(int op) { + switch (op & FUTEX_CMD_MASK) { //handles PRIVATE / REALTIME as well + case FUTEX_WAIT: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + return true; + default: + return false; + } +} + +static bool isFutexWakeOp(int op) { + switch (op & FUTEX_CMD_MASK) { + case FUTEX_WAKE: + case FUTEX_REQUEUE: + case FUTEX_CMP_REQUEUE: + case FUTEX_WAKE_OP: + case FUTEX_WAKE_BITSET: + case FUTEX_CMP_REQUEUE_PI: + return true; + default: + return false; + } +} + + +static int getTimeoutArg(int syscall) { + if (syscall == SYS_poll) return 2; + return 3; // futex, epoll_wait, epoll_pwait +} + +static bool PrePatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD std, int syscall) { + assert(!inFakeTimeoutMode[tid]); // canary: this will probably fail... + int64_t waitNsec = 0; + + // Per-syscall manipulation. This code either succeeds, fakes timeout value and sets waitNsec, or returns false + int timeoutArg = getTimeoutArg(syscall); + if (syscall == SYS_futex) { + // Check preconditions + assert(timeoutArg == 3); + int* uaddr = (int*) PIN_GetSyscallArgument(ctxt, std, 0); + int op = (int) PIN_GetSyscallArgument(ctxt, std, 1); + const struct timespec* timeout = (const struct timespec*) PIN_GetSyscallArgument(ctxt, std, 3); + + //info("FUTEX op %d waitOp %d uaddr %p ts %p", op, isFutexWaitOp(op), uaddr, timeout); + if (!(uaddr && isFutexWaitOp(op) && timeout)) return false; // not a timeout FUTEX_WAIT + + waitNsec = timeout->tv_sec*1000000000L + timeout->tv_nsec; + + if (op | FUTEX_CLOCK_REALTIME) { + // NOTE: FUTEX_CLOCK_REALTIME is not a documented interface AFAIK, but looking at the Linux source code + with some verification, this is the xlat + uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); + uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); + uint64_t offsetNs = simNs + zinfo->clockDomainInfo[domain].realtimeOffsetNs; + //info(" REALTIME FUTEX: %ld %ld %ld %ld", waitNsec, simNs, offsetNs, waitNsec-offsetNs); + waitNsec = (waitNsec > (int64_t)offsetNs)? (waitNsec - offsetNs) : 0; + } + + if (waitNsec <= 0) return false; // while technically waiting, this does not block. I'm guessing this is done for trylocks? It's weird. 
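+        // Swap the app's timeout for a short host-time one and track the real
+        // deadline in simulated phases (markForSleep below); the post-patch code
+        // re-executes the syscall until the simulated wakeup phase is reached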
+ + fakeTimeouts[tid].tv_sec = 0; + fakeTimeouts[tid].tv_nsec = 20*1000*1000; // timeout every 20ms of actual host time + PIN_SetSyscallArgument(ctxt, std, 3, (ADDRINT)&fakeTimeouts[tid]); + } else { + assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll); + int timeout = (int) PIN_GetSyscallArgument(ctxt, std, timeoutArg); + if (timeout <= 0) return false; + //info("[%d] pre-patch epoll_wait/pwait", tid); + + PIN_SetSyscallArgument(ctxt, std, timeoutArg, 20); // 20ms timeout + waitNsec = ((uint64_t)timeout)*1000*1000; // timeout is in ms + } + + //info("[%d] pre-patch %s (%d) waitNsec = %ld", tid, GetSyscallName(syscall), syscall, waitNsec); + + uint64_t waitCycles = waitNsec*zinfo->freqMHz/1000; + uint64_t waitPhases = waitCycles/zinfo->phaseLength; + if (waitPhases < 2) waitPhases = 2; // at least wait 2 phases; this should basically eliminate the chance that we get a SIGSYS before we start executing the syscal instruction + uint64_t wakeupPhase = zinfo->numPhases + waitPhases; + + /*volatile uint32_t* futexWord =*/ zinfo->sched->markForSleep(procIdx, tid, wakeupPhase); // we still want to mark for sleep, bear with me... + inFakeTimeoutMode[tid] = true; + return true; +} + +static bool PostPatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD std, int syscall, ADDRINT prevIp, ADDRINT timeoutArgVal) { + assert(inFakeTimeoutMode[tid]); + int res = (int)PIN_GetSyscallNumber(ctxt, std); + + // Decide if it timed out + bool timedOut; + if (syscall == SYS_futex) { + timedOut = (res == -ETIMEDOUT); + } else { + timedOut = (res == 0); + } + + bool isSleeping = zinfo->sched->isSleeping(procIdx, tid); + + // Decide whether to retry + bool retrySyscall; + if (!timedOut) { + if (isSleeping) zinfo->sched->notifySleepEnd(procIdx, tid); + retrySyscall = false; + } else { + retrySyscall = isSleeping; + } + + if (retrySyscall && zinfo->procArray[procIdx]->isInFastForward()) { + warn("[%d] Fast-forwarding started, not retrying timeout syscall (%s)", tid, GetSyscallName(syscall)); + retrySyscall = false; + assert(isSleeping); + zinfo->sched->notifySleepEnd(procIdx, tid); + } + + if (retrySyscall) { + // ADDRINT curIp = PIN_GetContextReg(ctxt, REG_INST_PTR); + //info("[%d] post-patch, retrying, IP: 0x%lx -> 0x%lx", tid, curIp, prevIp); + PIN_SetContextReg(ctxt, REG_INST_PTR, prevIp); + PIN_SetSyscallNumber(ctxt, std, syscall); + } else { + // Restore timeout arg + PIN_SetSyscallArgument(ctxt, std, getTimeoutArg(syscall), timeoutArgVal); + inFakeTimeoutMode[tid] = false; + + // Restore arg? I don't think we need this! 
+ /*if (syscall == SYS_futex) { + PIN_SetSyscallNumber(ctxt, std, -ETIMEDOUT); + } else { + assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll); + PIN_SetSyscallNumber(ctxt, std, 0); //no events returned + }*/ + } + + //info("[%d] post-patch %s (%d), timedOut %d, sleeping (orig) %d, retrying %d, orig res %d, patched res %d", tid, GetSyscallName(syscall), syscall, timedOut, isSleeping, retrySyscall, res, (int)PIN_GetSyscallNumber(ctxt, std)); + return retrySyscall; +} + +/* Notify scheduler about FUTEX_WAITs woken up by FUTEX_WAKEs, FUTEX_WAKE entries, and FUTEX_WAKE exits */ + +struct FutexInfo { + int op; + int val; +}; + +FutexInfo PrePatchFutex(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD std) { + FutexInfo fi; + fi.op = (int) PIN_GetSyscallArgument(ctxt, std, 1); + fi.val = (int) PIN_GetSyscallArgument(ctxt, std, 2); + if (isFutexWakeOp(fi.op)) { + zinfo->sched->notifyFutexWakeStart(procIdx, tid, fi.val); + } + return fi; +} + +void PostPatchFutex(uint32_t tid, FutexInfo fi, CONTEXT* ctxt, SYSCALL_STANDARD std) { + int res = (int) PIN_GetSyscallNumber(ctxt, std); + if (isFutexWaitOp(fi.op) && res == 0) { + zinfo->sched->notifyFutexWaitWoken(procIdx, tid); + } else if (isFutexWakeOp(fi.op) && res >= 0) { + /* In contrast to the futex manpage, from the kernel's futex.c + * (do_futex), WAKE and WAKE_OP return the number of threads woken up, + * but the REQUEUE and CMP_REQUEUE and REQUEUE_PI ops return the number + * of threads woken up + requeued. However, these variants + * (futex_requeue) first try to wake the specified threads, then + * requeue as many other threads as they can. + * + * Therefore, this wokenUp expression should be correct for all variants + * of SYS_futex that wake up threads (WAKE, REQUEUE, CMP_REQUEUE, ...) + */ + uint32_t wokenUp = std::min(res, fi.val); + zinfo->sched->notifyFutexWakeEnd(procIdx, tid, wokenUp); + } +} + +PostPatchFn PatchTimeoutSyscall(PrePatchArgs args) { + if (SkipTimeoutVirt(args)) return NullPostPatch; + + int syscall = PIN_GetSyscallNumber(args.ctxt, args.std); + assert_msg(syscall == SYS_futex || syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll, + "Invalid timeout syscall %d", syscall); + + FutexInfo fi = {0, 0}; + if (syscall == SYS_futex) fi = PrePatchFutex(args.tid, args.ctxt, args.std); + + if (PrePatchTimeoutSyscall(args.tid, args.ctxt, args.std, syscall)) { + ADDRINT prevIp = PIN_GetContextReg(args.ctxt, REG_INST_PTR); + ADDRINT timeoutArgVal = PIN_GetSyscallArgument(args.ctxt, args.std, getTimeoutArg(syscall)); + return [syscall, prevIp, timeoutArgVal, fi](PostPatchArgs args) { + if (PostPatchTimeoutSyscall(args.tid, args.ctxt, args.std, syscall, prevIp, timeoutArgVal)) { + return PPA_USE_NOP_PTRS; // retry + } else { + if (syscall == SYS_futex) PostPatchFutex(args.tid, fi, args.ctxt, args.std); + return PPA_USE_JOIN_PTRS; // finish + } + }; + } else { + if (syscall == SYS_futex) { + return [fi](PostPatchArgs args) { + PostPatchFutex(args.tid, fi, args.ctxt, args.std); + return PPA_NOTHING; + }; + } else { + return NullPostPatch; + } + } +} + diff --git a/src/virt/virt.cpp b/src/virt/virt.cpp new file mode 100644 index 00000000..379854e4 --- /dev/null +++ b/src/virt/virt.cpp @@ -0,0 +1,88 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. 
+ * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include +#include "constants.h" +#include "log.h" +#include "virt/common.h" +#include "virt/syscall_name.h" +#include "virt/virt.h" + +#define MAX_SYSCALLS 350 // doesn't need to be accurate + +PrePatchFn prePatchFunctions[MAX_SYSCALLS]; +PostPatchFn postPatchFunctions[MAX_THREADS]; + +const PostPatchFn NullPostPatch = [](PostPatchArgs) {return PPA_NOTHING;}; + +// Common prepatch functions +PostPatchFn NullPatch(PrePatchArgs) { + return NullPostPatch; +} + +PostPatchFn WarnTimingRelated(PrePatchArgs args) { + uint32_t syscall = PIN_GetSyscallNumber(args.ctxt, args.std); + warn("[%d] Executing unvirtualized potentially timing-sensitive syscall: %s (%d)", args.tid, GetSyscallName(syscall), syscall); + return NullPostPatch; +} + +// Define all patch functions +#define PF(syscall, pfn) PostPatchFn pfn(PrePatchArgs args); +#include "virt/patchdefs.h" +#undef PF + +void VirtInit() { + for (uint32_t i = 0; i < MAX_SYSCALLS; i++) prePatchFunctions[i] = NullPatch; + + // Issue warnings on timing-sensitive syscalls + for (uint32_t syscall : {SYS_select, SYS_getitimer, SYS_alarm, SYS_setitimer, SYS_semop, + SYS_gettimeofday, SYS_times, SYS_rt_sigtimedwait, SYS_time, SYS_futex, SYS_mq_timedsend, + SYS_mq_timedreceive, SYS_pselect6, SYS_ppoll}) { + prePatchFunctions[syscall] = WarnTimingRelated; + } + + // Bind all patch functions + #define PF(syscall, pfn) prePatchFunctions[syscall] = pfn; + #include "virt/patchdefs.h" + #undef PF +} + + +// Dispatch methods +void VirtSyscallEnter(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, const char* patchRoot, bool isNopThread) { + uint32_t syscall = PIN_GetSyscallNumber(ctxt, std); + if (syscall >= MAX_SYSCALLS) { + warn("syscall %d out of range", syscall); + postPatchFunctions[tid] = NullPostPatch; + } else { + postPatchFunctions[tid] = prePatchFunctions[syscall]({tid, ctxt, std, patchRoot, isNopThread}); + } +} + +PostPatchAction VirtSyscallExit(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std) { + return postPatchFunctions[tid]({tid, ctxt, std}); +} + diff --git a/src/virt/virt.h b/src/virt/virt.h new file mode 100644 index 00000000..5b0a9de3 --- /dev/null +++ b/src/virt/virt.h @@ -0,0 +1,54 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. 
+ * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef VIRT_VIRT_H_ +#define VIRT_VIRT_H_ + +// External virt interface + +#include "pin.H" + +enum PostPatchAction { + PPA_NOTHING, + PPA_USE_NOP_PTRS, + PPA_USE_JOIN_PTRS, +}; + +void VirtInit(); // per-process, not global +void VirtSyscallEnter(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, const char* patchRoot, bool isNopThread); +PostPatchAction VirtSyscallExit(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std); + +// VDSO / external virt functions +void VirtGettimeofday(uint32_t tid, ADDRINT arg0); +void VirtTime(uint32_t tid, REG* retVal, ADDRINT arg0); +void VirtClockGettime(uint32_t tid, ADDRINT arg0, ADDRINT arg1); +void VirtGetcpu(uint32_t tid, uint32_t cpu, ADDRINT arg0, ADDRINT arg1); + +// Time virtualization direct functions +void VirtCaptureClocks(bool isDeffwd); // called on start and ffwd to get all clocks together +uint64_t VirtGetPhaseRDTSC(); + +#endif // VIRT_VIRT_H_ diff --git a/src/weave_md1_mem.h b/src/weave_md1_mem.h new file mode 100644 index 00000000..f0c4db75 --- /dev/null +++ b/src/weave_md1_mem.h @@ -0,0 +1,122 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef WEAVE_MD1_MEM_H_ +#define WEAVE_MD1_MEM_H_ + +#include "mem_ctrls.h" +#include "timing_event.h" +#include "zsim.h" + +/* Implements a weave-phase memory controller based on the MD1 controller, returning the same + * latencies MD1 would return in the weave phase, but + */ + +//Weave-phase event +class WeaveMemAccEvent : public TimingEvent { + private: + uint32_t lat; + + public: + WeaveMemAccEvent(uint32_t _lat, int32_t domain, uint32_t preDelay, uint32_t postDelay) : TimingEvent(preDelay, postDelay, domain), lat(_lat) {} + + void simulate(uint64_t startCycle) { + done(startCycle + lat); + } +}; + +// Actual controller +class WeaveMD1Memory : public MD1Memory { + private: + const uint32_t zeroLoadLatency; + const uint32_t boundLatency; + const uint32_t domain; + uint32_t preDelay, postDelay; + + public: + WeaveMD1Memory(uint32_t lineSize, uint32_t megacyclesPerSecond, uint32_t megabytesPerSecond, uint32_t _zeroLoadLatency, uint32_t _boundLatency, uint32_t _domain, g_string& _name) : + MD1Memory(lineSize, megacyclesPerSecond, megabytesPerSecond, _zeroLoadLatency, _name), zeroLoadLatency(_zeroLoadLatency), boundLatency(_boundLatency), domain(_domain) + { + preDelay = zeroLoadLatency/2; + postDelay = zeroLoadLatency - preDelay; + } + + uint64_t access(MemReq& req) { + uint64_t realRespCycle = MD1Memory::access(req); + uint32_t realLatency = realRespCycle - req.cycle; + + uint64_t respCycle = req.cycle + ((req.type == PUTS)? 0 : boundLatency); + assert(realRespCycle >= respCycle); + assert(req.type == PUTS || realLatency >= zeroLoadLatency); + + if ((req.type != PUTS) && zinfo->eventRecorders[req.srcId]) { + WeaveMemAccEvent* memEv = new (zinfo->eventRecorders[req.srcId]) WeaveMemAccEvent(realLatency-zeroLoadLatency, domain, preDelay, postDelay); + memEv->setMinStartCycle(req.cycle); + TimingRecord tr = {req.lineAddr, req.cycle, respCycle, req.type, memEv, memEv}; + zinfo->eventRecorders[req.srcId]->pushRecord(tr); + } + + // info("Access to %lx at %ld, %d lat, returning %d", req.lineAddr, req.cycle, realLatency, zeroLoadLatency); + return respCycle; + } +}; + +// OK, even simpler... +class WeaveSimpleMemory : public SimpleMemory { + private: + uint32_t zeroLoadLatency; + uint32_t domain; + uint32_t preDelay, postDelay; + + public: + WeaveSimpleMemory(uint32_t _latency, uint32_t _zeroLoadLatency, uint32_t _domain, g_string& _name) : + SimpleMemory(_latency, _name), zeroLoadLatency(_zeroLoadLatency), domain(_domain) + { + assert(_latency >= _zeroLoadLatency); + preDelay = zeroLoadLatency/2; + postDelay = zeroLoadLatency - preDelay; + } + + uint64_t access(MemReq& req) { + uint64_t realRespCycle = SimpleMemory::access(req); + uint32_t realLatency = realRespCycle - req.cycle; + + uint64_t respCycle = req.cycle + ((req.type == PUTS)? 
0 : zeroLoadLatency); + assert(realRespCycle >= respCycle); + assert(req.type == PUTS || realLatency >= zeroLoadLatency); + + if ((req.type != PUTS) && zinfo->eventRecorders[req.srcId]) { + WeaveMemAccEvent* memEv = new (zinfo->eventRecorders[req.srcId]) WeaveMemAccEvent(realLatency-zeroLoadLatency, domain, preDelay, postDelay); + memEv->setMinStartCycle(req.cycle); + TimingRecord tr = {req.lineAddr, req.cycle, respCycle, req.type, memEv, memEv}; + zinfo->eventRecorders[req.srcId]->pushRecord(tr); + } + + // info("Access to %lx at %ld, %d lat, returning %d", req.lineAddr, req.cycle, realLatency, zeroLoadLatency); + return respCycle; + } +}; + +#endif // WEAVE_MD1_MEM_H_ diff --git a/src/zsim.cpp b/src/zsim.cpp new file mode 100644 index 00000000..a9bf01e0 --- /dev/null +++ b/src/zsim.cpp @@ -0,0 +1,1560 @@ +/** $glic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * Copyright (C) 2011 Google Inc. + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +/* The Pin-facing part of the simulator */ + +#include "zsim.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "constants.h" +#include "contention_sim.h" +#include "core.h" +#include "cpuenum.h" +#include "cpuid.h" +#include "debug_zsim.h" +#include "event_queue.h" +#include "galloc.h" +#include "init.h" +#include "log.h" +#include "pin.H" +#include "pin_cmd.h" +#include "process_tree.h" +#include "profile_stats.h" +#include "scheduler.h" +#include "stats.h" +//#include "syscall_funcs.h" +#include "virt/virt.h" + +//#include //can't include this, conflicts with PIN's + +/* Command-line switches (used to pass info from harness that cannot be passed through the config file, most config is file-based) */ + +KNOB KnobProcIdx(KNOB_MODE_WRITEONCE, "pintool", + "procIdx", "0", "zsim process idx (internal)"); + +KNOB KnobShmid(KNOB_MODE_WRITEONCE, "pintool", + "shmid", "0", "SysV IPC shared memory id used when running in multi-process mode"); + +KNOB KnobConfigFile(KNOB_MODE_WRITEONCE, "pintool", + "config", "zsim.cfg", "config file name (only needed for the first simulated process)"); + +//We need to know these as soon as we start, otherwise we could not log anything until we attach and read the config +KNOB KnobLogToFile(KNOB_MODE_WRITEONCE, "pintool", + "logToFile", "false", "true if all messages should be logged to a logfile instead of stdout/err"); + +KNOB KnobOutputDir(KNOB_MODE_WRITEONCE, "pintool", + "outputDir", "./", "absolute path to write output files into"); + + + +/* ===================================================================== */ + +INT32 Usage() { + cerr << "zsim simulator pintool" << endl; + cerr << KNOB_BASE::StringKnobSummary(); + cerr << endl; + return -1; +} + +/* Global Variables */ + +GlobSimInfo* zinfo; + +/* Per-process variables */ + +uint32_t procIdx; +uint32_t lineBits; //process-local for performance, but logically global +Address procMask; + +static ProcessTreeNode* procTreeNode; + +//tid to cid translation +#define INVALID_CID ((uint32_t)-1) +#define UNINITIALIZED_CID ((uint32_t)-2) //Value set at initialization + +static uint32_t cids[MAX_THREADS]; + +// Per TID core pointers (TODO: phase out cid/tid state --- this is enough) +Core* cores[MAX_THREADS]; + +static inline void clearCid(uint32_t tid) { + assert(tid < MAX_THREADS); + assert(cids[tid] != INVALID_CID); + cids[tid] = INVALID_CID; + cores[tid] = NULL; +} + +static inline void setCid(uint32_t tid, uint32_t cid) { + assert(tid < MAX_THREADS); + assert(cids[tid] == INVALID_CID); + assert(cid < zinfo->numCores); + cids[tid] = cid; + cores[tid] = zinfo->cores[cid]; +} + +uint32_t getCid(uint32_t tid) { + //assert(tid < MAX_THREADS); //these assertions are fine, but getCid is called everywhere, so they are expensive! 
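+    //cids[] is kept in sync by setCid/clearCid above; INVALID_CID means the thread is not currently scheduled on a core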
+ uint32_t cid = cids[tid]; + //assert(cid != INVALID_CID); + return cid; +} + +// Internal function declarations +void EnterFastForward(); +void ExitFastForward(); + +VOID SimThreadStart(THREADID tid); +VOID SimThreadFini(THREADID tid); +VOID SimEnd(); + +VOID HandleMagicOp(THREADID tid, ADDRINT op); + +VOID FakeCPUIDPre(THREADID tid, REG eax, REG ecx); +VOID FakeCPUIDPost(THREADID tid, ADDRINT* eax, ADDRINT* ebx, ADDRINT* ecx, ADDRINT* edx); //REG* eax, REG* ebx, REG* ecx, REG* edx); + +VOID FakeRDTSCPost(THREADID tid, REG* eax, REG* edx); + +VOID VdsoInstrument(INS ins); +VOID FFThread(VOID* arg); + +/* Indirect analysis calls to work around PIN's synchronization + * + * NOTE(dsm): Be extremely careful when modifying this code. It is simple, but + * it runs VERY frequently. For example, with 24-byte structs on a fairly + * unoptimized L1 cache, this code introduced a 4% overhead, down to 2% with + * 32-byte structs. Also, be aware that a miss or unpredictable indirect jump + * is about the worst kind of pain you can inflict on an ooo core, so ensure + * that 1) there's no false sharing, and 2) these pointers are modified + * sparingly. + */ + +InstrFuncPtrs fPtrs[MAX_THREADS] ATTR_LINE_ALIGNED; //minimize false sharing + +VOID PIN_FAST_ANALYSIS_CALL IndirectLoadSingle(THREADID tid, ADDRINT addr) { + fPtrs[tid].loadPtr(tid, addr); +} + +VOID PIN_FAST_ANALYSIS_CALL IndirectStoreSingle(THREADID tid, ADDRINT addr) { + fPtrs[tid].storePtr(tid, addr); +} + +VOID PIN_FAST_ANALYSIS_CALL IndirectBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + fPtrs[tid].bblPtr(tid, bblAddr, bblInfo); +} + +VOID PIN_FAST_ANALYSIS_CALL IndirectRecordBranch(THREADID tid, ADDRINT branchPc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) { + fPtrs[tid].branchPtr(tid, branchPc, taken, takenNpc, notTakenNpc); +} + +VOID PIN_FAST_ANALYSIS_CALL IndirectPredLoadSingle(THREADID tid, ADDRINT addr, BOOL pred) { + fPtrs[tid].predLoadPtr(tid, addr, pred); +} + +VOID PIN_FAST_ANALYSIS_CALL IndirectPredStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) { + fPtrs[tid].predStorePtr(tid, addr, pred); +} + + +//Non-simulation variants of analysis functions + +// Join variants: Call join on the next instrumentation poin and return to analysis code +void Join(uint32_t tid) { + assert(fPtrs[tid].type == FPTR_JOIN); + uint32_t cid = zinfo->sched->join(procIdx, tid); //can block + setCid(tid, cid); + + if (unlikely(zinfo->terminationConditionMet)) { + info("Caught termination condition on join, exiting"); + zinfo->sched->leave(procIdx, tid, cid); + SimEnd(); + } + + fPtrs[tid] = cores[tid]->GetFuncPtrs(); //back to normal pointers +} + +VOID JoinAndLoadSingle(THREADID tid, ADDRINT addr) { + Join(tid); + fPtrs[tid].loadPtr(tid, addr); +} + +VOID JoinAndStoreSingle(THREADID tid, ADDRINT addr) { + Join(tid); + fPtrs[tid].storePtr(tid, addr); +} + +VOID JoinAndBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + Join(tid); + fPtrs[tid].bblPtr(tid, bblAddr, bblInfo); +} + +VOID JoinAndRecordBranch(THREADID tid, ADDRINT branchPc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) { + Join(tid); + fPtrs[tid].branchPtr(tid, branchPc, taken, takenNpc, notTakenNpc); +} + +VOID JoinAndPredLoadSingle(THREADID tid, ADDRINT addr, BOOL pred) { + Join(tid); + fPtrs[tid].predLoadPtr(tid, addr, pred); +} + +VOID JoinAndPredStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) { + Join(tid); + fPtrs[tid].predStorePtr(tid, addr, pred); +} + +// NOP variants: Do nothing +VOID NOPLoadStoreSingle(THREADID tid, ADDRINT addr) {} +VOID 
NOPBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) {} +VOID NOPRecordBranch(THREADID tid, ADDRINT addr, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) {} +VOID NOPPredLoadStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) {} + +// FF is basically NOP except for basic blocks +VOID FFBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + if (unlikely(!procTreeNode->isInFastForward())) { + SimThreadStart(tid); + } +} + +// FFI is instruction-based fast-forwarding +/* FFI works as follows: when in fast-forward, we install a special FF BBL func + * ptr that counts instructions and checks whether we have reached the switch + * point. Then, it exits FF, and queues an event that counts the instructions + * where the app should be scheduled. That event cannot access any local state, + * so when it hits the limit, it just makes the process enter FF. On that + * entry, we install a special handler that advances to the next FFI point and + * installs the normal FFI handlers (pretty much like joins work). + * + * REQUIREMENTS: Single-threaded during FF (non-FF can be MT) + */ + +//TODO (dsm): Went for quick, dirty and contained here. This could use a cleanup. + +// FFI state +static bool ffiEnabled; +static uint32_t ffiPoint; +static uint64_t ffiInstrsDone; +static uint64_t ffiInstrsLimit; +static bool ffiNFF; + +//Track the non-FF instructions executed at the beginning of this and last interval. +//Can only be updated at ends of phase, by the NFF tracking event. +static uint64_t* ffiFFStartInstrs; //hack, needs to be a pointer, written to outside this process +static uint64_t* ffiPrevFFStartInstrs; + +static const InstrFuncPtrs& GetFFPtrs(); + +VOID FFITrackNFFInterval() { + assert(!procTreeNode->isInFastForward()); + assert(ffiInstrsDone < ffiInstrsLimit); //unless you have ~10-instr FFWds, this does not happen + + //Queue up an event to detect and end FF + //Note vars are captured, so these lambdas can be called from any process + uint64_t startInstrs = *ffiFFStartInstrs; + uint32_t p = procIdx; + uint64_t* _ffiFFStartInstrs = ffiFFStartInstrs; + uint64_t* _ffiPrevFFStartInstrs = ffiPrevFFStartInstrs; + auto ffiGet = [p, startInstrs]() { return zinfo->processStats->getProcessInstrs(p) - startInstrs; }; + auto ffiFire = [p, _ffiFFStartInstrs, _ffiPrevFFStartInstrs]() { + info("FFI: Entering fast-forward for process %d", p); + /* Note this is sufficient due to the lack of reinstruments on FF, and this way we do not need to touch global state */ + futex_lock(&zinfo->ffLock); + assert(!zinfo->procArray[p]->isInFastForward()); + zinfo->procArray[p]->enterFastForward(); + futex_unlock(&zinfo->ffLock); + *_ffiPrevFFStartInstrs = *_ffiFFStartInstrs; + *_ffiFFStartInstrs = zinfo->processStats->getProcessInstrs(p); + }; + zinfo->eventQueue->insert(makeAdaptiveEvent(ffiGet, ffiFire, 0, ffiInstrsLimit - ffiInstrsDone, MAX_IPC*zinfo->phaseLength)); + + ffiNFF = true; +} + +// Called on process start +VOID FFIInit() { + const g_vector& ffiPoints = procTreeNode->getFFIPoints(); + if (!ffiPoints.empty()) { + if (zinfo->ffReinstrument) panic("FFI and reinstrumenting on FF switches are incompatible"); + ffiEnabled = true; + ffiPoint = 0; + ffiInstrsDone = 0; + ffiInstrsLimit = ffiPoints[0]; + + ffiFFStartInstrs = gm_calloc(1); + ffiPrevFFStartInstrs = gm_calloc(1); + ffiNFF = false; + info("FFI mode initialized, %ld ffiPoints", ffiPoints.size()); + if (!procTreeNode->isInFastForward()) FFITrackNFFInterval(); + } else { + ffiEnabled = false; + } +} + +//Set the next ffiPoint, or finish +VOID 
FFIAdvance() { + const g_vector& ffiPoints = procTreeNode->getFFIPoints(); + ffiPoint++; + if (ffiPoint >= ffiPoints.size()) { + info("Last ffiPoint reached, %ld instrs, limit %ld", ffiInstrsDone, ffiInstrsLimit); + SimEnd(); + } else { + info("ffiPoint reached, %ld instrs, limit %ld", ffiInstrsDone, ffiInstrsLimit); + ffiInstrsLimit += ffiPoints[ffiPoint]; + } +} + +VOID FFIBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + ffiInstrsDone += bblInfo->instrs; + if (unlikely(ffiInstrsDone >= ffiInstrsLimit)) { + FFIAdvance(); + assert(procTreeNode->isInFastForward()); + futex_lock(&zinfo->ffLock); + info("FFI: Exiting fast-forward"); + ExitFastForward(); + futex_unlock(&zinfo->ffLock); + FFITrackNFFInterval(); + + SimThreadStart(tid); + } +} + +// One-off, called after we go from NFF to FF +VOID FFIEntryBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { + ffiInstrsDone += *ffiFFStartInstrs - *ffiPrevFFStartInstrs; //add all instructions executed in the NFF phase + FFIAdvance(); + assert(ffiNFF); + ffiNFF = false; + fPtrs[tid] = GetFFPtrs(); + FFIBasicBlock(tid, bblAddr, bblInfo); +} + +// Non-analysis pointer vars +static const InstrFuncPtrs joinPtrs = {JoinAndLoadSingle, JoinAndStoreSingle, JoinAndBasicBlock, JoinAndRecordBranch, JoinAndPredLoadSingle, JoinAndPredStoreSingle, FPTR_JOIN}; +static const InstrFuncPtrs nopPtrs = {NOPLoadStoreSingle, NOPLoadStoreSingle, NOPBasicBlock, NOPRecordBranch, NOPPredLoadStoreSingle, NOPPredLoadStoreSingle, FPTR_NOP}; +static const InstrFuncPtrs ffPtrs = {NOPLoadStoreSingle, NOPLoadStoreSingle, FFBasicBlock, NOPRecordBranch, NOPPredLoadStoreSingle, NOPPredLoadStoreSingle, FPTR_NOP}; + +static const InstrFuncPtrs ffiPtrs = {NOPLoadStoreSingle, NOPLoadStoreSingle, FFIBasicBlock, NOPRecordBranch, NOPPredLoadStoreSingle, NOPPredLoadStoreSingle, FPTR_NOP}; +static const InstrFuncPtrs ffiEntryPtrs = {NOPLoadStoreSingle, NOPLoadStoreSingle, FFIEntryBasicBlock, NOPRecordBranch, NOPPredLoadStoreSingle, NOPPredLoadStoreSingle, FPTR_NOP}; + +static const InstrFuncPtrs& GetFFPtrs() { + return ffiEnabled? (ffiNFF? ffiEntryPtrs : ffiPtrs) : ffPtrs; +} + +//Fast-forwarding +void EnterFastForward() { + assert(!procTreeNode->isInFastForward()); + procTreeNode->enterFastForward(); + __sync_synchronize(); //Make change globally visible + + //Re-instrument; VM/client lock are not needed + if (zinfo->ffReinstrument) { + PIN_RemoveInstrumentation(); + } + //Transition to FF; we have the ff lock, this should be safe with end of phase code. 
This avoids profiling the end of a simulation as bound time + //NOTE: Does not work well with multiprocess runs + zinfo->profSimTime->transition(PROF_FF); +} + + +void ExitFastForward() { + assert(procTreeNode->isInFastForward()); + + VirtCaptureClocks(true /*exiting ffwd*/); + + procTreeNode->exitFastForward(); + __sync_synchronize(); //make change globally visible + + //Re-instrument; VM/client lock are not needed + if (zinfo->ffReinstrument) { + PIN_RemoveInstrumentation(); + } +} + + + +//Termination +volatile uint32_t perProcessEndFlag; + +VOID SimEnd(); + +VOID CheckForTermination() { + assert(zinfo->terminationConditionMet == false); + if (zinfo->maxPhases && zinfo->numPhases >= zinfo->maxPhases) { + zinfo->terminationConditionMet = true; + info("Max phases reached (%ld)", zinfo->numPhases); + return; + } + + if (zinfo->maxMinInstrs) { + uint64_t minInstrs = zinfo->cores[0]->getInstrs(); + for (uint32_t i = 1; i < zinfo->numCores; i++) { + uint64_t coreInstrs = zinfo->cores[i]->getInstrs(); + if (coreInstrs < minInstrs && coreInstrs > 0) { + minInstrs = coreInstrs; + } + } + + if (minInstrs >= zinfo->maxMinInstrs) { + zinfo->terminationConditionMet = true; + info("Max min instructions reached (%ld)", minInstrs); + return; + } + } + + if (zinfo->maxTotalInstrs) { + uint64_t totalInstrs = 0; + for (uint32_t i = 0; i < zinfo->numCores; i++) { + totalInstrs += zinfo->cores[i]->getInstrs(); + } + + if (totalInstrs >= zinfo->maxTotalInstrs) { + zinfo->terminationConditionMet = true; + info("Max total (aggregate) instructions reached (%ld)", totalInstrs); + return; + } + } + + if (zinfo->maxSimTimeNs) { + uint64_t simNs = zinfo->profSimTime->count(PROF_BOUND) + zinfo->profSimTime->count(PROF_WEAVE); + if (simNs >= zinfo->maxSimTimeNs) { + zinfo->terminationConditionMet = true; + info("Max simulation time reached (%ld ns)", simNs); + return; + } + } + + if (zinfo->externalTermPending) { + zinfo->terminationConditionMet = true; + info("Terminating due to external notification"); + return; + } +} + +/* This is called by the scheduler at the end of a phase. At that point, zinfo->numPhases + * has not incremented, so it denotes the END of the current phase + */ +VOID EndOfPhaseActions() { + zinfo->profSimTime->transition(PROF_WEAVE); + if (zinfo->globalPauseFlag) { + info("Simulation entering global pause"); + zinfo->profSimTime->transition(PROF_FF); + while (zinfo->globalPauseFlag) usleep(20*1000); + zinfo->profSimTime->transition(PROF_WEAVE); + info("Global pause DONE"); + } + + // Done before tick() to avoid deadlock in most cases when entering synced ffwd (can we still deadlock with sleeping threads?) 
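+    //While any process is in synced fast-forward, poll every 20 ms and charge the wait to the FF time bucket instead of weave time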
+ if (unlikely(zinfo->globalSyncedFFProcs)) { + info("Simulation paused due to synced fast-forwarding"); + zinfo->profSimTime->transition(PROF_FF); + while (zinfo->globalSyncedFFProcs) usleep(20*1000); + zinfo->profSimTime->transition(PROF_WEAVE); + info("Synced fast-forwarding done, resuming simulation"); + } + + CheckForTermination(); + zinfo->contentionSim->simulatePhase(zinfo->globPhaseCycles + zinfo->phaseLength); + zinfo->eventQueue->tick(); + zinfo->profSimTime->transition(PROF_BOUND); +} + + +uint32_t TakeBarrier(uint32_t tid, uint32_t cid) { + uint32_t newCid = zinfo->sched->sync(procIdx, tid, cid); + clearCid(tid); //this is after the sync for a hack needed to make EndOfPhase reliable + setCid(tid, newCid); + + if (procTreeNode->isInFastForward()) { + info("Thread %d entering fast-forward", tid); + clearCid(tid); + zinfo->sched->leave(procIdx, tid, newCid); + SimThreadFini(tid); + fPtrs[tid] = GetFFPtrs(); + } else if (zinfo->terminationConditionMet) { + info("Termination condition met, exiting"); + zinfo->sched->leave(procIdx, tid, newCid); + SimEnd(); //need to call this on a per-process basis... + } + + return newCid; +} + +/* ===================================================================== */ + +#if 0 +static void PrintIp(THREADID tid, ADDRINT ip) { + if (zinfo->globPhaseCycles > 1000000000L /*&& zinfo->globPhaseCycles < 1000030000L*/) { + info("[%d] %ld 0x%lx", tid, zinfo->globPhaseCycles, ip); + } +} +#endif + +VOID Instruction(INS ins) { + //Uncomment to print an instruction trace + //INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)PrintIp, IARG_THREAD_ID, IARG_REG_VALUE, REG_INST_PTR, IARG_END); + + if (!procTreeNode->isInFastForward() || !zinfo->ffReinstrument) { + AFUNPTR LoadFuncPtr = (AFUNPTR) IndirectLoadSingle; + AFUNPTR StoreFuncPtr = (AFUNPTR) IndirectStoreSingle; + + AFUNPTR PredLoadFuncPtr = (AFUNPTR) IndirectPredLoadSingle; + AFUNPTR PredStoreFuncPtr = (AFUNPTR) IndirectPredStoreSingle; + + if (INS_IsMemoryRead(ins)) { + if (!INS_IsPredicated(ins)) { + INS_InsertCall(ins, IPOINT_BEFORE, LoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD_EA, IARG_END); + } else { + INS_InsertCall(ins, IPOINT_BEFORE, PredLoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD_EA, IARG_EXECUTING, IARG_END); + } + } + + if (INS_HasMemoryRead2(ins)) { + if (!INS_IsPredicated(ins)) { + INS_InsertCall(ins, IPOINT_BEFORE, LoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD2_EA, IARG_END); + } else { + INS_InsertCall(ins, IPOINT_BEFORE, PredLoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD2_EA, IARG_EXECUTING, IARG_END); + } + } + + if (INS_IsMemoryWrite(ins)) { + if (!INS_IsPredicated(ins)) { + INS_InsertCall(ins, IPOINT_BEFORE, StoreFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYWRITE_EA, IARG_END); + } else { + INS_InsertCall(ins, IPOINT_BEFORE, PredStoreFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYWRITE_EA, IARG_EXECUTING, IARG_END); + } + } + + // Instrument only conditional branches + if (INS_Category(ins) == XED_CATEGORY_COND_BR) { + INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) IndirectRecordBranch, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, + IARG_INST_PTR, IARG_BRANCH_TAKEN, IARG_BRANCH_TARGET_ADDR, IARG_FALLTHROUGH_ADDR, IARG_END); + } + } + + //Intercept and process magic ops + /* xchg %rcx, %rcx is our chosen magic op. It is effectively a NOP, but it + * is never emitted by any x86 compiler, as they use other (recommended) nop + * instructions or sequences. 
+ */ + if (INS_IsXchg(ins) && INS_OperandReg(ins, 0) == REG_RCX && INS_OperandReg(ins, 1) == REG_RCX) { + //info("Instrumenting magic op"); + INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) HandleMagicOp, IARG_THREAD_ID, IARG_REG_VALUE, REG_ECX, IARG_END); + } + + if (INS_Opcode(ins) == XED_ICLASS_CPUID) { + INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) FakeCPUIDPre, IARG_THREAD_ID, IARG_REG_VALUE, REG_EAX, IARG_REG_VALUE, REG_ECX, IARG_END); + INS_InsertCall(ins, IPOINT_AFTER, (AFUNPTR) FakeCPUIDPost, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_EAX, + IARG_REG_REFERENCE, REG_EBX, IARG_REG_REFERENCE, REG_ECX, IARG_REG_REFERENCE, REG_EDX, IARG_END); + } + + if (INS_IsRDTSC(ins)) { + //No pre; note that this also instruments RDTSCP + INS_InsertCall(ins, IPOINT_AFTER, (AFUNPTR) FakeRDTSCPost, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_EAX, IARG_REG_REFERENCE, REG_EDX, IARG_END); + } + + //Must run for every instruction + VdsoInstrument(ins); +} + + +VOID Trace(TRACE trace, VOID *v) { + if (!procTreeNode->isInFastForward() || !zinfo->ffReinstrument) { + // Visit every basic block in the trace + for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl)) { + BblInfo* bblInfo = Decoder::decodeBbl(bbl, zinfo->oooDecode); + BBL_InsertCall(bbl, IPOINT_BEFORE /*could do IPOINT_ANYWHERE if we redid load and store simulation in OOO*/, (AFUNPTR)IndirectBasicBlock, IARG_FAST_ANALYSIS_CALL, + IARG_THREAD_ID, IARG_ADDRINT, BBL_Address(bbl), IARG_PTR, bblInfo, IARG_END); + } + } + + //Instruction instrumentation now here to ensure proper ordering + for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl)) { + for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins)) { + Instruction(ins); + } + } +} + +/***** vDSO instrumentation and patching code *****/ + +// Helper function to find section address +// adapted from http://outflux.net/aslr/aslr.c +struct Section { + uintptr_t start; + uintptr_t end; +}; + +static Section FindSection(const char* sec) { + /* locate the vdso from the maps file */ + char buf[129]; + buf[128] = '\0'; + FILE * fp = fopen("/proc/self/maps", "r"); + Section res = {0x0, 0x0}; + if (fp) { + while (fgets(buf, 128, fp)) { + if (strstr(buf, sec)) { + char * dash = strchr(buf, '-'); + if (dash) { + *dash='\0'; + res.start = strtoul(buf, NULL, 16); + res.end = strtoul(dash+1, NULL, 16); + } + } + } + } + + //Uncomment to print maps + //fseek(fp, 0, SEEK_SET); + //while (fgets(buf, 128, fp)) info("%s", buf); + return res; +} + +// Initialization code and global per-process data + +enum VdsoFunc {VF_CLOCK_GETTIME, VF_GETTIMEOFDAY, VF_TIME, VF_GETCPU}; + +static std::unordered_map vdsoEntryMap; +static uintptr_t vdsoStart; +static uintptr_t vdsoEnd; + +//Used to warn +static uintptr_t vsyscallStart; +static uintptr_t vsyscallEnd; +static bool vsyscallWarned = false; + +void VdsoInsertFunc(IMG vi, const char* fName, VdsoFunc func) { + ADDRINT baseAddr = IMG_LowAddress(vi); + RTN rtn = RTN_FindByName(vi, fName); + if (rtn == RTN_Invalid()) { + warn("Did not find %s in vDSO", fName); + } else { + ADDRINT rtnAddr = RTN_Address(rtn) - baseAddr + vdsoStart; + vdsoEntryMap[rtnAddr] = func; + } +} + +void VdsoInit() { + Section vdso = FindSection("vdso"); + vdsoStart = vdso.start; + vdsoEnd = vdso.end; + + if (!vdsoEnd) { + // Non-fatal, but should not happen --- even static binaries get vDSO AFAIK + warn("vDSO not found"); + return; + } + + // Write it out + std::stringstream file_ss; + file_ss << zinfo->outputDir << "/vdso.dso." 
<< procIdx; + const char* file = file_ss.str().c_str(); + FILE* vf = fopen(file, "w"); + fwrite(reinterpret_cast(vdso.start), 1, vdsoEnd-vdsoStart, vf); + fclose(vf); + + // Load it and analyze it + IMG vi = IMG_Open(file); + if (!IMG_Valid(vi)) panic("Loaded vDSO not valid"); + + VdsoInsertFunc(vi, "clock_gettime", VF_CLOCK_GETTIME); + VdsoInsertFunc(vi, "__vdso_clock_gettime", VF_CLOCK_GETTIME); + + VdsoInsertFunc(vi, "gettimeofday", VF_GETTIMEOFDAY); + VdsoInsertFunc(vi, "__vdso_gettimeofday", VF_GETTIMEOFDAY); + + VdsoInsertFunc(vi, "time", VF_TIME); + VdsoInsertFunc(vi, "__vdso_time", VF_TIME); + + VdsoInsertFunc(vi, "getcpu", VF_GETCPU); + VdsoInsertFunc(vi, "__vdso_getcpu", VF_GETCPU); + + info("vDSO info initialized"); + IMG_Close(vi); + remove(file); + + Section vsyscall = FindSection("vsyscall"); + vsyscallStart = vsyscall.start; + vsyscallEnd = vsyscall.end; + // Could happen in the future when vsyscall is phased out, kill the warn then + if (!vsyscallEnd) warn("vsyscall page not found"); +} + +// Register hooks to intercept and virtualize time-related vsyscalls and vdso syscalls, as they do not show up as syscalls! +// NOTE: getcpu is also a VDSO syscall, but is not patched for now + +// Per-thread VDSO data +struct VdsoPatchData { + // Input arguments --- must save them because they are not caller-saved + // Careful: REG is 32 bits; PIN_REGISTER, which is the actual type of the + // pointer, is 64 bits but opaque. We just use ADDRINT, it works + ADDRINT arg0, arg1; + VdsoFunc func; + uint32_t level; // if 0, invalid. Used for VDSO-internal calls +}; +VdsoPatchData vdsoPatchData[MAX_THREADS]; + +// Analysis functions + +VOID VdsoEntryPoint(THREADID tid, uint32_t func, ADDRINT arg0, ADDRINT arg1) { + if (vdsoPatchData[tid].level) { + // common, in Ubuntu 11.10 several vdso functions jump back to the callpoint + // info("vDSO function (%d) called from vdso (%d), level %d, skipping", func, vdsoPatchData[tid].func, vdsoPatchData[tid].level); + } else { + vdsoPatchData[tid].arg0 = arg0; + vdsoPatchData[tid].arg1 = arg1; + vdsoPatchData[tid].func = (VdsoFunc)func; + vdsoPatchData[tid].level++; + } +} + +VOID VdsoCallPoint(THREADID tid) { + assert(vdsoPatchData[tid].level); + vdsoPatchData[tid].level++; + // info("vDSO internal callpoint, now level %d", vdsoPatchData[tid].level); //common +} + +VOID VdsoRetPoint(THREADID tid, REG* raxPtr) { + if (vdsoPatchData[tid].level == 0) { + warn("vDSO return without matching call --- did we instrument all the functions?"); + return; + } + vdsoPatchData[tid].level--; + if (vdsoPatchData[tid].level) { + // info("vDSO return post level %d, skipping ret handling", vdsoPatchData[tid].level); //common + return; + } + if (fPtrs[tid].type != FPTR_NOP || vdsoPatchData[tid].func == VF_GETCPU) { + // info("vDSO patching for func %d", vdsoPatchData[tid].func); // common + ADDRINT arg0 = vdsoPatchData[tid].arg0; + ADDRINT arg1 = vdsoPatchData[tid].arg1; + switch (vdsoPatchData[tid].func) { + case VF_CLOCK_GETTIME: + VirtClockGettime(tid, arg0, arg1); + break; + case VF_GETTIMEOFDAY: + VirtGettimeofday(tid, arg0); + break; + case VF_TIME: + VirtTime(tid, raxPtr, arg0); + break; + case VF_GETCPU: + { + uint32_t cpu = cpuenumCpu(procIdx, getCid(tid)); + VirtGetcpu(tid, cpu, arg0, arg1); + } + break; + default: + panic("vDSO garbled func %d", vdsoPatchData[tid].func); + } + } +} + +// Instrumentation function, called for EVERY instruction +VOID VdsoInstrument(INS ins) { + ADDRINT insAddr = INS_Address(ins); + if (unlikely(insAddr >= vdsoStart && insAddr < 
vdsoEnd)) { + if (vdsoEntryMap.find(insAddr) != vdsoEntryMap.end()) { + VdsoFunc func = vdsoEntryMap[insAddr]; + INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) VdsoEntryPoint, IARG_THREAD_ID, IARG_UINT32, (uint32_t)func, IARG_REG_VALUE, REG_RDI, IARG_REG_VALUE, REG_RSI, IARG_END); + } else if (INS_IsCall(ins)) { + INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) VdsoCallPoint, IARG_THREAD_ID, IARG_END); + } else if (INS_IsRet(ins)) { + INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) VdsoRetPoint, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_RAX /* return val */, IARG_END); + } + } + + //Warn on the first vsyscall code translation + if (unlikely(insAddr >= vsyscallStart && insAddr < vsyscallEnd && !vsyscallWarned)) { + warn("Instrumenting vsyscall page code --- this process executes vsyscalls, which zsim does not virtualize!"); + vsyscallWarned = true; + } +} + +/* ===================================================================== */ + + +bool activeThreads[MAX_THREADS]; // set in ThreadStart, reset in ThreadFini, we need this for exec() (see FollowChild) +bool inSyscall[MAX_THREADS]; // set in SyscallEnter, reset in SyscallExit, regardless of state. We MAY need this for ContextChange + +uint32_t CountActiveThreads() { + // Finish all threads in this process w.r.t. the global scheduler + uint32_t activeCount = 0; + for (uint32_t i = 0; i < MAX_THREADS; i++) { + if (activeThreads[i]) activeCount++; + } + return activeCount; +} + +void SimThreadStart(THREADID tid) { + info("Thread %d starting", tid); + if (tid > MAX_THREADS) panic("tid > MAX_THREADS"); + zinfo->sched->start(procIdx, tid, procTreeNode->getMask()); + activeThreads[tid] = true; + + //Pinning +#if 0 + if (true) { + uint32_t nprocs = sysconf(_SC_NPROCESSORS_ONLN); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(tid % nprocs, &cpuset); + //HMM, can we do this? I doubt it + //int result = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + //Since we're running multiprocess, this suffices for now: + int result = sched_setaffinity(getpid(), sizeof(cpu_set_t), &cpuset); + assert(result == 0); + } +#endif + + //Initialize this thread's process-local data + fPtrs[tid] = joinPtrs; //delayed, MT-safe barrier join + clearCid(tid); //just in case, set an invalid cid +} + +VOID ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, VOID *v) { + /* This should only fire for the first thread; I know this is a callback, + * everything is serialized etc; that's the point, we block everything. + * It's here and not in main() because that way the auxiliary threads can + * start. + */ + if (procTreeNode->isInPause()) { + futex_lock(&zinfo->pauseLocks[procIdx]); // initialize + info("Pausing until notified"); + futex_lock(&zinfo->pauseLocks[procIdx]); // block + procTreeNode->exitPause(); + info("Unpaused"); + } + + if (procTreeNode->isInFastForward()) { + info("FF thread %d starting", tid); + fPtrs[tid] = GetFFPtrs(); + } else if (zinfo->registerThreads) { + info("Shadow thread %d starting", tid); + fPtrs[tid] = nopPtrs; + } else { + //Start normal thread + SimThreadStart(tid); + } +} + +VOID SimThreadFini(THREADID tid) { + // zinfo->sched->leave(); //exit syscall (SyscallEnter) already leaves + zinfo->sched->finish(procIdx, tid); + activeThreads[tid] = false; + cids[tid] = UNINITIALIZED_CID; //clear this cid, it might get reused +} + +VOID ThreadFini(THREADID tid, const CONTEXT *ctxt, INT32 flags, VOID *v) { + //NOTE: Thread has no valid cid here! 
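+    //Shadow/NOP threads never joined the scheduler, so only simulated threads go through SimThreadFini below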
+ if (fPtrs[tid].type == FPTR_NOP) { + info("Shadow/NOP thread %d finished", tid); + return; + } else { + SimThreadFini(tid); + info("Thread %d finished", tid); + } +} + +//Need to remove ourselves from running threads in case the syscall is blocking +VOID SyscallEnter(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, VOID *v) { + bool isNopThread = fPtrs[tid].type == FPTR_NOP; + VirtSyscallEnter(tid, ctxt, std, procTreeNode->getPatchRoot(), isNopThread); + + assert(!inSyscall[tid]); inSyscall[tid] = true; + + if (isNopThread) return; + + /* NOTE: It is possible that we take 2 syscalls back to back with any + * intervening instrumentation, so we need to check. In that case, this is + * treated as a single syscall scheduling-wise (no second leave without + * join). + */ + if (fPtrs[tid].type != FPTR_JOIN && !zinfo->blockingSyscalls) { + uint32_t cid = getCid(tid); + // set an invalid cid, ours is property of the scheduler now! + clearCid(tid); + + zinfo->sched->syscallLeave(procIdx, tid, cid, PIN_GetContextReg(ctxt, REG_INST_PTR), + PIN_GetSyscallNumber(ctxt, std), PIN_GetSyscallArgument(ctxt, std, 0), + PIN_GetSyscallArgument(ctxt, std, 1)); + //zinfo->sched->leave(procIdx, tid, cid); + fPtrs[tid] = joinPtrs; // will join at the next instr point + //info("SyscallEnter %d", tid); + } +} + +VOID SyscallExit(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, VOID *v) { + assert(inSyscall[tid]); inSyscall[tid] = false; + + PostPatchAction ppa = VirtSyscallExit(tid, ctxt, std); + if (ppa == PPA_USE_JOIN_PTRS) { + if (!zinfo->blockingSyscalls) { + fPtrs[tid] = joinPtrs; + } else { + fPtrs[tid] = cores[tid]->GetFuncPtrs(); //go back to normal pointers, directly + } + } else if (ppa == PPA_USE_NOP_PTRS) { + fPtrs[tid] = nopPtrs; + } else { + assert(ppa == PPA_NOTHING); + } + + //Avoid joining at all if we are in FF! + if (fPtrs[tid].type == FPTR_JOIN && procTreeNode->isInFastForward()) { + assert(activeThreads[tid]); + info("Thread %d entering fast-forward (from syscall exit)", tid); + //We are not in the scheduler, and have no cid assigned. So, no need to leave() + SimThreadFini(tid); + fPtrs[tid] = GetFFPtrs(); + } + + + if (zinfo->terminationConditionMet) { + info("Caught termination condition on syscall exit, exiting"); + SimEnd(); + } +} + +/* NOTE: We may screw up programs with frequent signals / SIG on syscall. If + * you see this warning and simulations misbehave, it's time to do some testing + * to figure out how to make syscall post-patching work in this case. 
+ */ +VOID ContextChange(THREADID tid, CONTEXT_CHANGE_REASON reason, const CONTEXT* from, CONTEXT* to, INT32 info, VOID* v) { + const char* reasonStr = "?"; + switch (reason) { + case CONTEXT_CHANGE_REASON_FATALSIGNAL: + reasonStr = "FATAL_SIGNAL"; + break; + case CONTEXT_CHANGE_REASON_SIGNAL: + reasonStr = "SIGNAL"; + break; + case CONTEXT_CHANGE_REASON_SIGRETURN: + reasonStr = "SIGRETURN"; + break; + case CONTEXT_CHANGE_REASON_APC: + reasonStr = "APC"; + break; + case CONTEXT_CHANGE_REASON_EXCEPTION: + reasonStr = "EXCEPTION"; + break; + case CONTEXT_CHANGE_REASON_CALLBACK: + reasonStr = "CALLBACK"; + break; + } + + warn("[%d] ContextChange, reason %s, inSyscall %d", tid, reasonStr, inSyscall[tid]); + if (inSyscall[tid]) { + SyscallExit(tid, to, SYSCALL_STANDARD_IA32E_LINUX, NULL); + } + + if (reason == CONTEXT_CHANGE_REASON_FATALSIGNAL) { + info("[%d] Fatal signal caught, finishing", tid); + zinfo->sched->queueProcessCleanup(procIdx, getpid()); //the scheduler watchdog will remove all our state when we are really dead + SimEnd(); + } + + //If this is an issue, we might need to call syscallexit on occasion. I very much doubt it + //SyscallExit(tid, to, SYSCALL_STANDARD_IA32E_LINUX, NULL); //NOTE: For now it is safe to do spurious syscall exits, but careful... +} + +/* Fork and exec instrumentation */ + +//For funky macro stuff +#define QUOTED_(x) #x +#define QUOTED(x) QUOTED_(x) + +// Pre-exec +BOOL FollowChild(CHILD_PROCESS childProcess, VOID * userData) { + //Finish all threads in this process w.r.t. the global scheduler + + uint32_t activeCount = CountActiveThreads(); + if (activeCount > 1) warn("exec() of a multithreaded process! (%d live threads)", activeCount); + + // You can always run process0 = { command = "ls"; startPaused = True; startFastForwarded = True; }; to avoid this + if (procIdx == 0) panic("process0 cannot exec(), it spawns globally needed internal threads (scheduler and contention); run a dummy process0 instead!"); + + //Set up Pin command + //NOTE: perProcessDir may be active, we don't care much... 
run in the same dir as parent process + //NOTE: we recycle our own procIdx on an exec, but fork() changed it so we need to update Pin's command line + g_vector args = zinfo->pinCmd->getPinCmdArgs(procIdx); + uint32_t numArgs = args.size(); + const char* pinArgs[numArgs]; + for (uint32_t i = 0; i < numArgs; i++) pinArgs[i] = args[i].c_str(); + CHILD_PROCESS_SetPinCommandLine(childProcess, numArgs, pinArgs); + + //As a convenience, print the command we are going to execute + const char* const* cArgv; + int cArgc; + CHILD_PROCESS_GetCommandLine(childProcess, &cArgc, &cArgv); + + std::string childCmd = cArgv[0]; + for (int i = 1; i < cArgc; i++) { + childCmd += " "; + childCmd += cArgv[i]; + } + + info("Following exec(): %s", childCmd.c_str()); + + return true; //always follow +} + +static ProcessTreeNode* forkedChildNode = NULL; + +VOID BeforeFork(THREADID tid, const CONTEXT* ctxt, VOID * arg) { + forkedChildNode = procTreeNode->getNextChild(); + info("Thread %d forking, child procIdx=%d", tid, forkedChildNode->getProcIdx()); +} + +VOID AfterForkInParent(THREADID tid, const CONTEXT* ctxt, VOID * arg) { + forkedChildNode = NULL; +} + +VOID AfterForkInChild(THREADID tid, const CONTEXT* ctxt, VOID * arg) { + assert(forkedChildNode); + procTreeNode = forkedChildNode; + procIdx = procTreeNode->getProcIdx(); + bool wasNotStarted = procTreeNode->notifyStart(); + assert(wasNotStarted); //it's a fork, should be new + procMask = ((uint64_t)procIdx) << (64-lineBits); + + char header[64]; + snprintf(header, sizeof(header), "[S %dF] ", procIdx); //append an F to distinguish forked from fork/exec'd + std::stringstream logfile_ss; + logfile_ss << zinfo->outputDir << "/zsim.log." << procIdx; + InitLog(header, KnobLogToFile.Value()? logfile_ss.str().c_str() : NULL); + + info("Forked child (tid %d/%d), PID %d, parent PID %d", tid, PIN_ThreadId(), PIN_GetPid(), getppid()); + + //Initialize process-local per-thread state, even if ThreadStart does so later + for (uint32_t i = 0; i < MAX_THREADS; i++) { + fPtrs[i] = joinPtrs; + cids[i] = UNINITIALIZED_CID; + activeThreads[i] = false; + inSyscall[i] = false; + cores[i] = NULL; + } + + //We need to launch another copy of the FF control thread + PIN_SpawnInternalThread(FFThread, NULL, 64*1024, NULL); + + ThreadStart(tid, NULL, 0, NULL); +} + +/** Finalization **/ + +VOID Fini(int code, VOID * v) { + info("Finished, code %d", code); + //NOTE: In fini, it appears that info() and writes to stdout in general won't work; warn() and stderr still work fine. 
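+    //SimEnd() serializes termination: only the first caller in this process wins the CAS on perProcessEndFlag; later callers just sleep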
+ SimEnd(); +} + +VOID SimEnd() { + if (__sync_bool_compare_and_swap(&perProcessEndFlag, 0, 1) == false) { //failed, note DEPENDS ON STRONG CAS + while (true) { //sleep until thread that won exits for us + struct timespec tm; + tm.tv_sec = 1; + tm.tv_nsec = 0; + nanosleep(&tm, NULL); + } + } + + //at this point, we're in charge of exiting our whole process, but we still need to race for the stats + + //per-process +#ifdef BBL_PROFILING + Decoder::dumpBblProfile(); +#endif + + //global + bool lastToFinish = procTreeNode->notifyEnd(); + (void) lastToFinish; //make gcc happy; not needed anymore, since proc 0 dumps stats + + if (procIdx == 0) { + //Done to preserve the scheduler and contention simulation internal threads + if (zinfo->globalActiveProcs) { + info("Delaying termination until all other processes finish"); + while (zinfo->globalActiveProcs) usleep(100*1000); + info("All other processes done, terminating"); + } + + info("Dumping termination stats"); + zinfo->trigger = 20000; + if (zinfo->periodicStatsBackend) zinfo->periodicStatsBackend->dump(false); //write last phase to periodic backend + zinfo->statsBackend->dump(false); + zinfo->eventualStatsBackend->dump(false); + zinfo->compactStatsBackend->dump(false); + + zinfo->sched->notifyTermination(); + } + + //Uncomment when debugging termination races, which can be rare because they are triggered by threads of a dying process + //sleep(5); + + exit(0); +} + + +// Magic ops interface +/* TODO: In the future, we might want to return values to the program. + * This is definitely doable, but there is no use for it now. + */ +#define ZSIM_MAGIC_OP_ROI_BEGIN (1025) +#define ZSIM_MAGIC_OP_ROI_END (1026) +#define ZSIM_MAGIC_OP_REGISTER_THREAD (1027) +#define ZSIM_MAGIC_OP_HEARTBEAT (1028) + +VOID HandleMagicOp(THREADID tid, ADDRINT op) { + switch (op) { + case ZSIM_MAGIC_OP_ROI_BEGIN: + if (!zinfo->ignoreHooks) { + //TODO: Test whether this is thread-safe + futex_lock(&zinfo->ffLock); + if (procTreeNode->isInFastForward()) { + info("ROI_BEGIN, exiting fast-forward"); + ExitFastForward(); + } else { + warn("Ignoring ROI_BEGIN magic op, not in fast-forward"); + } + futex_unlock(&zinfo->ffLock); + } + return; + case ZSIM_MAGIC_OP_ROI_END: + if (!zinfo->ignoreHooks) { + //TODO: Test whether this is thread-safe + futex_lock(&zinfo->ffLock); + if (procTreeNode->getSyncedFastForward()) { + warn("Ignoring ROI_END magic op on synced FF to avoid deadlock"); + } else if (!procTreeNode->isInFastForward()) { + info("ROI_END, entering fast-forward"); + EnterFastForward(); + //If we don't do this, we'll enter FF on the next phase. Which would be OK, except with synced FF + //we stay in the barrier forever. And deadlock. 
And the deadlock code does nothing, since we're in FF + //So, force immediate entry if we're sync-ffwding + if (procTreeNode->getSyncedFastForward()) { + info("Thread %d entering fast-forward", tid); + uint32_t cid = getCid(tid); + assert(cid != INVALID_CID); + clearCid(tid); + zinfo->sched->leave(procIdx, tid, cid); + SimThreadFini(tid); + fPtrs[tid] = GetFFPtrs(); + } + } else { + warn("Ignoring ROI_END magic op, already in fast-forward"); + } + futex_unlock(&zinfo->ffLock); + } + return; + case ZSIM_MAGIC_OP_REGISTER_THREAD: + if (!zinfo->registerThreads) { + info("Thread %d: Treating REGISTER_THREAD magic op as NOP", tid); + } else { + if (fPtrs[tid].type == FPTR_NOP) { + SimThreadStart(tid); + } else { + warn("Thread %d: Treating REGISTER_THREAD magic op as NOP, thread already registered", tid); + } + } + return; + case ZSIM_MAGIC_OP_HEARTBEAT: + procTreeNode->heartbeat(); //heartbeats are per process for now + return; + + // HACK: Ubik magic ops + case 1029: + case 1030: + case 1031: + case 1032: + case 1033: + return; + default: + panic("Thread %d issued unknown magic op %ld!", tid, op); + } +} + +//CPUIID faking +static uint32_t cpuidEax[MAX_THREADS]; +static uint32_t cpuidEcx[MAX_THREADS]; + +VOID FakeCPUIDPre(THREADID tid, REG eax, REG ecx) { + //info("%d precpuid", tid); + cpuidEax[tid] = eax; + cpuidEcx[tid] = ecx; +} + +VOID FakeCPUIDPost(THREADID tid, ADDRINT* eax, ADDRINT* ebx, ADDRINT* ecx, ADDRINT* edx) { + uint32_t eaxIn = cpuidEax[tid]; + uint32_t ecxIn = cpuidEcx[tid]; + + // Point to record at same (eax,ecx) or immediately before + CpuIdRecord val = {eaxIn, ecxIn, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1}; + CpuIdRecord* pos = std::lower_bound(cpuid_core2, cpuid_core2+(sizeof(cpuid_core2)/sizeof(CpuIdRecord)), val); + if (pos->eaxIn > eaxIn) { + assert(pos > cpuid_core2); + pos--; + } + assert(pos->eaxIn <= eaxIn); + assert(pos->ecxIn <= ecxIn); + + //info("%x %x : %x %x / %x %x %x %x", eaxIn, ecxIn, pos->eaxIn, pos->ecxIn, pos->eax, pos->ebx, pos->ecx, pos->edx); + + uint32_t eaxOut = pos->eax; + uint32_t ebxOut = pos->ebx; + + // patch eax to give the number of cores + if (eaxIn == 4) { + uint32_t ncpus = cpuenumNumCpus(procIdx); + uint32_t eax3126 = ncpus - 1; + // Overflowing 6 bits? + if (zinfo->numCores > 64) eax3126 = 63; //looked into swarm2.csail (4P Westmere-EX, 80 HTs), it sets this to 63 + eaxOut = (eaxOut & ((1<<26)-1)) | (eax3126<<26); + } + + // HT siblings and APIC (core) ID (apparently used; seems Intel-specific) + if (eaxIn == 0x1) { + uint32_t cid = getCid(tid); + uint32_t cpu = cpuenumCpu(procIdx, cid); + uint32_t ncpus = cpuenumNumCpus(procIdx); + uint32_t siblings = MIN(ncpus, (uint32_t)255); + uint32_t apicId = (cpu < ncpus)? MIN(cpu, (uint32_t)255) : 0 /*not scheduled, ffwd?*/; + ebxOut = (ebxOut & 0xffff) | (siblings << 16) | (apicId << 24); + } + + //info("[%d] postcpuid, inEax 0x%x, pre 0x%lx 0x%lx 0x%lx 0x%lx", tid, eaxIn, *eax, *ebx, *ecx, *edx); + //Preserve high bits + *reinterpret_cast(eax) = eaxOut; + *reinterpret_cast(ebx) = ebxOut; + *reinterpret_cast(ecx) = pos->ecx; + *reinterpret_cast(edx) = pos->edx; + //info("[%d] postcpuid, inEax 0x%x, post 0x%lx 0x%lx 0x%lx 0x%lx", tid, eaxIn, *eax, *ebx, *ecx, *edx); +} + + +//RDTSC faking +VOID FakeRDTSCPost(THREADID tid, REG* eax, REG* edx) { + if (fPtrs[tid].type == FPTR_NOP) return; //avoid virtualizing NOP threads. 
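+    //Virtualized TSC = phase-start timestamp plus the cycles this core has advanced in the current phase, returned in EDX:EAX below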
+ + uint32_t cid = getCid(tid); + uint64_t curCycle = VirtGetPhaseRDTSC(); + if (cid < zinfo->numCores) { + curCycle += zinfo->cores[cid]->getPhaseCycles(); + } + + uint32_t lo = (uint32_t)curCycle; + uint32_t hi = (uint32_t)(curCycle >> 32); + + assert((((uint64_t)hi) << 32) + lo == curCycle); + + //uint64_t origTSC = (((uint64_t)*edx) << 32) + (uint32_t)*eax; + //info("[t%d/c%d] Virtualizing RDTSC, pre = %x %x (%ld), post = %x %x (%ld)", tid, cid, *edx, *eax, origTSC, hi, lo, curCycle); + + *eax = (REG)lo; + *edx = (REG)hi; +} + +/* Fast-forward control */ + +// Helper class, enabled the FFControl thread to sync with the phase end code +class SyncEvent: public Event { + private: + lock_t arrivalLock; + lock_t leaveLock; + + public: + SyncEvent() : Event(0 /*one-shot*/) { + futex_init(&arrivalLock); + futex_init(&leaveLock); + + futex_lock(&arrivalLock); + futex_lock(&leaveLock); + } + + // Blocks until callback() + void wait() { + futex_lock(&arrivalLock); + } + + // Unblocks thread that called wait(), blocks until signal() called + // Resilient against callback-wait races (wait does not block if it's + // called afteer callback) + void callback() { + futex_unlock(&arrivalLock); + futex_lock(&leaveLock); + } + + // Unblocks thread waiting in callback() + void signal() { + futex_unlock(&leaveLock); + } +}; + +VOID FFThread(VOID* arg) { + futex_lock(&zinfo->ffToggleLocks[procIdx]); //initialize + info("FF control Thread TID %ld", syscall(SYS_gettid)); + + while (true) { + //block ourselves until someone wakes us up with an unlock + bool locked = futex_trylock_nospin_timeout(&zinfo->ffToggleLocks[procIdx], 5*BILLION /*5s timeout*/); + + if (!locked) { //timeout + if (zinfo->terminationConditionMet) { + info("Terminating FF control thread"); + SimEnd(); + panic("Should not be reached"); + } + //info("FF control thread wakeup"); + continue; + } + + futex_lock(&zinfo->ffLock); + if (procTreeNode->isInFastForward()) { + GetVmLock(); //like a callback. This disallows races on all syscall instrumentation, etc. + info("Exiting fast forward"); + ExitFastForward(); + ReleaseVmLock(); + } else { + SyncEvent* syncEv = new SyncEvent(); + zinfo->eventQueue->insert(syncEv); //will run on next phase + info("Pending fast-forward entry, waiting for end of phase (%ld phases)", zinfo->numPhases); + + futex_unlock(&zinfo->ffLock); + syncEv->wait(); + //At this point the thread thet triggered the end of phase is blocked inside of EndOfPhaseActions + futex_lock(&zinfo->ffLock); + if (!procTreeNode->isInFastForward()) { + info("End of phase %ld, entering FF", zinfo->numPhases); + EnterFastForward(); + } else { + info("FF control thread called on end of phase, but someone else (program?) already entered ffwd"); + } + syncEv->signal(); //unblock thread in EndOfPhaseActions + } + futex_unlock(&zinfo->ffLock); + } + panic("Should not be reached!"); +} + + +/* Internal Exception Handler */ +//When firing a debugger was an easy affair, this was not an issue. Now it's not so easy, so let's try to at least capture the backtrace and print it out + +//Use unlocked output, who knows where this happens. 
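+//The handler below symbolizes libzsim.so frames via addr2line and always returns EHR_CONTINUE_SEARCH, so Pin's default fatal-error handling still runs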
+static EXCEPT_HANDLING_RESULT InternalExceptionHandler(THREADID tid, EXCEPTION_INFO *pExceptInfo, PHYSICAL_CONTEXT *pPhysCtxt, VOID *) { + fprintf(stderr, "%s[%d] Internal exception detected:\n", logHeader, tid); + fprintf(stderr, "%s[%d] Code: %d\n", logHeader, tid, PIN_GetExceptionCode(pExceptInfo)); + fprintf(stderr, "%s[%d] Address: 0x%lx\n", logHeader, tid, PIN_GetExceptionAddress(pExceptInfo)); + fprintf(stderr, "%s[%d] Description: %s\n", logHeader, tid, PIN_ExceptionToString(pExceptInfo).c_str()); + + ADDRINT faultyAccessAddr; + if (PIN_GetFaultyAccessAddress(pExceptInfo, &faultyAccessAddr)) { + const char* faultyAccessStr = ""; + FAULTY_ACCESS_TYPE fat = PIN_GetFaultyAccessType(pExceptInfo); + if (fat == FAULTY_ACCESS_READ) faultyAccessStr = "READ "; + else if (fat == FAULTY_ACCESS_WRITE) faultyAccessStr = "WRITE "; + else if (fat == FAULTY_ACCESS_EXECUTE) faultyAccessStr = "EXECUTE "; + + fprintf(stderr, "%s[%d] Caused by invalid %saccess to address 0x%lx\n", logHeader, tid, faultyAccessStr, faultyAccessAddr); + } + + void* array[40]; + size_t size = backtrace(array, 40); + char** strings = backtrace_symbols(array, size); + fprintf(stderr, "%s[%d] Backtrace (%ld/%d max frames)\n", logHeader, tid, size, 40); + for (uint32_t i = 0; i < size; i++) { + //For libzsim.so addresses, call addr2line to get symbol info (can't use -rdynamic on libzsim.so because of Pin's linker script) + //NOTE: May be system-dependent, may not handle malformed strings well. We're going to die anyway, so in for a penny, in for a pound... + std::string s = strings[i]; + uint32_t lp = s.find_first_of("("); + uint32_t cp = s.find_first_of(")"); + std::string fname = s.substr(0, lp); + std::string faddr = s.substr(lp+1, cp-(lp+1)); + if (fname.find("libzsim.so") != std::string::npos) { + std::string cmd = "addr2line -f -C -e " + fname + " " + faddr; + FILE* f = popen(cmd.c_str(), "r"); + if (f) { + char buf[1024]; + std::string func, loc; + func = fgets(buf, 1024, f); //first line is function name + loc = fgets(buf, 1024, f); //second is location + //Remove line breaks + func = func.substr(0, func.size()-1); + loc = loc.substr(0, loc.size()-1); + + int status = pclose(f); + if (status == 0) { + s = loc + " / " + func; + } + } + } + + fprintf(stderr, "%s[%d] %s\n", logHeader, tid, s.c_str()); + } + fflush(stderr); + + return EHR_CONTINUE_SEARCH; //we never solve anything at all :P +} + +/* ===================================================================== */ + +int main(int argc, char *argv[]) { + PIN_InitSymbols(); + if (PIN_Init(argc, argv)) return Usage(); + + //Register an internal exception handler (ASAP, to catch segfaults in init) + PIN_AddInternalExceptionHandler(InternalExceptionHandler, NULL); + + procIdx = KnobProcIdx.Value(); + char header[64]; + snprintf(header, sizeof(header), "[S %d] ", procIdx); + std::stringstream logfile_ss; + logfile_ss << KnobOutputDir.Value() << "/zsim.log." << procIdx; + InitLog(header, KnobLogToFile.Value()? logfile_ss.str().c_str() : NULL); + + //If parent dies, kill us + //This avoids leaving strays running in any circumstances, but may be too heavy-handed with arbitrary process hierarchies. + //If you ever need this disabled, sim.pinOptions = "-injection child" does the trick + if (prctl(PR_SET_PDEATHSIG, 9 /*SIGKILL*/) != 0) { + panic("prctl() failed"); + } + + info("Started instance"); + + //Decrease priority to avoid starving system processes (e.g. 
gluster) + //setpriority(PRIO_PROCESS, getpid(), 10); + //info("setpriority, new prio %d", getpriority(PRIO_PROCESS, getpid())); + + gm_attach(KnobShmid.Value()); + + bool masterProcess = false; + if (procIdx == 0 && !gm_isready()) { // process 0 can exec() without fork()ing first, so we must check gm_isready() to ensure we don't initialize twice + masterProcess = true; + SimInit(KnobConfigFile.Value().c_str(), KnobOutputDir.Value().c_str(), KnobShmid.Value()); + } else { + while (!gm_isready()) usleep(1000); // wait till proc idx 0 initializes everything + zinfo = static_cast(gm_get_glob_ptr()); + } + + //If assertion below fails, use this to print maps +#if 0 + futex_lock(&zinfo->ffLock); //whatever lock, just don't interleave + std::ifstream infile("/proc/self/maps"); + std::string line; + while (std::getline(infile, line)) info(" %s", line.c_str()); + futex_unlock(&zinfo->ffLock); + usleep(100000); +#endif + //LibzsimAddrs sanity check: Ensure that they match across processes + struct LibInfo libzsimAddrs; + getLibzsimAddrs(&libzsimAddrs); + if (memcmp(&libzsimAddrs, &zinfo->libzsimAddrs, sizeof(libzsimAddrs)) != 0) { + panic("libzsim.so address mismatch! text: %p != %p. Perform loader injection to homogenize offsets!", libzsimAddrs.textAddr, zinfo->libzsimAddrs.textAddr); + } + + //Attach to debugger if needed (master process does so in SimInit, to be able to debug initialization) + //NOTE: Pin fails to follow exec()'s when gdb is attached. The simplest way to avoid it is to kill the debugger manually before an exec(). If this is common, we could automate it + if (!masterProcess && zinfo->attachDebugger) { + notifyHarnessForDebugger(zinfo->harnessPid); + } + + assert((uint32_t)procIdx < zinfo->numProcs); + procTreeNode = zinfo->procArray[procIdx]; + if (!masterProcess) procTreeNode->notifyStart(); //masterProcess notifyStart is called in init() to avoid races + assert(procTreeNode->getProcIdx() == (uint32_t)procIdx); //must be consistent + + trace(Process, "SHM'd global segment, starting"); + + assert(zinfo->phaseLength > 0); + assert(zinfo->maxPhases >= 0); + assert(zinfo->statsPhaseInterval >= 0); + + perProcessEndFlag = 0; + + lineBits = ilog2(zinfo->lineSize); + procMask = ((uint64_t)procIdx) << (64-lineBits); + + //Initialize process-local per-thread state, even if ThreadStart does so later + for (uint32_t i = 0; i < MAX_THREADS; i++) { + fPtrs[i] = joinPtrs; + cids[i] = UNINITIALIZED_CID; + } + + info("Started process, PID %d", getpid()); //NOTE: external scripts expect this line, please do not change without checking first + + //Unless things change substantially, keep this disabled; it causes higher imbalance and doesn't solve large system time with lots of processes. 
+ //Affinity testing code + /*cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(procIdx % 8, &cpuset); + int result = sched_setaffinity(getpid(), sizeof(cpu_set_t), &cpuset); + info("Affinity result %d", result);*/ + + info("procMask: 0x%lx", procMask); + + zinfo->sched->processCleanup(procIdx); + + VirtCaptureClocks(false); + FFIInit(); + + VirtInit(); + + //Register instrumentation + TRACE_AddInstrumentFunction(Trace, 0); + VdsoInit(); //initialized vDSO patching information (e.g., where all the possible vDSO entry points are) + + PIN_AddThreadStartFunction(ThreadStart, 0); + PIN_AddThreadFiniFunction(ThreadFini, 0); + + PIN_AddSyscallEntryFunction(SyscallEnter, 0); + PIN_AddSyscallExitFunction(SyscallExit, 0); + PIN_AddContextChangeFunction(ContextChange, 0); + + PIN_AddFiniFunction(Fini, 0); + + //Follow exec and fork + PIN_AddFollowChildProcessFunction(FollowChild, 0); + PIN_AddForkFunction(FPOINT_BEFORE, BeforeFork, 0); + PIN_AddForkFunction(FPOINT_AFTER_IN_PARENT, AfterForkInParent, 0); + PIN_AddForkFunction(FPOINT_AFTER_IN_CHILD, AfterForkInChild, 0); + + //FFwd control + //OK, screw it. Launch this on a separate thread, and forget about signals... the caller will set a shared memory var. PIN is hopeless with signal instrumentation on multithreaded processes! + PIN_SpawnInternalThread(FFThread, NULL, 64*1024, NULL); + + //Never returns + PIN_StartProgram(); + return 0; +} + diff --git a/src/zsim.h b/src/zsim.h new file mode 100644 index 00000000..985dc90a --- /dev/null +++ b/src/zsim.h @@ -0,0 +1,189 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef ZSIM_H_ +#define ZSIM_H_ + +#include +#include +#include "constants.h" +#include "debug.h" +#include "locks.h" +#include "pad.h" + +class Core; +class Scheduler; +class AggregateStat; +class StatsBackend; +class ProcessTreeNode; +class ProcessStats; +class EventQueue; +class ContentionSim; +class EventRecorder; +class PinCmd; +class PortVirtualizer; +class VectorCounter; + +struct ClockDomainInfo { + uint64_t realtimeOffsetNs; + uint64_t monotonicOffsetNs; + uint64_t processOffsetNs; + uint64_t rdtscOffset; + lock_t lock; +}; + +class TimeBreakdownStat; +enum ProfileStates { + PROF_INIT = 0, + PROF_BOUND = 1, + PROF_WEAVE = 2, + PROF_FF = 3, +}; + +enum ProcExitStatus { + PROC_RUNNING = 0, + PROC_EXITED = 1, + PROC_RESTARTME = 2 +}; + +struct GlobSimInfo { + //System configuration values, all read-only, set at initialization + uint32_t numCores; + uint32_t lineSize; + + //Cores + Core** cores; + + PAD(); + + EventQueue* eventQueue; + Scheduler* sched; + + //Contention simulation + uint32_t numDomains; + ContentionSim* contentionSim; + EventRecorder** eventRecorders; //CID->EventRecorder* array + + PAD(); + + //World-readable + uint32_t phaseLength; + uint32_t statsPhaseInterval; + uint32_t freqMHz; + + //Maxima/termination conditions + uint64_t maxPhases; //terminate when this many phases have been reached + uint64_t maxMinInstrs; //terminate when all threads have reached this many instructions + uint64_t maxTotalInstrs; //terminate when the aggregate number of instructions reaches this number + uint64_t maxSimTimeNs; //terminate when the simulation time (bound+weave) exceeds this many ns + uint64_t maxProcEventualDumps; //term if the number of heartbeat-triggered process dumps reached this (MP/MT) + + bool ignoreHooks; + bool blockingSyscalls; + bool perProcessCpuEnum; //if true, cpus are enumerated according to per-process masks (e.g., a 16-core mask in a 64-core sim sees 16 cores) + bool oooDecode; //if true, Decoder does OOO (instr->uop) decoding + + PAD(); + + //Writable, rarely read, unshared in a single phase + uint64_t numPhases; + uint64_t globPhaseCycles; //just numPhases*phaseCycles. It behooves us to precompute it, since it is very frequently used in tracing code. + + uint64_t procEventualDumps; + + PAD(); + + ClockDomainInfo clockDomainInfo[MAX_CLOCK_DOMAINS]; + PortVirtualizer* portVirt[MAX_PORT_DOMAINS]; + + lock_t ffLock; //global, grabbed in all ff entry/exit ops. + + volatile uint32_t globalActiveProcs; //used for termination + //Counters below are used for deadlock detection + volatile uint32_t globalSyncedFFProcs; //count of processes that are in synced FF + volatile uint32_t globalFFProcs; //count of processes that are in either synced or unsynced FF + + volatile bool terminationConditionMet; + + const char* outputDir; //all the output files mst be dumped here. Stored because complex workloads often change dir, then spawn... 
+ + AggregateStat* rootStat; + StatsBackend* periodicStatsBackend; + StatsBackend* statsBackend; //end-of-sim backend + StatsBackend* eventualStatsBackend; + StatsBackend* compactStatsBackend; + ProcessStats* processStats; + + TimeBreakdownStat* profSimTime; + VectorCounter* profHeartbeats; //global b/c number of processes cannot be inferred at init time; we just size to max + + uint64_t trigger; //code with what triggered the current stats dump + + ProcessTreeNode* procTree; + ProcessTreeNode** procArray; //a flat view of the process tree, where each process is indexed by procIdx + ProcExitStatus* procExited; //starts with all set to PROC_RUNNING, each process sets to PROC_EXITED or PROC_RESTARTME on exit. Used to detect untimely deaths (that don;t go thropugh SimEnd) in the harness and abort. + uint32_t numProcs; + uint32_t numProcGroups; + + PinCmd* pinCmd; //enables calls to exec() to modify Pin's calling arguments, see zsim.cpp + + // If true, threads start as shadow and have no effect on simulation until they call the register magic op + bool registerThreads; + + //If true, do not output vectors in stats -- they're bulky and we barely need them + bool skipStatsVectors; + + //If true, all the regular aggregate stats are summed before dumped, e.g. getting one thread record with instrs&cycles for all the threads + bool compactPeriodicStats; + + bool attachDebugger; + int harnessPid; //used for debugging purposes + + struct LibInfo libzsimAddrs; + + bool ffReinstrument; //true if we should reinstrument on ffwd, works fine with ST apps and it's faster since we run with basically no instrumentation, but it's not precise with MT apps + + //fftoggle stuff + lock_t ffToggleLocks[256]; //f*ing Pin and its f*ing inability to handle external signals... + lock_t pauseLocks[256]; //per-process pauses + volatile bool globalPauseFlag; //if set, pauses simulation on phase end + volatile bool externalTermPending; +}; + + +//Process-wide global variables, defined in zsim.cpp +extern Core* cores[MAX_THREADS]; //tid->core array +extern uint32_t procIdx; +extern uint32_t lineBits; //process-local for performance, but logically global +extern uint64_t procMask; + +extern GlobSimInfo* zinfo; + +//Process-wide functions, defined in zsim.cpp +uint32_t getCid(uint32_t tid); +uint32_t TakeBarrier(uint32_t tid, uint32_t cid); +void SimEnd(); //only call point out of zsim.cpp should be watchdog threads + +#endif // ZSIM_H_ diff --git a/src/zsim_harness.cpp b/src/zsim_harness.cpp new file mode 100644 index 00000000..78844fc1 --- /dev/null +++ b/src/zsim_harness.cpp @@ -0,0 +1,476 @@ +/** $lic$ + * Copyright (C) 2012-2014 by Massachusetts Institute of Technology + * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University + * + * This file is part of zsim. + * + * zsim is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation, version 2. + * + * If you use this software in your research, we request that you reference + * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of + * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the + * source of the simulator in any publications that use this software, and that + * you send us a citation of your work. + * + * zsim is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +/* ZSim master process. Handles global heap creation, configuration, launching + * slave pin processes, coordinating and terminating runs, and stats printing. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "config.h" +#include "constants.h" +#include "debug_harness.h" +#include "galloc.h" +#include "log.h" +#include "pin_cmd.h" +#include "version.h" //autogenerated, in build dir, see SConstruct +#include "zsim.h" + +/* Globals */ + +typedef enum { + OK, + GRACEFUL_TERMINATION, + KILL_EM_ALL, +} TerminationStatus; + +TerminationStatus termStatus = OK; + +typedef enum { + PS_INVALID, + PS_RUNNING, + PS_DONE, +} ProcStatus; + +struct ProcInfo { + int pid; + volatile ProcStatus status; +}; + +//At most as many processes as threads, plus one extra process per child if we launch a debugger +#define MAX_CHILDREN (2*MAX_THREADS) +ProcInfo childInfo[MAX_CHILDREN]; + +volatile uint32_t debuggerChildIdx = MAX_THREADS; + +GlobSimInfo* globzinfo = NULL; //used very sparingly, only in sig handlers. Should probably promote to a global like in zsim processes. + +bool perProcessDir, aslr; + +PinCmd* pinCmd; + +/* Defs & helper functions */ + +void LaunchProcess(uint32_t procIdx); + +int getNumChildren() { + int num = 0; + for (int i = 0; i < MAX_CHILDREN; i++) { + if (childInfo[i].status == PS_RUNNING) num++; + } + return num; +} + +int eraseChild(int pid) { + for (int i = 0; i < MAX_CHILDREN; i++) { + if (childInfo[i].pid == pid) { + assert_msg(childInfo[i].status == PS_RUNNING, "i=%d pid=%d status=%d", i, pid, childInfo[i].status); + childInfo[i].status = PS_DONE; + return i; + } + } + panic("Could not erase child!!"); +} + +/* Signal handlers */ + +void chldSigHandler(int sig) { + assert(sig == SIGCHLD); + int status; + int cpid = waitpid(-1, &status, WNOHANG); + assert_msg(cpid > 0, "Wait should not fail, cpid=%d", cpid); + int idx = eraseChild(cpid); + if (idx < MAX_THREADS) { + info("Child %d done", cpid); + int exitCode = WIFEXITED(status)? WEXITSTATUS(status) : 0; + if (exitCode == PANIC_EXIT_CODE) { + panic("Child issued a panic, killing simulation"); + } + //Stricter check: See if notifyEnd was called (i.e. zsim caught this termination) + //Only works for direct children though + if (globzinfo && !globzinfo->procExited[idx]) { + panic("Child %d (idx %d) exit was anomalous, killing simulation", cpid, idx); + } + + if (globzinfo && globzinfo->procExited[idx] == PROC_RESTARTME) { + info("Restarting procIdx %d", idx); + globzinfo->procExited[idx] = PROC_RUNNING; + LaunchProcess(idx); + } + } else { + info("Child %d done (debugger)", cpid); + } +} + +void sigHandler(int sig) { + if (termStatus == KILL_EM_ALL) return; //a kill was already issued, avoid infinite recursion + + switch (sig) { + case SIGSEGV: + warn("Segmentation fault"); + termStatus = KILL_EM_ALL; + break; + case SIGINT: + info("Received interrupt"); + termStatus = (termStatus == OK)? 
GRACEFUL_TERMINATION : KILL_EM_ALL; + break; + case SIGTERM: + info("Received SIGTERM"); + termStatus = KILL_EM_ALL; + break; + default: + warn("Received signal %d", sig); + termStatus = KILL_EM_ALL; + } + + if (termStatus == KILL_EM_ALL) { + warn("Hard death, killing the whole process tree"); + kill(-getpid(), SIGKILL); + //Exit, we have already killed everything, there should be no strays + panic("SIGKILLs sent -- exiting"); + } else { + info("Attempting graceful termination"); + for (int i = 0; i < MAX_CHILDREN; i++) { + int cpid = childInfo[i].pid; + if (childInfo[i].status == PS_RUNNING) { + info("Killing process %d", cpid); + kill(-cpid, SIGKILL); + sleep(0.1); + kill(cpid, SIGKILL); + } + } + + info("Done sending kill signals"); + } +} + +void exitHandler() { + // If for some reason we still have children, kill everything + uint32_t children = getNumChildren(); + if (children) { + warn("Hard death at exit (%d children running), killing the whole process tree", children); + kill(-getpid(), SIGKILL); + } +} + +void debugSigHandler(int signum, siginfo_t* siginfo, void* dummy) { + assert(signum == SIGUSR1); + uint32_t callerPid = siginfo->si_pid; + // Child better have this initialized... + struct LibInfo* zsimAddrs = (struct LibInfo*) gm_get_secondary_ptr(); + uint32_t debuggerPid = launchXtermDebugger(callerPid, zsimAddrs); + childInfo[debuggerChildIdx].pid = debuggerPid; + childInfo[debuggerChildIdx++].status = PS_RUNNING; +} + +/* Heartbeats */ + +static time_t startTime; +static time_t lastHeartbeatTime; +static uint64_t lastCycles = 0; + +static void printHeartbeat(GlobSimInfo* zinfo) { + uint64_t cycles = zinfo->numPhases*zinfo->phaseLength; + time_t curTime = time(NULL); + time_t elapsedSecs = curTime - startTime; + time_t heartbeatSecs = curTime - lastHeartbeatTime; + + if (elapsedSecs == 0) return; + if (heartbeatSecs == 0) return; + + char time[128]; + char hostname[256]; + gethostname(hostname, 256); + + std::ofstream hb("heartbeat"); + hb << "Running on: " << hostname << std::endl; + hb << "Start time: " << ctime_r(&startTime, time); + hb << "Heartbeat time: " << ctime_r(&curTime, time); + hb << "Stats since start:" << std:: endl; + hb << " " << zinfo->numPhases << " phases" << std::endl; + hb << " " << cycles << " cycles" << std::endl; + hb << " " << (cycles)/elapsedSecs << " cycles/s" << std::endl; + hb << "Stats since last heartbeat (" << heartbeatSecs << "s):" << std:: endl; + hb << " " << (cycles-lastCycles)/heartbeatSecs << " cycles/s" << std::endl; + + lastHeartbeatTime = curTime; + lastCycles = cycles; +} + + +void LaunchProcess(uint32_t procIdx) { + int cpid = fork(); + if (cpid) { //parent + assert(cpid > 0); + childInfo[procIdx].pid = cpid; + childInfo[procIdx].status = PS_RUNNING; + } else { //child + // Set the child's vars and get the command + // NOTE: We set the vars first so that, when parsing the command, wordexp takes those vars into account + pinCmd->setEnvVars(procIdx); + const char* inputFile; + g_vector args = pinCmd->getFullCmdArgs(procIdx, &inputFile); + + //Copy args to a const char* [] for exec + int nargs = args.size()+1; + const char* aptrs[nargs]; + + trace(Harness, "Calling arguments:"); + for (unsigned int i = 0; i < args.size(); i++) { + trace(Harness, " arg%d = %s", i, args[i].c_str()); + aptrs[i] = args[i].c_str(); + } + aptrs[nargs-1] = NULL; + + //Chdir to process dir if needed + if (perProcessDir) { + std::stringstream dir_ss; + dir_ss << "p" << procIdx << "/"; + int res = chdir(dir_ss.str().c_str()); + if (res == -1) { + perror("Coud 
not chdir"); + panic("chdir to %s failed", dir_ss.str().c_str()); + } + } + + //Input redirection if needed + if (inputFile) { + int fd = open(inputFile, O_RDONLY); + if (fd == -1) { + perror("open() failed"); + panic("Could not open input redirection file %s", inputFile); + } + dup2(fd, 0); + } + + /* In a modern kernel, we must disable address space randomization. Otherwise, + * different zsim processes will load zsim.so on different addresses, + * which would be fine except that the vtable pointers will be different + * per process, and virtual functions will not work. + * + * WARNING: The harness itself is run with randomization on, which should + * be fine because it doesn't load zsim.so anyway. If this changes at some + * point, we'll need to have the harness be executed via a wrapper that just + * changes the personalily and forks, or run the harness with setarch -R + */ + if (!aslr) { + //Get old personality flags & update + int pers = personality(((unsigned int)-1) /*returns current pers flags; arg is a long, hence the cast, see man*/); + if (pers == -1 || personality(pers | ADDR_NO_RANDOMIZE) == -1) { + perror("personality() call failed"); + panic("Could not change personality to disable address space randomization!"); + } + int newPers = personality(((unsigned int)-1)); + if ((newPers & ADDR_NO_RANDOMIZE) == 0) panic("personality() call was not honored! old 0x%x new 0x%x", pers, newPers); + } + + if (execvp(aptrs[0], (char* const*)aptrs) == -1) { + perror("Could not exec, killing child"); + panic("Could not exec %s", aptrs[0]); + } else { + panic("Something is SERIOUSLY wrong. This should never execute!"); + } + } +} + + +int main(int argc, char *argv[]) { + if (argc == 2 && std::string(argv[1]) == "-v") { + printf("%s\n", ZSIM_BUILDVERSION); + exit(0); + } + + InitLog("[H] ", NULL /*log to stdout/err*/); + info("Starting zsim, built %s (rev %s)", ZSIM_BUILDDATE, ZSIM_BUILDVERSION); + startTime = time(NULL); + + if (argc != 2) { + info("Usage: %s config_file", argv[0]); + exit(1); + } + + //Canonicalize paths --- because we change dirs, we deal in absolute paths + const char* configFile = realpath(argv[1], NULL); + const char* outputDir = getcwd(NULL, 0); //already absolute + + Config conf(configFile); + + if (atexit(exitHandler)) panic("Could not register exit handler"); + + signal(SIGSEGV, sigHandler); + signal(SIGINT, sigHandler); + signal(SIGABRT, sigHandler); + signal(SIGTERM, sigHandler); + + signal(SIGCHLD, chldSigHandler); + + //SIGUSR1 is used by children processes when they want to get a debugger session started; + struct sigaction debugSa; + debugSa.sa_flags = SA_SIGINFO; + sigemptyset(&debugSa.sa_mask); //NOTE: We might want to start using sigfullsets in other signal handlers to avoid races... + debugSa.sa_sigaction = debugSigHandler; + if (sigaction(SIGUSR1, &debugSa, NULL) != 0) + panic("sigaction() failed"); + + waitid(P_ALL, 0, NULL, WEXITED); + + //Remove all zsim.log.* files (we append to them, and want to avoid outputs from multiple simulations) + uint32_t removedLogfiles = 0; + while (true) { + std::stringstream ss; + ss << "zsim.log." 
<< removedLogfiles; + if (remove(ss.str().c_str()) != 0) break; + removedLogfiles++; + } + if (removedLogfiles) info("Removed %d old logfiles", removedLogfiles); + + uint32_t gmSize = conf.get("sim.gmMBytes", (1<<10) /*default 1024MB*/); + info("Creating global segment, %d MBs", gmSize); + int shmid = gm_init(((size_t)gmSize) << 20 /*MB to Bytes*/); + info("Global segment shmid = %d", shmid); + //fprintf(stderr, "%sGlobal segment shmid = %d\n", logHeader, shmid); //hack to print shmid on both streams + //fflush(stderr); + + trace(Harness, "Created global segment, starting pin processes, shmid = %d", shmid); + + //Do we need per-process direcories? + perProcessDir = conf.get("sim.perProcessDir", false); + + if (perProcessDir) { + info("Running each process in a different subdirectory"); //p0, p1, ... + } + + bool deadlockDetection; + bool attachDebugger = conf.get("sim.attachDebugger", false); + + if (attachDebugger) { + info("Pausing PIN to attach debugger, and not running deadlock detection"); + deadlockDetection = false; + } else { + deadlockDetection = conf.get("sim.deadlockDetection", true); + } + + info("Deadlock detection %s", deadlockDetection? "ON" : "OFF"); + + aslr = conf.get("sim.aslr", false); + if (aslr) info("Not disabling ASLR, multiprocess runs will fail"); + + //Create children processes + pinCmd = new PinCmd(&conf, configFile, outputDir, shmid); + uint32_t numProcs = pinCmd->getNumCmdProcs(); + + for (uint32_t procIdx = 0; procIdx < numProcs; procIdx++) { + LaunchProcess(procIdx); + } + + if (numProcs == 0) panic("No process config found. Config file needs at least a process0 entry"); + + //Wait for all processes to finish + int sleepLength = 10; + GlobSimInfo* zinfo = NULL; + int32_t secsStalled = 0; + + int64_t lastNumPhases = 0; + + while (getNumChildren() > 0) { + if (!gm_isready()) { + sched_yield(); //wait till proc idx 0 initializes everyhting + continue; + } + + if (zinfo == NULL) { + zinfo = static_cast(gm_get_glob_ptr()); + globzinfo = zinfo; + info("Attached to global heap"); + } + + printHeartbeat(zinfo); //ensure we dump hostname etc on early crashes + + int left = sleep(sleepLength); + int secsSlept = sleepLength - left; + //info("Waking up, secs elapsed %d", secsSlept); + + __sync_synchronize(); + + uint32_t activeProcs = zinfo->globalActiveProcs; + uint32_t ffProcs = zinfo->globalFFProcs; + uint32_t sffProcs = zinfo->globalSyncedFFProcs; + bool simShouldAdvance = (ffProcs < activeProcs) && (sffProcs == 0); + + int64_t numPhases = zinfo->numPhases; + + if (deadlockDetection) { + if (simShouldAdvance) { + //info("In deadlock check zone"); + if (numPhases <= lastNumPhases) { + secsStalled += secsSlept; + if (secsStalled > 10) warn("Stalled for %d secs so far", secsStalled); + } else { + //info("Not stalled, did %ld phases since last check", numPhases-lastNumPhases); + lastNumPhases = numPhases; + secsStalled = 0; + } + } else if (activeProcs) { + if (numPhases == lastNumPhases) info("Some fast-forwarding is going on, not doing deadlock detection (a: %d, ff: %d, sff: %d)", activeProcs, ffProcs, sffProcs); + lastNumPhases = numPhases; + } //otherwise, activeProcs == 0; we're done + } + + printHeartbeat(zinfo); + + //This solves a weird race in multiprocess where SIGCHLD does not always fire... 
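+        //Non-blocking reap: with WNOHANG, waitpid() returns the pid of a child that has
+        //already exited, or 0/-1 once there is nothing left to collect, so this loop
+        //drains any missed exits without stalling the watchdog.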
+ int cpid = -1; + while ((cpid = waitpid(-1, NULL, WNOHANG)) > 0) { + eraseChild(cpid); + info("Child %d done (in-loop catch)", cpid); + } + + if (secsStalled > 120) { + warn("Deadlock detected, killing children"); + sigHandler(SIGINT); + exit(42); + } + } + + uint32_t exitCode = 0; + if (termStatus == OK) { + info("All children done, exiting"); + } else { + info("Graceful termination finished, exiting"); + exitCode = 1; + } + if (zinfo && zinfo->globalActiveProcs) warn("Unclean exit of %d children, termination stats were most likely not dumped", zinfo->globalActiveProcs); + exit(exitCode); +} + diff --git a/tests/het.cfg b/tests/het.cfg new file mode 100755 index 00000000..8b587c9e --- /dev/null +++ b/tests/het.cfg @@ -0,0 +1,117 @@ +// This system is similar to a 6-core, 2.4GHz Westmere with 10 Niagara-like cores attached to the L3 +sys = { + lineSize = 64; + frequency = 2400; + + cores = { + beefy = { + type = "OOO"; + cores = 6; + icache = "l1i_beefy"; + dcache = "l1d_beefy"; + }; + + wimpy = { + type = "Simple"; + cores = 10; + icache = "l1i_wimpy"; + dcache = "l1d_wimpy"; + }; + }; + + caches = { + l1d_beefy = { + caches = 6; + size = 32768; + array = { + type = "SetAssoc"; + ways = 8; + }; + latency = 4; + parent = "l2_beefy"; + }; + + l1i_beefy = { + caches = 6; + size = 32768; + array = { + type = "SetAssoc"; + ways = 4; + }; + latency = 3; + parent = "l2_beefy"; + }; + + l2_beefy = { + caches = 6; + size = 262144; + latency = 7; + array = { + type = "SetAssoc"; + ways = 8; + }; + parent = "l3"; + }; + + + l1d_wimpy = { + caches = 10; + size = 8192; + latency = 2; + array = { + type = "SetAssoc"; + ways = 4; + }; + parent = "l3"; + }; + + l1i_wimpy = { + caches = 10; + size = 16384; + latency = 3; + array = { + type = "SetAssoc"; + ways = 8; + }; + parent = "l3"; + }; + + + l3 = { + caches = 1; + banks = 6; + size = 12582912; + latency = 27; + + array = { + type = "SetAssoc"; + hash = "H3"; + ways = 16; + }; + parent = "mem"; + }; + }; + + mem = { + type = "DDR"; + controllers = 4; + tech = "DDR3-1066-CL8"; + }; +}; + +sim = { + phaseLength = 10000; + maxTotalInstrs = 5000000000L; + statsPhaseInterval = 1000; + // attachDebugger = True; +}; + +process0 = { + command = "$ZSIMAPPSPATH/build/speccpu2006/401.bzip2/401.bzip2 $ZSIMAPPSPATH/inputs/speccpu2006/401.bzip2/ref/input.source 64"; +}; + +process1 = { + command = "$ZSIMAPPSPATH/build/parsec/blackscholes/blackscholes 15 2000000"; + startFastForwarded = True; +}; + diff --git a/tests/hooks.cfg b/tests/hooks.cfg new file mode 100755 index 00000000..47fd4660 --- /dev/null +++ b/tests/hooks.cfg @@ -0,0 +1,59 @@ +// Tests zsim hooks for different programming languages To build tests, run +// make -j -C misc/hooks You can run the real tests with make -C misc/hooks +// run_tests This uses logToFile because some JVMs need transparency (they fork +// and use pipes, and writing to stdout/stderr breaks those pipes) + +sys = { + cores = { + c = { + type = "Simple"; + dcache = "l1d"; + icache = "l1i"; + }; + }; + + lineSize = 64; + + caches = { + l1d = { + size = 65536; + parent = "l2"; + }; + l1i = { + size = 32768; + parent = "l2"; + }; + l2 = { + size = 2097152; + parent = "mem"; + }; + }; +}; + +sim = { + logToFile = true; +}; + +process0 = { + command = "./misc/hooks/test_c"; + startFastForwarded = True; + syncedFastForward = False; +}; + +process1 = { + command = "./misc/hooks/test_cpp"; + startFastForwarded = True; + syncedFastForward = False; +}; + +process2 = { + command = "./misc/hooks/test_fortran"; + startFastForwarded = True; + 
syncedFastForward = False; +}; + +process3 = { + command = "java -cp ./misc/hooks -Djava.library.path=./misc/hooks test"; + startFastForwarded = True; + syncedFastForward = False; +}; diff --git a/tests/pgo.cfg b/tests/pgo.cfg new file mode 100644 index 00000000..935ba842 --- /dev/null +++ b/tests/pgo.cfg @@ -0,0 +1,80 @@ +// Used for the PGO compile flow +// based on zephyr3 L5640@2.27GHz + +process0 = { + command = "$ZSIMAPPSPATH/build/speccpu2006/447.dealII/447.dealII 23"; +}; + +sim = { + maxTotalInstrs = 100000000L; + phaseLength = 10000; + statsPhaseInterval = 0; +}; + +sys = { + caches = { + l1d = { + array = { + type = "SetAssoc"; + ways = 8; + }; + caches = 1; + latency = 4; + parent = "l2"; + size = 32768; + }; + + l1i = { + array = { + type = "SetAssoc"; + ways = 4; + }; + caches = 1; + latency = 3; + parent = "l2"; + size = 32768; + }; + + l2 = { + array = { + type = "SetAssoc"; + ways = 8; + }; + caches = 1; + latency = 7; + parent = "l3"; + size = 262144; + }; + + l3 = { + array = { + hash = "H3"; + type = "SetAssoc"; + ways = 16; + }; + banks = 6; + caches = 1; + latency = 27; + parent = "mem"; + size = 12582912; + }; + }; + + cores = { + westmere = { + cores = 1; + dcache = "l1d"; + icache = "l1i"; + type = "OOO"; + }; + }; + + frequency = 2270; + lineSize = 64; + mem = { + controllers = 3; + type = "DDR"; + controllerLatency = 40; + }; +}; + diff --git a/tests/ptree.cfg b/tests/ptree.cfg new file mode 100755 index 00000000..6a7e5c16 --- /dev/null +++ b/tests/ptree.cfg @@ -0,0 +1,61 @@ +// Test nested processes + +sys = { + cores = { + nehalem = { + type = "OOO"; + cores = 2; + //icache and dcache can be the same group, but are still split. + //Even ones are i (l1-0 and 2), odd are d (l1-1 and 3) + icache = "l1"; + dcache = "l1"; + }; + }; + + caches = { + l1 = { + size = 65536; + caches = 4; + parent = "l2"; + }; + + l2 = { + caches = 1; + size = 2097152; + array = { + ways = 16; + hash = "H3"; + }; + parent = "mem"; + }; + }; +}; + +sim = { + phaseLength = 10000; + //attachDebugger = True; //this would be madness :) +}; + +// NOTE: This is useful when you spawn multiple processes that create other +// processes --- they are assigned in tree order. However, at this point if you +// don't specify any subprocesses, the simulation will work (but process ids +// are assigned in FCFS order, which is non-deterministic, and the children will +// inherit the parent's configuration). 
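+// In the tree below, the outer bash is process0; its pipeline spawns a nested bash (which
+// in turn runs /bin/echo, cat, sleep, and a builtin echo) plus two more cats. Listing each
+// child explicitly keeps the procIdx assignment deterministic.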
+process0 = { //bash + command = "bash -c \"bash -c '/bin/echo Foo | cat; sleep 2 ; echo Bar' | cat | cat \""; + process0 = { //bash + process0 = { // /bin/echo + }; + process1 = { //cat + }; + process2 = { //sleep + }; + process3 = { //echo (actually, when you call echo in bash, it's just a fork, it doesn't exec echo) + }; + }; + process1 = { //cat + }; + process2 = { //cat + }; +}; + diff --git a/tests/simple.cfg b/tests/simple.cfg new file mode 100755 index 00000000..343277c8 --- /dev/null +++ b/tests/simple.cfg @@ -0,0 +1,45 @@ +// As simple as it gets: 1-core system with 2 short processes + +sys = { + cores = { + simpleCore = { + type = "Simple"; + dcache = "l1d"; + icache = "l1i"; + }; + }; + + lineSize = 64; + + caches = { + l1d = { + size = 65536; + parent = "l2"; + }; + l1i = { + size = 32768; + parent = "l2"; + }; + l2 = { + caches = 1; + size = 2097152; + parent = "mem"; + }; + }; +}; + +sim = { + phaseLength = 10000; + // attachDebugger = True; + schedQuantum = 50; // switch threads frequently +}; + +process0 = { + command = "ls -alh --color tests/"; +}; + + +process1 = { + command = "cat tests/simple.cfg"; +}; +
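The test configs above exercise only a few of the sim-level options that zsim_harness.cpp reads with conf.get(). The sketch below is not a file in the repo and is untested: it shows the sim and process blocks of a hypothetical variant of simple.cfg that sets the harness options discussed earlier (gmMBytes, perProcessDir, deadlockDetection, aslr). The key names come from the conf.get() calls and the other test configs; the values are illustrative, and the sys block is omitted because it would be identical to simple.cfg.

    // Hypothetical sketch (untested); reuse the sys = { ... } block from simple.cfg.
    sim = {
        phaseLength = 10000;
        gmMBytes = 1024;           // global segment size in MB, passed to gm_init()
        perProcessDir = true;      // each child chdirs to p0/, p1/, ... before exec'ing
        deadlockDetection = true;  // harness warns after ~10s without progress, kills after ~120s
        aslr = false;              // default; leaving ASLR on makes multiprocess runs fail
    };

    process0 = {
        command = "ls -alh --color tests/";
    };

    process1 = {
        command = "cat tests/simple.cfg";
        startFastForwarded = True; // as in het.cfg: this process begins in fast-forward
    };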