diff --git a/.clang-format b/.clang-format
index 4b3f13fa55..e7d00feaa0 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,5 +1,5 @@
 ---
 Language:        Cpp
 BasedOnStyle:  Google
+PointerAlignment: Left
 ...
-
diff --git a/.github/.libcxx-setup.sh b/.github/.libcxx-setup.sh
new file mode 100755
index 0000000000..56008403ae
--- /dev/null
+++ b/.github/.libcxx-setup.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Checkout LLVM sources
+git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
+
+# Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+cd ./llvm-project
+cmake -DCMAKE_C_COMPILER=${C_COMPILER}          \
+      -DCMAKE_CXX_COMPILER=${COMPILER}          \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
+      -DCMAKE_INSTALL_PREFIX=/usr               \
+      -DLIBCXX_ABI_UNSTABLE=OFF                 \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER}  \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS}     \
+      -DLLVM_ENABLE_PROJECTS='libcxx;libcxxabi' \
+      -S llvm -B llvm-build -G "Unix Makefiles"
+make -C llvm-build -j3 cxx cxxabi
+sudo make -C llvm-build install-cxx install-cxxabi
+cd ..
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000..6c2ced9b2e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**System**
+Which OS, compiler, and compiler version are you using:
+  - OS: 
+  - Compiler and version: 
+
+**To reproduce**
+Steps to reproduce the behavior:
+1. sync to commit ...
+2. cmake/bazel...
+3. make ...
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000..9e8ab6a673
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FR]"
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml
new file mode 100644
index 0000000000..a53661b2f9
--- /dev/null
+++ b/.github/workflows/bazel.yml
@@ -0,0 +1,30 @@
+name: bazel
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    
+    steps:
+    - uses: actions/checkout@v1
+
+    - name: mount bazel cache
+      uses: actions/cache@v2.0.0
+      env:
+        cache-name: bazel-cache
+      with:
+        path: "~/.cache/bazel"
+        key: ${{ env.cache-name }}-${{ runner.os }}-${{ github.ref }}
+        restore-keys: |
+          ${{ env.cache-name }}-${{ runner.os }}-main
+
+    - name: build
+      run: |
+        bazel build //:benchmark //:benchmark_main //test/...
+
+    - name: test
+      run: |
+        bazel test --test_output=all //test/...
diff --git a/.github/workflows/build-and-test-perfcounters.yml b/.github/workflows/build-and-test-perfcounters.yml
new file mode 100644
index 0000000000..b2b5419197
--- /dev/null
+++ b/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,44 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: install libpfm
+      run: sudo apt install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake  -DBENCHMARK_ENABLE_LIBPFM=1 -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: sudo ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
new file mode 100644
index 0000000000..9e5be3b1dc
--- /dev/null
+++ b/.github/workflows/build-and-test.yml
@@ -0,0 +1,110 @@
+name: build-and-test
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  # TODO: add 32-bit builds (g++ and clang++) for ubuntu
+  #   (requires g++-multilib and libc6:i386)
+  # TODO: add coverage build (requires lcov)
+  # TODO: add clang + libc++ builds for ubuntu
+  # TODO: add clang + ubsan/asan/msan + libc++ builds for ubuntu
+  job:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.compiler }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04, macos-latest]
+        build_type: ['Release', 'Debug']
+        compiler: [g++, clang++]
+        include:
+          - displayTargetName: windows-latest-release
+            os: windows-latest
+            build_type: 'Release'
+          - displayTargetName: windows-latest-debug
+            os: windows-latest
+            build_type: 'Debug'
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build . --config ${{ matrix.build_type }}
+
+      - name: test
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV
+
+  ubuntu-14_04:
+    name: ubuntu-14.04.${{ matrix.build_type }}.${{ matrix.compiler }}
+    runs-on: [ubuntu-latest]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Release', 'Debug']
+        compiler: [g++-4.8, clang++-3.6]
+        include:
+          - compiler: g++-6
+            build_type: 'Debug'
+            run_tests: true
+          - compiler: g++-6
+            build_type: 'Release'
+            run_tests: true
+    container: ubuntu:14.04
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: install required bits
+        run: |
+          sudo apt update
+          sudo apt -y install clang-3.6 cmake3 g++-4.8 git
+
+      - name: install other bits
+        if: ${{ matrix.compiler }} == g++-6
+        run: |
+          sudo apt -y install software-properties-common
+          sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test"
+          sudo apt update
+          sudo apt -y install g++-6
+
+      - name: create build environment
+        run: cmake -E make_directory $GITHUB_WORKSPACE/_build
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ github.workspace }}/_build
+        run: >
+          cmake $GITHUB_WORKSPACE
+          -DBENCHMARK_ENABLE_TESTING=${{ matrix.run_tests }}
+          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=${{ matrix.run_tests }}
+
+      - name: build
+        shell: bash
+        working-directory: ${{ github.workspace }}/_build
+        run: cmake --build . --config ${{ matrix.build_type }}
+
+      - name: test
+        if: ${{ matrix.run_tests }}
+        shell: bash
+        working-directory: ${{ github.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000000..0f73a58232
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,26 @@
+name: pylint
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  pylint:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint pylint-exit conan
+    - name: Run pylint
+      run: |
+        pylint `find . -name '*.py'|xargs` || pylint-exit $?
diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml
new file mode 100644
index 0000000000..fbc984492d
--- /dev/null
+++ b/.github/workflows/sanitizer.yml
@@ -0,0 +1,78 @@
+name: sanitizer
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  CC: clang
+  CXX: clang++
+  EXTRA_CXX_FLAGS: "-stdlib=libc++"
+  UBSAN_OPTIONS: "print_stacktrace=1"
+
+jobs:
+  job:
+    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Debug', 'RelWithDebInfo']
+        sanitizer: ['asan', 'ubsan', 'tsan']
+        # TODO: add 'msan' above. currently failing and needs investigation.
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: configure msan env
+      if: matrix.sanitizer == 'msan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
+
+    - name: configure ubsan env
+      if: matrix.sanitizer == 'ubsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
+
+    - name: configure asan env
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
+
+    - name: configure tsan env
+      if: matrix.sanitizer == 'tsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
+
+    - name: install llvm stuff
+      run: "${GITHUB_WORKSPACE}/.github/.libcxx-setup.sh"
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=${{ env.CC }}
+        -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+        -DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
+        -DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    - name: test
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: ctest -C ${{ matrix.build_type }} -VV
diff --git a/.github/workflows/test_bindings.yml b/.github/workflows/test_bindings.yml
new file mode 100644
index 0000000000..4a580ebe04
--- /dev/null
+++ b/.github/workflows/test_bindings.yml
@@ -0,0 +1,24 @@
+name: test-bindings
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  python_bindings:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+      - name: Install benchmark
+        run:
+          python setup.py install
+      - name: Run example bindings
+        run:
+          python bindings/python/google_benchmark/example.py
diff --git a/.gitignore b/.gitignore
index 93d5ced634..be55d774e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,9 @@
 *.dylib
 *.cmake
 !/cmake/*.cmake
+!/test/AssemblyTests.cmake
 *~
+*.swp
 *.pyc
 __pycache__
 
@@ -41,9 +43,13 @@ build.ninja
 install_manifest.txt
 rules.ninja
 
+# bazel output symlinks.
+bazel-*
+
 # out-of-source build top-level folders.
 build/
 _build/
+build*/
 
 # in-source dependencies
 /googletest/
@@ -51,3 +57,10 @@ _build/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
 CMakeSettings.json
+
+# Visual Studio Code cache/options directory
+.vscode/
+
+# Python build stuff
+dist/
+*.egg-info*
diff --git a/.travis-libcxx-setup.sh b/.travis-libcxx-setup.sh
deleted file mode 100644
index a591743c6a..0000000000
--- a/.travis-libcxx-setup.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Install a newer CMake version
-curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
-chmod +x install-cmake.sh
-sudo ./install-cmake.sh --prefix=/usr/local --skip-license
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
-git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
-git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
-  export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
-      -DLIBCXX_ABI_UNSTABLE=ON \
-      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-      ../llvm-source
-make cxx -j2
-sudo make install-cxxabi install-cxx
-cd ../
diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py
index 86194357da..5649ddcc74 100644
--- a/.ycm_extra_conf.py
+++ b/.ycm_extra_conf.py
@@ -7,7 +7,7 @@
 flags = [
 '-Wall',
 '-Werror',
-'-pendantic-errors',
+'-pedantic-errors',
 '-std=c++0x',
 '-fno-strict-aliasing',
 '-O3',
diff --git a/AUTHORS b/AUTHORS
index 4e4c4ed475..838dd4f5bd 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -9,35 +9,52 @@
 # Please keep the list sorted.
 
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steeleal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Carto
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
+Deniz Evrenci <denizevrenci@gmail.com>
 Dirac Research 
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+MongoDB Inc.
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
+Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
 Steinar H. Gunderson <sgunderson@bigfoot.com>
+Stripe, Inc.
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/BUILD.bazel b/BUILD.bazel
new file mode 100644
index 0000000000..eb35b62730
--- /dev/null
+++ b/BUILD.bazel
@@ -0,0 +1,44 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+licenses(["notice"])
+
+config_setting(
+    name = "windows",
+    values = {
+        "cpu": "x64_windows",
+    },
+    visibility = [":__subpackages__"],
+)
+
+cc_library(
+    name = "benchmark",
+    srcs = glob(
+        [
+            "src/*.cc",
+            "src/*.h",
+        ],
+        exclude = ["src/benchmark_main.cc"],
+    ),
+    hdrs = ["include/benchmark/benchmark.h"],
+    linkopts = select({
+        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
+        "//conditions:default": ["-pthread"],
+    }),
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "benchmark_main",
+    srcs = ["src/benchmark_main.cc"],
+    hdrs = ["include/benchmark/benchmark.h"],
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+    deps = [":benchmark"],
+)
+
+cc_library(
+    name = "benchmark_internal_headers",
+    hdrs = glob(["src/*.h"]),
+    visibility = ["//test:__pkg__"],
+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad75bfbac6..393ddad622 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,22 +1,29 @@
-cmake_minimum_required (VERSION 2.8.12)
-
-project (benchmark)
+cmake_minimum_required (VERSION 3.5.1)
 
 foreach(p
+    CMP0048 # OK to clear PROJECT_VERSION on project()
     CMP0054 # CMake 3.1
     CMP0056 # export EXE_LINKER_FLAGS to try_run
     CMP0057 # Support no if() IN_LIST operator
+    CMP0063 # Honor visibility properties for all targets
+    CMP0077 # Allow option() overrides in importing projects
     )
   if(POLICY ${p})
     cmake_policy(SET ${p} NEW)
   endif()
 endforeach()
 
+project (benchmark VERSION 1.5.4 LANGUAGES CXX)
+
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
-option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
+if(NOT MSVC)
+  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
+else()
+  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
+endif()
 option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
 
 # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
@@ -27,17 +34,75 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
 # in cases where it is not possible to build or find a valid version of gtest.
 option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
 
+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
+
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+if(MSVC)
+    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
+    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
+    # undocumented, but working variable.
+    # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
+    set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
+      set(CMAKE_CROSSCOMPILING TRUE)
+    endif()
+endif()
+
+set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
+function(should_enable_assembly_tests)
+  if(CMAKE_BUILD_TYPE)
+    string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER)
+    if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage")
+      # FIXME: The --coverage flag needs to be removed when building assembly
+      # tests for this to work.
+      return()
+    endif()
+  endif()
+  if (MSVC)
+    return()
+  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+    return()
+  elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+    # FIXME: Make these work on 32 bit builds
+    return()
+  elseif(BENCHMARK_BUILD_32_BITS)
+     # FIXME: Make these work on 32 bit builds
+    return()
+  endif()
+  find_program(LLVM_FILECHECK_EXE FileCheck)
+  if (LLVM_FILECHECK_EXE)
+    set(LLVM_FILECHECK_EXE "${LLVM_FILECHECK_EXE}" CACHE PATH "llvm filecheck" FORCE)
+    message(STATUS "LLVM FileCheck Found: ${LLVM_FILECHECK_EXE}")
+  else()
+    message(STATUS "Failed to find LLVM FileCheck")
+    return()
+  endif()
+  set(ENABLE_ASSEMBLY_TESTS_DEFAULT ON PARENT_SCOPE)
+endfunction()
+should_enable_assembly_tests()
+
+# This option disables the building and running of the assembly verification tests
+option(BENCHMARK_ENABLE_ASSEMBLY_TESTS "Enable building and running the assembly tests"
+    ${ENABLE_ASSEMBLY_TESTS_DEFAULT})
+
 # Make sure we can import out CMake functions
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
+
 # Read the git tags to determine the project version
 include(GetGitVersion)
 get_git_version(GIT_VERSION)
 
+# If no git version can be determined, use the version
+# from the project() command
+if ("${GIT_VERSION}" STREQUAL "0.0.0")
+  set(VERSION "${benchmark_VERSION}")
+else()
+  set(VERSION "${GIT_VERSION}")
+endif()
 # Tell the user what versions we are using
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
-message("-- Version: ${VERSION}")
+message(STATUS "Version: ${VERSION}")
 
 # The version of the libraries
 set(GENERIC_LIB_VERSION ${VERSION})
@@ -52,7 +117,7 @@ if (BENCHMARK_BUILD_32_BITS)
   add_required_cxx_compiler_flag(-m32)
 endif()
 
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+if (MSVC)
   # Turn compiler warnings up to 11
   string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
@@ -61,6 +126,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-EHs-)
     add_cxx_compiler_flag(-EHa-)
+    add_definitions(-D_HAS_EXCEPTIONS=0)
   endif()
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
@@ -92,17 +158,32 @@ else()
 
   # Turn compiler warnings up to 11
   add_cxx_compiler_flag(-Wall)
-
   add_cxx_compiler_flag(-Wextra)
   add_cxx_compiler_flag(-Wshadow)
   add_cxx_compiler_flag(-Werror RELEASE)
   add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
   add_cxx_compiler_flag(-Werror MINSIZEREL)
+  if (NOT BENCHMARK_ENABLE_TESTING)
+    # Disable warning when compiling tests as gtest does not use 'override'.
+    add_cxx_compiler_flag(-Wsuggest-override)
+  endif()
   add_cxx_compiler_flag(-pedantic)
   add_cxx_compiler_flag(-pedantic-errors)
   add_cxx_compiler_flag(-Wshorten-64-to-32)
-  add_cxx_compiler_flag(-Wfloat-equal)
   add_cxx_compiler_flag(-fstrict-aliasing)
+  # Disable warnings regarding deprecated parts of the library while building
+  # and testing those parts of the library.
+  add_cxx_compiler_flag(-Wno-deprecated-declarations)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+    # Intel silently ignores '-Wno-deprecated-declarations',
+    # warning no. 1786 must be explicitly disabled.
+    # See #631 for rationale.
+    add_cxx_compiler_flag(-wd1786)
+  endif()
+  # Disable deprecation warnings for release builds (when -Werror is enabled).
+  add_cxx_compiler_flag(-Wno-deprecated RELEASE)
+  add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
+  add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-fno-exceptions)
   endif()
@@ -114,7 +195,7 @@ else()
   endif()
   # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
   # (because of deprecated overload)
-  add_cxx_compiler_flag(-wd654)  
+  add_cxx_compiler_flag(-wd654)
   add_cxx_compiler_flag(-Wthread-safety)
   if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
     cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
@@ -128,10 +209,15 @@ else()
     add_definitions(-D_GNU_SOURCE=1)
   endif()
 
+  if (QNXNTO)
+    add_definitions(-D_QNX_SOURCE)
+  endif()
+
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
     add_cxx_compiler_flag(-flto)
-    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
       find_program(GCC_AR gcc-ar)
       if (GCC_AR)
         set(CMAKE_AR ${GCC_AR})
@@ -140,7 +226,7 @@ else()
       if (GCC_RANLIB)
         set(CMAKE_RANLIB ${GCC_RANLIB})
       endif()
-    elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
       include(llvm-toolchain)
     endif()
   endif()
@@ -165,12 +251,12 @@ else()
 endif()
 
 if (BENCHMARK_USE_LIBCXX)
-  if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
     add_cxx_compiler_flag(-stdlib=libc++)
   elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
           "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
     add_cxx_compiler_flag(-nostdinc++)
-    message("libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
+    message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
     # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
     # configuration checks such as 'find_package(Threads)'
     list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs)
@@ -178,15 +264,21 @@ if (BENCHMARK_USE_LIBCXX)
     # linker flags appear before all linker inputs and -lc++ must appear after.
     list(APPEND BENCHMARK_CXX_LIBRARIES c++)
   else()
-    message(FATAL "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler")
+    message(FATAL_ERROR "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler")
   endif()
 endif(BENCHMARK_USE_LIBCXX)
 
+set(EXTRA_CXX_FLAGS "")
+if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+  # Clang on Windows fails to compile the regex feature check under C++11
+  set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
+endif()
+
 # C++ feature checks
 # Determine the correct regular expression engine to use
-cxx_feature_check(STD_REGEX)
-cxx_feature_check(GNU_POSIX_REGEX)
-cxx_feature_check(POSIX_REGEX)
+cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
 if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
   message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
 endif()
@@ -194,10 +286,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
         AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
   message(WARNING "Using std::regex with exceptions disabled is not fully supported")
 endif()
+
 cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM)
+endif()
+
 # Set up directories
 include_directories(${PROJECT_SOURCE_DIR}/include)
 set(JSON_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/third_party/json/src")
@@ -208,8 +306,10 @@ add_subdirectory(src)
 
 if (BENCHMARK_ENABLE_TESTING)
   enable_testing()
-  if (BENCHMARK_ENABLE_GTEST_TESTS)
-    include(HandleGTest)
+  if (BENCHMARK_ENABLE_GTEST_TESTS AND
+      NOT (TARGET gtest AND TARGET gtest_main AND
+           TARGET gmock AND TARGET gmock_main))
+    include(GoogleTest)
   endif()
   add_subdirectory(test)
 endif()
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index c59134b9bd..7489731de5 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -22,41 +22,64 @@
 #
 # Please keep the list sorted.
 
+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steelal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Cyrille Faucheux <cyrille.faucheux@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
+Deniz Evrenci <denizevrenci@gmail.com>
 Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fanbo Meng <fanbo.meng@ibm.com>
+Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
+Hannes Hauswedell <h2@fsfe.org>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+John Millikin <jmillikin@stripe.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
-Kishan Kumar <kumar.kishan@outlook.com>
 Kaito Udagawa <umireon@gmail.com>
+Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
+Ori Livneh <ori.livneh@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
 Raul Marin <rmrodriguez@cartodb.com>
 Ray Glover <ray.glover@uk.ibm.com>
+Robert Guo <robert.guo@mongodb.com>
 Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Steven Wan <wan.yu@ibm.com>
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000000..631f3ba05d
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,51 @@
+workspace(name = "com_github_google_benchmark")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "rules_cc",
+    strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
+    urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
+    sha256 = "d7dc12c1d5bc1a87474de8e3d17b7731a4dcebcfb8aa3990fe8ac7734ef12f2f",
+)
+
+http_archive(
+    name = "com_google_absl",
+    sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
+    strip_prefix = "abseil-cpp-20200225.2",
+    urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
+)
+
+http_archive(
+    name = "com_google_googletest",
+    strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
+    urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
+    sha256 = "8f827dd550db8b4fdf73904690df0be9fccc161017c9038a724bc9a0617a1bc8",
+)
+
+http_archive(
+    name = "pybind11",
+    build_file = "@//bindings/python:pybind11.BUILD",
+    sha256 = "1eed57bc6863190e35637290f97a20c81cfe4d9090ac0a24f3bbf08f265eb71d",
+    strip_prefix = "pybind11-2.4.3",
+    urls = ["https://github.com/pybind/pybind11/archive/v2.4.3.tar.gz"],
+)
+
+new_local_repository(
+    name = "python_headers",
+    build_file = "@//bindings/python:python_headers.BUILD",
+    path = "/usr/include/python3.6",  # May be overwritten by setup.py.
+)
+
+http_archive(
+    name = "rules_python",
+    url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
+    sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
+)
+
+load("@rules_python//python:pip.bzl", pip3_install="pip_install")
+
+pip3_install(
+   name = "py_deps",
+   requirements = "//:requirements.txt",
+)
diff --git a/_config.yml b/_config.yml
new file mode 100644
index 0000000000..1fa5ff852b
--- /dev/null
+++ b/_config.yml
@@ -0,0 +1,2 @@
+theme: jekyll-theme-midnight
+markdown: GFM
diff --git a/appveyor.yml b/appveyor.yml
index cf240190be..81da955f02 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -41,7 +41,7 @@ build_script:
   - cmake --build . --config %configuration%
 
 test_script:
-  - ctest -c %configuration% --timeout 300 --output-on-failure
+  - ctest --build-config %configuration% --timeout 300 --output-on-failure
 
 artifacts:
   - path: '_build/CMakeFiles/*.log'
diff --git a/bindings/python/BUILD b/bindings/python/BUILD
new file mode 100644
index 0000000000..9559a76b30
--- /dev/null
+++ b/bindings/python/BUILD
@@ -0,0 +1,3 @@
+exports_files(glob(["*.BUILD"]))
+exports_files(["build_defs.bzl"])
+
diff --git a/bindings/python/build_defs.bzl b/bindings/python/build_defs.bzl
new file mode 100644
index 0000000000..45907aaa5e
--- /dev/null
+++ b/bindings/python/build_defs.bzl
@@ -0,0 +1,25 @@
+_SHARED_LIB_SUFFIX = {
+    "//conditions:default": ".so",
+    "//:windows": ".dll",
+}
+
+def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
+    for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
+        shared_lib_name = name + shared_lib_suffix
+        native.cc_binary(
+            name = shared_lib_name,
+            linkshared = 1,
+            linkstatic = 1,
+            srcs = srcs + hdrs,
+            copts = copts,
+            features = features,
+            deps = deps,
+        )
+
+    return native.py_library(
+        name = name,
+        data = select({
+            platform: [name + shared_lib_suffix]
+            for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
+        }),
+    )
diff --git a/bindings/python/google_benchmark/BUILD b/bindings/python/google_benchmark/BUILD
new file mode 100644
index 0000000000..3c1561f48e
--- /dev/null
+++ b/bindings/python/google_benchmark/BUILD
@@ -0,0 +1,38 @@
+load("//bindings/python:build_defs.bzl", "py_extension")
+
+py_library(
+    name = "google_benchmark",
+    srcs = ["__init__.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_benchmark",
+        # pip; absl:app
+    ],
+)
+
+py_extension(
+    name = "_benchmark",
+    srcs = ["benchmark.cc"],
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    features = ["-use_header_modules"],
+    deps = [
+        "//:benchmark",
+        "@pybind11",
+        "@python_headers",
+    ],
+)
+
+py_test(
+    name = "example",
+    srcs = ["example.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":google_benchmark",
+    ],
+)
+
diff --git a/bindings/python/google_benchmark/__init__.py b/bindings/python/google_benchmark/__init__.py
new file mode 100644
index 0000000000..1055bf2418
--- /dev/null
+++ b/bindings/python/google_benchmark/__init__.py
@@ -0,0 +1,158 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python benchmarking utilities.
+
+Example usage:
+  import google_benchmark as benchmark
+
+  @benchmark.register
+  def my_benchmark(state):
+      ...  # Code executed outside `while` loop is not timed.
+
+      while state:
+        ...  # Code executed within `while` loop is timed.
+
+  if __name__ == '__main__':
+    benchmark.main()
+"""
+
+from absl import app
+from google_benchmark import _benchmark
+from google_benchmark._benchmark import (
+    Counter,
+    kNanosecond,
+    kMicrosecond,
+    kMillisecond,
+    kSecond,
+    oNone,
+    o1,
+    oN,
+    oNSquared,
+    oNCubed,
+    oLogN,
+    oNLogN,
+    oAuto,
+    oLambda,
+)
+
+
+__all__ = [
+    "register",
+    "main",
+    "Counter",
+    "kNanosecond",
+    "kMicrosecond",
+    "kMillisecond",
+    "kSecond",
+    "oNone",
+    "o1",
+    "oN",
+    "oNSquared",
+    "oNCubed",
+    "oLogN",
+    "oNLogN",
+    "oAuto",
+    "oLambda",
+]
+
+__version__ = "0.2.0"
+
+
+class __OptionMaker:
+    """A stateless class to collect benchmark options.
+
+    Collect all decorator calls like @option.range(start=0, limit=1<<5).
+    """
+
+    class Options:
+        """Pure data class to store options calls, along with the benchmarked function."""
+
+        def __init__(self, func):
+            self.func = func
+            self.builder_calls = []
+
+    @classmethod
+    def make(cls, func_or_options):
+        """Make Options from Options or the benchmarked function."""
+        if isinstance(func_or_options, cls.Options):
+            return func_or_options
+        return cls.Options(func_or_options)
+
+    def __getattr__(self, builder_name):
+        """Append option call in the Options."""
+
+        # The function that get returned on @option.range(start=0, limit=1<<5).
+        def __builder_method(*args, **kwargs):
+
+            # The decorator that get called, either with the benchmared function
+            # or the previous Options
+            def __decorator(func_or_options):
+                options = self.make(func_or_options)
+                options.builder_calls.append((builder_name, args, kwargs))
+                # The decorator returns Options so it is not technically a decorator
+                # and needs a final call to @regiser
+                return options
+
+            return __decorator
+
+        return __builder_method
+
+
+# Alias for nicer API.
+# We have to instantiate an object, even if stateless, to be able to use __getattr__
+# on option.range
+option = __OptionMaker()
+
+
+def register(undefined=None, *, name=None):
+    """Register function for benchmarking."""
+    if undefined is None:
+        # Decorator is called without parenthesis so we return a decorator
+        return lambda f: register(f, name=name)
+
+    # We have either the function to benchmark (simple case) or an instance of Options
+    # (@option._ case).
+    options = __OptionMaker.make(undefined)
+
+    if name is None:
+        name = options.func.__name__
+
+    # We register the benchmark and reproduce all the @option._ calls onto the
+    # benchmark builder pattern
+    benchmark = _benchmark.RegisterBenchmark(name, options.func)
+    for name, args, kwargs in options.builder_calls[::-1]:
+        getattr(benchmark, name)(*args, **kwargs)
+
+    # return the benchmarked function because the decorator does not modify it
+    return options.func
+
+
+def _flags_parser(argv):
+    argv = _benchmark.Initialize(argv)
+    return app.parse_flags_with_usage(argv)
+
+
+def _run_benchmarks(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    return _benchmark.RunSpecifiedBenchmarks()
+
+
+def main(argv=None):
+    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
+
+
+# Methods for use with custom main function.
+initialize = _benchmark.Initialize
+run_benchmarks = _benchmark.RunSpecifiedBenchmarks
diff --git a/bindings/python/google_benchmark/benchmark.cc b/bindings/python/google_benchmark/benchmark.cc
new file mode 100644
index 0000000000..1b01fe7f7f
--- /dev/null
+++ b/bindings/python/google_benchmark/benchmark.cc
@@ -0,0 +1,181 @@
+// Benchmark for Python.
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "pybind11/operators.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "pybind11/stl_bind.h"
+
+#include "benchmark/benchmark.h"
+
+PYBIND11_MAKE_OPAQUE(benchmark::UserCounters);
+
+namespace {
+namespace py = ::pybind11;
+
+std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
+  // The `argv` pointers here become invalid when this function returns, but
+  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+  // so it persists, and replace the pointer below.
+  static std::string executable_name(argv[0]);
+  std::vector<char*> ptrs;
+  ptrs.reserve(argv.size());
+  for (auto& arg : argv) {
+    ptrs.push_back(const_cast<char*>(arg.c_str()));
+  }
+  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  int argc = static_cast<int>(argv.size());
+  benchmark::Initialize(&argc, ptrs.data());
+  std::vector<std::string> remaining_argv;
+  remaining_argv.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    remaining_argv.emplace_back(ptrs[i]);
+  }
+  return remaining_argv;
+}
+
+benchmark::internal::Benchmark* RegisterBenchmark(const char* name,
+                                                  py::function f) {
+  return benchmark::RegisterBenchmark(
+      name, [f](benchmark::State& state) { f(&state); });
+}
+
+PYBIND11_MODULE(_benchmark, m) {
+  using benchmark::TimeUnit;
+  py::enum_<TimeUnit>(m, "TimeUnit")
+      .value("kNanosecond", TimeUnit::kNanosecond)
+      .value("kMicrosecond", TimeUnit::kMicrosecond)
+      .value("kMillisecond", TimeUnit::kMillisecond)
+      .value("kSecond", TimeUnit::kSecond)
+      .export_values();
+
+  using benchmark::BigO;
+  py::enum_<BigO>(m, "BigO")
+      .value("oNone", BigO::oNone)
+      .value("o1", BigO::o1)
+      .value("oN", BigO::oN)
+      .value("oNSquared", BigO::oNSquared)
+      .value("oNCubed", BigO::oNCubed)
+      .value("oLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oLogN)
+      .value("oAuto", BigO::oAuto)
+      .value("oLambda", BigO::oLambda)
+      .export_values();
+
+  using benchmark::internal::Benchmark;
+  py::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer tor the current object, reference
+      // return policy is used to ask pybind not to take ownership oof the
+      // returned object and avoid calling delete on it.
+      // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
+      //
+      // For methods taking a const std::vector<...>&, a copy is created
+      // because a it is bound to a Python list.
+      // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
+      .def("unit", &Benchmark::Unit, py::return_value_policy::reference)
+      .def("arg", &Benchmark::Arg, py::return_value_policy::reference)
+      .def("args", &Benchmark::Args, py::return_value_policy::reference)
+      .def("range", &Benchmark::Range, py::return_value_policy::reference,
+           py::arg("start"), py::arg("limit"))
+      .def("dense_range", &Benchmark::DenseRange,
+           py::return_value_policy::reference, py::arg("start"),
+           py::arg("limit"), py::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, py::return_value_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct,
+           py::return_value_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference)
+      .def("arg_names", &Benchmark::ArgNames,
+           py::return_value_policy::reference)
+      .def("range_pair", &Benchmark::RangePair,
+           py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"),
+           py::arg("lo2"), py::arg("hi2"))
+      .def("range_multiplier", &Benchmark::RangeMultiplier,
+           py::return_value_policy::reference)
+      .def("min_time", &Benchmark::MinTime, py::return_value_policy::reference)
+      .def("iterations", &Benchmark::Iterations,
+           py::return_value_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions,
+           py::return_value_policy::reference)
+      .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
+           py::return_value_policy::reference, py::arg("value") = true)
+      .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
+           py::return_value_policy::reference, py::arg("value") = true)
+      .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
+           py::return_value_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime,
+           py::return_value_policy::reference)
+      .def("use_manual_time", &Benchmark::UseManualTime,
+           py::return_value_policy::reference)
+      .def(
+          "complexity",
+          (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
+          py::return_value_policy::reference,
+          py::arg("complexity") = benchmark::oAuto);
+
+  using benchmark::Counter;
+  py::class_<Counter> py_counter(m, "Counter");
+
+  py::enum_<Counter::Flags>(py_counter, "Flags")
+      .value("kDefaults", Counter::Flags::kDefaults)
+      .value("kIsRate", Counter::Flags::kIsRate)
+      .value("kAvgThreads", Counter::Flags::kAvgThreads)
+      .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
+      .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
+      .value("kIsIterationInvariantRate",
+             Counter::Flags::kIsIterationInvariantRate)
+      .value("kAvgIterations", Counter::Flags::kAvgIterations)
+      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
+      .value("kInvert", Counter::Flags::kInvert)
+      .export_values()
+      .def(py::self | py::self);
+
+  py::enum_<Counter::OneK>(py_counter, "OneK")
+      .value("kIs1000", Counter::OneK::kIs1000)
+      .value("kIs1024", Counter::OneK::kIs1024)
+      .export_values();
+
+  py_counter
+      .def(py::init<double, Counter::Flags, Counter::OneK>(),
+           py::arg("value") = 0., py::arg("flags") = Counter::kDefaults,
+           py::arg("k") = Counter::kIs1000)
+      .def(py::init([](double value) { return Counter(value); }))
+      .def_readwrite("value", &Counter::value)
+      .def_readwrite("flags", &Counter::flags)
+      .def_readwrite("oneK", &Counter::oneK);
+  py::implicitly_convertible<py::float_, Counter>();
+  py::implicitly_convertible<py::int_, Counter>();
+
+  py::bind_map<benchmark::UserCounters>(m, "UserCounters");
+
+  using benchmark::State;
+  py::class_<State>(m, "State")
+      .def("__bool__", &State::KeepRunning)
+      .def_property_readonly("keep_running", &State::KeepRunning)
+      .def("pause_timing", &State::PauseTiming)
+      .def("resume_timing", &State::ResumeTiming)
+      .def("skip_with_error", &State::SkipWithError)
+      .def_property_readonly("error_occurred", &State::error_occurred)
+      .def("set_iteration_time", &State::SetIterationTime)
+      .def_property("bytes_processed", &State::bytes_processed,
+                    &State::SetBytesProcessed)
+      .def_property("complexity_n", &State::complexity_length_n,
+                    &State::SetComplexityN)
+      .def_property("items_processed", &State::items_processed,
+                    &State::SetItemsProcessed)
+      .def("set_label", (void (State::*)(const char*)) & State::SetLabel)
+      .def("range", &State::range, py::arg("pos") = 0)
+      .def_property_readonly("iterations", &State::iterations)
+      .def_readwrite("counters", &State::counters)
+      .def_readonly("thread_index", &State::thread_index)
+      .def_readonly("threads", &State::threads);
+
+  m.def("Initialize", Initialize);
+  m.def("RegisterBenchmark", RegisterBenchmark,
+        py::return_value_policy::reference);
+  m.def("RunSpecifiedBenchmarks",
+        []() { benchmark::RunSpecifiedBenchmarks(); });
+};
+}  // namespace
diff --git a/bindings/python/google_benchmark/example.py b/bindings/python/google_benchmark/example.py
new file mode 100644
index 0000000000..9134e8cffe
--- /dev/null
+++ b/bindings/python/google_benchmark/example.py
@@ -0,0 +1,136 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Example of Python using C++ benchmark framework.
+
+To run this example, you must first install the `google_benchmark` Python package.
+
+To install using `setup.py`, download and extract the `google_benchmark` source.
+In the extracted directory, execute:
+  python setup.py install
+"""
+
+import random
+import time
+
+import google_benchmark as benchmark
+from google_benchmark import Counter
+
+
+@benchmark.register
+def empty(state):
+    while state:
+        pass
+
+
+@benchmark.register
+def sum_million(state):
+    while state:
+        sum(range(1_000_000))
+
+@benchmark.register
+def pause_timing(state):
+    """Pause timing every iteration."""
+    while state:
+        # Construct a list of random ints every iteration without timing it
+        state.pause_timing()
+        random_list = [random.randint(0, 100) for _ in range(100)]
+        state.resume_timing()
+        # Time the in place sorting algorithm
+        random_list.sort()
+
+
+@benchmark.register
+def skipped(state):
+    if True:  # Test some predicate here.
+        state.skip_with_error("some error")
+        return  # NOTE: You must explicitly return, or benchmark will continue.
+
+    ...  # Benchmark code would be here.
+
+
+@benchmark.register
+def manual_timing(state):
+    while state:
+        # Manually count Python CPU time
+        start = time.perf_counter()  # perf_counter_ns() in Python 3.7+
+        # Something to benchmark
+        time.sleep(0.01)
+        end = time.perf_counter()
+        state.set_iteration_time(end - start)
+
+
+@benchmark.register
+def custom_counters(state):
+    """Collect cutom metric using benchmark.Counter."""
+    num_foo = 0.0
+    while state:
+        # Benchmark some code here
+        pass
+        # Collect some custom metric named foo
+        num_foo += 0.13
+
+    # Automatic Counter from numbers.
+    state.counters["foo"] = num_foo
+    # Set a counter as a rate.
+    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
+    #  Set a counter as an inverse of rate.
+    state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
+    # Set a counter as a thread-average quantity.
+    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
+    # There's also a combined flag:
+    state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
+
+
+@benchmark.register
+@benchmark.option.measure_process_cpu_time()
+@benchmark.option.use_real_time()
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register(name="sum_million_microseconds")
+@benchmark.option.unit(benchmark.kMicrosecond)
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+@benchmark.option.arg(100)
+@benchmark.option.arg(1000)
+def passing_argument(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range(8, limit=8 << 10)
+def using_range(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range_multiplier(2)
+@benchmark.option.range(1 << 10, 1 << 18)
+@benchmark.option.complexity(benchmark.oN)
+def computing_complexity(state):
+    while state:
+        sum(range(state.range(0)))
+    state.complexity_n = state.range(0)
+
+
+if __name__ == "__main__":
+    benchmark.main()
diff --git a/bindings/python/pybind11.BUILD b/bindings/python/pybind11.BUILD
new file mode 100644
index 0000000000..bc83350038
--- /dev/null
+++ b/bindings/python/pybind11.BUILD
@@ -0,0 +1,20 @@
+cc_library(
+    name = "pybind11",
+    hdrs = glob(
+        include = [
+            "include/pybind11/*.h",
+            "include/pybind11/detail/*.h",
+        ],
+        exclude = [
+            "include/pybind11/common.h",
+            "include/pybind11/eigen.h",
+        ],
+    ),
+    copts = [
+        "-fexceptions",
+        "-Wno-undefined-inline",
+        "-Wno-pragma-once-outside-header",
+    ],
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)
diff --git a/bindings/python/python_headers.BUILD b/bindings/python/python_headers.BUILD
new file mode 100644
index 0000000000..9c34cf6ca4
--- /dev/null
+++ b/bindings/python/python_headers.BUILD
@@ -0,0 +1,6 @@
+cc_library(
+    name = "python_headers",
+    hdrs = glob(["**/*.h"]),
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
diff --git a/bindings/python/requirements.txt b/bindings/python/requirements.txt
new file mode 100644
index 0000000000..f5bbe7eca5
--- /dev/null
+++ b/bindings/python/requirements.txt
@@ -0,0 +1,2 @@
+absl-py>=0.7.1
+
diff --git a/cmake/AddCXXCompilerFlag.cmake b/cmake/AddCXXCompilerFlag.cmake
index 17d5f3dcc3..858589e977 100644
--- a/cmake/AddCXXCompilerFlag.cmake
+++ b/cmake/AddCXXCompilerFlag.cmake
@@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
   if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
       string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
     endif()
     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
   endif()
@@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
   if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
       string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
     endif()
     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
@@ -62,3 +66,13 @@ function(add_required_cxx_compiler_flag FLAG)
     message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler")
   endif()
 endfunction()
+
+function(check_cxx_warning_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  # Add -Werror to ensure the compiler generates an error if the warning flag
+  # doesn't exist.
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+endfunction()
diff --git a/cmake/CXXFeatureCheck.cmake b/cmake/CXXFeatureCheck.cmake
index b2a8217157..62e6741fe3 100644
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@@ -27,36 +27,43 @@ function(cxx_feature_check FILE)
     return()
   endif()
 
-  message("-- Performing Test ${FEATURE}")
-  if(CMAKE_CROSSCOMPILING)
-    try_compile(COMPILE_${FEATURE}
-            ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-            CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-            LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
-    if(COMPILE_${FEATURE})
-      message(WARNING
-            "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
-      set(RUN_${FEATURE} 0)
+  if (ARGC GREATER 1)
+    message(STATUS "Enabling additional flags: ${ARGV1}")
+    list(APPEND BENCHMARK_CXX_LINKER_FLAGS ${ARGV1})
+  endif()
+
+  if (NOT DEFINED COMPILE_${FEATURE})
+    message(STATUS "Performing Test ${FEATURE}")
+    if(CMAKE_CROSSCOMPILING)
+      try_compile(COMPILE_${FEATURE}
+              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+      if(COMPILE_${FEATURE})
+        message(WARNING
+              "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
+        set(RUN_${FEATURE} 0 CACHE INTERNAL "")
+      else()
+        set(RUN_${FEATURE} 1 CACHE INTERNAL "")
+      endif()
     else()
-      set(RUN_${FEATURE} 1)
+      message(STATUS "Performing Test ${FEATURE}")
+      try_run(RUN_${FEATURE} COMPILE_${FEATURE}
+              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
     endif()
-  else()
-    message("-- Performing Test ${FEATURE}")
-    try_run(RUN_${FEATURE} COMPILE_${FEATURE}
-            ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-            CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-            LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
   endif()
 
   if(RUN_${FEATURE} EQUAL 0)
-    message("-- Performing Test ${FEATURE} -- success")
+    message(STATUS "Performing Test ${FEATURE} -- success")
     set(HAVE_${VAR} 1 PARENT_SCOPE)
     add_definitions(-DHAVE_${VAR})
   else()
     if(NOT COMPILE_${FEATURE})
-      message("-- Performing Test ${FEATURE} -- failed to compile")
+      message(STATUS "Performing Test ${FEATURE} -- failed to compile")
     else()
-      message("-- Performing Test ${FEATURE} -- compiled but failed to run")
+      message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
     endif()
   endif()
 endfunction()
diff --git a/cmake/GetGitVersion.cmake b/cmake/GetGitVersion.cmake
index 8dd9480045..04a1f9b70d 100644
--- a/cmake/GetGitVersion.cmake
+++ b/cmake/GetGitVersion.cmake
@@ -20,32 +20,39 @@ set(__get_git_version INCLUDED)
 
 function(get_git_version var)
   if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           RESULT_VARIABLE status
-          OUTPUT_VARIABLE GIT_VERSION
+          OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
           ERROR_QUIET)
-      if(${status})
-          set(GIT_VERSION "v0.0.0")
+      if(status)
+          set(GIT_DESCRIBE_VERSION "v0.0.0")
+      endif()
+      
+      string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
+      if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-) 
+         string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  GIT_VERSION ${GIT_DESCRIBE_VERSION})
       else()
-          string(STRIP ${GIT_VERSION} GIT_VERSION)
-          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
+         string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
       endif()
 
       # Work out if the repository is dirty
       execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
+          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           OUTPUT_QUIET
           ERROR_QUIET)
       execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
+          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           OUTPUT_VARIABLE GIT_DIFF_INDEX
           ERROR_QUIET)
       string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
       if (${GIT_DIRTY})
-          set(GIT_VERSION "${GIT_VERSION}-dirty")
+          set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
       endif()
+      message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
   else()
-      set(GIT_VERSION "v0.0.0")
+      set(GIT_VERSION "0.0.0")
   endif()
 
-  message("-- git Version: ${GIT_VERSION}")
   set(${var} ${GIT_VERSION} PARENT_SCOPE)
 endfunction()
diff --git a/cmake/GoogleTest.cmake b/cmake/GoogleTest.cmake
new file mode 100644
index 0000000000..dd611fc875
--- /dev/null
+++ b/cmake/GoogleTest.cmake
@@ -0,0 +1,41 @@
+# Download and unpack googletest at configure time
+set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest")
+configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY)
+
+set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest" CACHE PATH "") # Mind the quotes
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
+  -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+
+execute_process(
+  COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${GOOGLETEST_SOURCE_DIR}
+                 ${GOOGLETEST_BINARY_DIR}
+                 EXCLUDE_FROM_ALL)
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
diff --git a/cmake/GoogleTest.cmake.in b/cmake/GoogleTest.cmake.in
new file mode 100644
index 0000000000..fd957ff564
--- /dev/null
+++ b/cmake/GoogleTest.cmake.in
@@ -0,0 +1,58 @@
+cmake_minimum_required(VERSION 2.8.12)
+
+project(googletest-download NONE)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+option(ALLOW_DOWNLOADING_GOOGLETEST "If googletest src tree is not found in location specified by GOOGLETEST_PATH, do fetch the archive from internet" OFF)
+set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH
+                    "Path to the googletest root tree. Should contain googletest and googlemock subdirs. And CMakeLists.txt in root, and in both of these subdirs")
+
+# Download and install GoogleTest
+
+message(STATUS "Looking for Google Test sources")
+message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}")
+if(EXISTS "${GOOGLETEST_PATH}"            AND IS_DIRECTORY "${GOOGLETEST_PATH}"            AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt")
+  message(STATUS "Found Google Test in ${GOOGLETEST_PATH}")
+
+  ExternalProject_Add(
+    googletest
+    PREFIX            "${CMAKE_BINARY_DIR}"
+    DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+    SOURCE_DIR        "${GOOGLETEST_PATH}" # use existing src dir.
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+  )
+else()
+  if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+  else()
+    message(WARNING "Did not find Google Test sources! Fetching from web...")
+    ExternalProject_Add(
+      googletest
+      GIT_REPOSITORY    https://github.com/google/googletest.git
+      GIT_TAG           master
+      PREFIX            "${CMAKE_BINARY_DIR}"
+      STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
+      DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+      SOURCE_DIR        "${CMAKE_BINARY_DIR}/src"
+      BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND     ""
+      INSTALL_COMMAND   ""
+      TEST_COMMAND      ""
+    )
+  endif()
+endif()
+
+ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR)
+file(WRITE googletest-paths.cmake
+"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\")
+set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\")
+")
diff --git a/cmake/HandleGTest.cmake b/cmake/HandleGTest.cmake
deleted file mode 100644
index 77ffc4c51c..0000000000
--- a/cmake/HandleGTest.cmake
+++ /dev/null
@@ -1,79 +0,0 @@
-
-macro(split_list listname)
-  string(REPLACE ";" " " ${listname} "${${listname}}")
-endmacro()
-
-macro(build_external_gtest)
-  include(ExternalProject)
-  set(GTEST_FLAGS "")
-  if (BENCHMARK_USE_LIBCXX)
-    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-      list(APPEND GTEST_FLAGS -stdlib=libc++)
-    else()
-      message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++")
-    endif()
-  endif()
-  if (BENCHMARK_BUILD_32_BITS)
-    list(APPEND GTEST_FLAGS -m32)
-  endif()
-  if (NOT "${CMAKE_CXX_FLAGS}" STREQUAL "")
-    list(APPEND GTEST_FLAGS ${CMAKE_CXX_FLAGS})
-  endif()
-  string(TOUPPER "${CMAKE_BUILD_TYPE}" GTEST_BUILD_TYPE)
-  if ("${GTEST_BUILD_TYPE}" STREQUAL "COVERAGE")
-    set(GTEST_BUILD_TYPE "DEBUG")
-  endif()
-  split_list(GTEST_FLAGS)
-  ExternalProject_Add(googletest
-      EXCLUDE_FROM_ALL ON
-      GIT_REPOSITORY https://github.com/google/googletest.git
-      GIT_TAG master
-      PREFIX "${CMAKE_BINARY_DIR}/googletest"
-      INSTALL_DIR "${CMAKE_BINARY_DIR}/googletest"
-      CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=${GTEST_BUILD_TYPE}
-        -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-        -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-        -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-        -DCMAKE_CXX_FLAGS:STRING=${GTEST_FLAGS}
-        -Dgtest_force_shared_crt:BOOL=ON
-      )
-
-  ExternalProject_Get_Property(googletest install_dir)
-
-  add_library(gtest UNKNOWN IMPORTED)
-  add_library(gtest_main UNKNOWN IMPORTED)
-
-  set(LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(LIB_PREFIX "${CMAKE_STATIC_LIBRARY_PREFIX}")
-
-  if("${GTEST_BUILD_TYPE}" STREQUAL "DEBUG")
-    set(LIB_SUFFIX "d${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  endif()
-  file(MAKE_DIRECTORY ${install_dir}/include)
-  set_target_properties(gtest PROPERTIES
-    IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}gtest${LIB_SUFFIX}
-    INTERFACE_INCLUDE_DIRECTORIES ${install_dir}/include
-  )
-  set_target_properties(gtest_main PROPERTIES
-    IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}gtest_main${LIB_SUFFIX}
-    INTERFACE_INCLUDE_DIRECTORIES ${install_dir}/include
-  )
-  add_dependencies(gtest googletest)
-  add_dependencies(gtest_main googletest)
-  set(GTEST_BOTH_LIBRARIES gtest gtest_main)
-  #set(GTEST_INCLUDE_DIRS ${install_dir}/include)
-endmacro(build_external_gtest)
-
-if (BENCHMARK_ENABLE_GTEST_TESTS)
-  if (IS_DIRECTORY ${CMAKE_SOURCE_DIR}/googletest)
-    set(INSTALL_GTEST OFF CACHE INTERNAL "")
-    set(INSTALL_GMOCK OFF CACHE INTERNAL "")
-    add_subdirectory(${CMAKE_SOURCE_DIR}/googletest)
-    set(GTEST_BOTH_LIBRARIES gtest gtest_main)
-  elseif(BENCHMARK_DOWNLOAD_DEPENDENCIES)
-    build_external_gtest()
-  else()
-    find_package(GTest REQUIRED)
-  endif()
-endif()
diff --git a/cmake/Modules/FindPFM.cmake b/cmake/Modules/FindPFM.cmake
new file mode 100644
index 0000000000..553d458c00
--- /dev/null
+++ b/cmake/Modules/FindPFM.cmake
@@ -0,0 +1,19 @@
+# If successful, the following variables will be defined:
+# HAVE_LIBPFM.
+# Set BENCHMARK_ENABLE_LIBPFM to 0 to disable, regardless of libpfm presence.
+include(CheckIncludeFile)
+include(CheckLibraryExists)
+enable_language(C)
+
+check_library_exists(libpfm.a pfm_initialize "" HAVE_LIBPFM_INITIALIZE)
+if(HAVE_LIBPFM_INITIALIZE)
+  check_include_file(perfmon/perf_event.h HAVE_PERFMON_PERF_EVENT_H)
+  check_include_file(perfmon/pfmlib.h HAVE_PERFMON_PFMLIB_H)
+  check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+  if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+    message("Using Perf Counters.")
+    set(HAVE_LIBPFM 1)
+  endif()
+else()
+  message("Perf Counters support requested, but was unable to find libpfm.")
+endif()
diff --git a/cmake/benchmark.pc.in b/cmake/benchmark.pc.in
index 1e84bff68d..34beb012ee 100644
--- a/cmake/benchmark.pc.in
+++ b/cmake/benchmark.pc.in
@@ -1,11 +1,12 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${prefix}/lib
-includedir=${prefix}/include
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
 
 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
 Version: @VERSION@
 
 Libs: -L${libdir} -lbenchmark
+Libs.private: -lpthread
 Cflags: -I${includedir}
diff --git a/cmake/split_list.cmake b/cmake/split_list.cmake
new file mode 100644
index 0000000000..67aed3fdc8
--- /dev/null
+++ b/cmake/split_list.cmake
@@ -0,0 +1,3 @@
+macro(split_list listname)
+  string(REPLACE ";" " " ${listname} "${${listname}}")
+endmacro()
diff --git a/dependencies.md b/dependencies.md
new file mode 100644
index 0000000000..6289b4e354
--- /dev/null
+++ b/dependencies.md
@@ -0,0 +1,18 @@
+# Build tool dependency policy
+
+To ensure the broadest compatibility when building the benchmark library, but
+still allow forward progress, we require any build tooling to be available for:
+
+* Debian stable AND
+* The last two Ubuntu LTS releases AND
+
+Currently, this means using build tool versions that are available for Ubuntu
+16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch.
+
+_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._
+
+## cmake
+The current supported version is cmake 3.5.1 as of 2018-06-06.
+
+_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
+release, as `cmake3`._
diff --git a/docs/AssemblyTests.md b/docs/AssemblyTests.md
new file mode 100644
index 0000000000..1fbdc269b5
--- /dev/null
+++ b/docs/AssemblyTests.md
@@ -0,0 +1,147 @@
+# Assembly Tests
+
+The Benchmark library provides a number of functions whose primary
+purpose in to affect assembly generation, including `DoNotOptimize`
+and `ClobberMemory`. In addition there are other functions,
+such as `KeepRunning`, for which generating good assembly is paramount.
+
+For these functions it's important to have tests that verify the
+correctness and quality of the implementation. This requires testing
+the code generated by the compiler.
+
+This document describes how the Benchmark library tests compiler output,
+as well as how to properly write new tests.
+
+
+## Anatomy of a Test
+
+Writing a test has two steps:
+
+* Write the code you want to generate assembly for.
+* Add `// CHECK` lines to match against the verified assembly.
+
+Example:
+```c++
+
+// CHECK-LABEL: test_add:
+extern "C" int test_add() {
+    extern int ExternInt;
+    return ExternInt + 1;
+
+    // CHECK: movl ExternInt(%rip), %eax
+    // CHECK: addl %eax
+    // CHECK: ret
+}
+
+```
+
+#### LLVM Filecheck
+
+[LLVM's Filecheck](https://llvm.org/docs/CommandGuide/FileCheck.html)
+is used to test the generated assembly against the `// CHECK` lines
+specified in the tests source file. Please see the documentation
+linked above for information on how to write `CHECK` directives.
+
+#### Tips and Tricks:
+
+* Tests should match the minimal amount of output required to establish
+correctness. `CHECK` directives don't have to match on the exact next line
+after the previous match, so tests should omit checks for unimportant
+bits of assembly. ([`CHECK-NEXT`](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-next-directive)
+can be used to ensure a match occurs exactly after the previous match).
+
+* The tests are compiled with `-O3 -g0`. So we're only testing the
+optimized output.
+
+* The assembly output is further cleaned up using `tools/strip_asm.py`.
+This removes comments, assembler directives, and unused labels before
+the test is run.
+
+* The generated and stripped assembly file for a test is output under
+`<build-directory>/test/<test-name>.s`
+
+* Filecheck supports using [`CHECK` prefixes](https://llvm.org/docs/CommandGuide/FileCheck.html#cmdoption-check-prefixes)
+to specify lines that should only match in certain situations.
+The Benchmark tests use `CHECK-CLANG` and `CHECK-GNU` for lines that
+are only expected to match Clang or GCC's output respectively. Normal
+`CHECK` lines match against all compilers. (Note: `CHECK-NOT` and
+`CHECK-LABEL` are NOT prefixes. They are versions of non-prefixed
+`CHECK` lines)
+
+* Use `extern "C"` to disable name mangling for specific functions. This
+makes them easier to name in the `CHECK` lines.
+
+
+## Problems Writing Portable Tests
+
+Writing tests which check the code generated by a compiler are
+inherently non-portable. Different compilers and even different compiler
+versions may generate entirely different code. The Benchmark tests
+must tolerate this.
+
+LLVM Filecheck provides a number of mechanisms to help write
+"more portable" tests; including [matching using regular expressions](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-pattern-matching-syntax),
+allowing the creation of [named variables](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-variables)
+for later matching, and [checking non-sequential matches](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-dag-directive).
+
+#### Capturing Variables
+
+For example, say GCC stores a variable in a register but Clang stores
+it in memory. To write a test that tolerates both cases we "capture"
+the destination of the store, and then use the captured expression
+to write the remainder of the test.
+
+```c++
+// CHECK-LABEL: test_div_no_op_into_shr:
+extern "C" void test_div_no_op_into_shr(int value) {
+    int divisor = 2;
+    benchmark::DoNotOptimize(divisor); // hide the value from the optimizer
+    return value / divisor;
+
+    // CHECK: movl $2, [[DEST:.*]]
+    // CHECK: idivl [[DEST]]
+    // CHECK: ret
+}
+```
+
+#### Using Regular Expressions to Match Differing Output
+
+Often tests require testing assembly lines which may subtly differ
+between compilers or compiler versions. A common example of this
+is matching stack frame addresses. In this case regular expressions
+can be used to match the differing bits of output. For example:
+
+```c++
+int ExternInt;
+struct Point { int x, y, z; };
+
+// CHECK-LABEL: test_store_point:
+extern "C" void test_store_point() {
+    Point p{ExternInt, ExternInt, ExternInt};
+    benchmark::DoNotOptimize(p);
+
+    // CHECK: movl ExternInt(%rip), %eax
+    // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+    // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+    // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+    // CHECK: ret
+}
+```
+
+## Current Requirements and Limitations
+
+The tests require Filecheck to be installed along the `PATH` of the
+build machine. Otherwise the tests will be disabled.
+
+Additionally, as mentioned in the previous section, codegen tests are
+inherently non-portable. Currently the tests are limited to:
+
+* x86_64 targets.
+* Compiled with GCC or Clang
+
+Further work could be done, at least on a limited basis, to extend the
+tests to other architectures and compilers (using `CHECK` prefixes).
+
+Furthermore, the tests fail for builds which specify additional flags
+that modify code generation, including `--coverage` or `-fsanitize=`.
+
diff --git a/docs/_config.yml b/docs/_config.yml
new file mode 100644
index 0000000000..fc24e7a62d
--- /dev/null
+++ b/docs/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-hacker
\ No newline at end of file
diff --git a/docs/perf_counters.md b/docs/perf_counters.md
new file mode 100644
index 0000000000..74560e9669
--- /dev/null
+++ b/docs/perf_counters.md
@@ -0,0 +1,34 @@
+<a name="perf-counters" />
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios - narrowing
+down the cause of a regression; or verifying that the underlying cause of a
+performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/) be available at build
+  time
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the boundaries where timer collection is also handled. 
+
+To opt-in:
+
+*  Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+*  Enable the cmake flag BENCHMARK_ENABLE_LIBPFM.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning,
+they are platform specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`) are
+mapped by libpfm to platform-specifics - see libpfm
+[documentation](http://perfmon2.sourceforge.net/docs.html) for more details.
+
+The counter values are reported back through the [User Counters](../README.md#custom-counters)
+mechanism, meaning, they are available in all the formats (e.g. JSON) supported
+by User Counters.
\ No newline at end of file
diff --git a/docs/random_interleaving.md b/docs/random_interleaving.md
new file mode 100644
index 0000000000..c083036841
--- /dev/null
+++ b/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+<a name="interleaving" />
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of a
+microbenchmark with repetitions from other microbenchmarks in the same benchmark
+test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use, you mainly need to set `--benchmark_enable_random_interleaving=true`,
+and optionally specify non-zero repetition count `--benchmark_repetitions=9`
+and optionally decrease the per-repetition time `--benchmark_min_time=0.1`.
diff --git a/releasing.md b/docs/releasing.md
similarity index 64%
rename from releasing.md
rename to docs/releasing.md
index f0cd7010e3..7a6dfc4017 100644
--- a/releasing.md
+++ b/docs/releasing.md
@@ -1,6 +1,6 @@
 # How to release
 
-* Make sure you're on master and synced to HEAD
+* Make sure you're on main and synced to HEAD
 * Ensure the project builds and tests run (sanity check only, obviously)
     * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
       passes
@@ -8,6 +8,12 @@
     * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
       commits between the last annotated tag and HEAD
     * Pick the most interesting.
+* Create one last commit that updates the version saved in `CMakeLists.txt` to the release version you're creating. (This version will be used if benchmark is installed from the archive you'll be creating in the next step.)
+
+```
+project (benchmark VERSION 1.5.3 LANGUAGES CXX)
+```
+
 * Create a release through github's interface
     * Note this will create a lightweight tag.
     * Update this to an annotated tag:
diff --git a/docs/tools.md b/docs/tools.md
index 70500bd322..f2d0c497f3 100644
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -1,84 +1,29 @@
 # Benchmark Tools
 
-## compare_bench.py
-
-The `compare_bench.py` utility which can be used to compare the result of benchmarks.
-The program is invoked like:
-
-``` bash
-$ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]...
-```
-
-Where `<old-benchmark>` and `<new-benchmark>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
-
-`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
-
-The sample output using the JSON test files under `Inputs/` gives:
-
-``` bash
-$ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
-Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
-Benchmark                        Time             CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------------
-BM_SameTimes                  +0.0000         +0.0000            10            10            10            10
-BM_2xFaster                   -0.5000         -0.5000            50            25            50            25
-BM_2xSlower                   +1.0000         +1.0000            50           100            50           100
-BM_1PercentFaster             -0.0100         -0.0100           100            99           100            99
-BM_1PercentSlower             +0.0100         +0.0100           100           101           100           101
-BM_10PercentFaster            -0.1000         -0.1000           100            90           100            90
-BM_10PercentSlower            +0.1000         +0.1000           100           110           100           110
-BM_100xSlower                +99.0000        +99.0000           100         10000           100         10000
-BM_100xFaster                 -0.9900         -0.9900         10000           100         10000           100
-BM_10PercentCPUToTime         +0.1000         -0.1000           100           110           100            90
-BM_ThirdFaster                -0.3333         -0.3334           100            67           100            67
-BM_BadTimeUnit                -0.9000         +0.2000             0             0             0             1
-```
-
-As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+## compare.py
 
-When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout. The sample output using `benchmark/basic_test` for both arguments looks like:
+The `compare.py` can be used to compare the result of benchmarks.
 
+### Dependencies
+The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip:
+```bash
+pip3 install -r requirements.txt
 ```
-./compare_bench.py  test/basic_test test/basic_test  --benchmark_filter=BM_empty.*
-RUNNING: test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmpN7LF3a
-Run on (8 X 4000 MHz CPU s)
-2017-11-07 23:28:36
----------------------------------------------------------------------
-Benchmark                              Time           CPU Iterations
----------------------------------------------------------------------
-BM_empty                               4 ns          4 ns  170178757
-BM_empty/threads:8                     1 ns          7 ns  103868920
-BM_empty_stop_start                    0 ns          0 ns 1000000000
-BM_empty_stop_start/threads:8          0 ns          0 ns 1403031720
-RUNNING: /test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmplvrIp8
-Run on (8 X 4000 MHz CPU s)
-2017-11-07 23:28:38
----------------------------------------------------------------------
-Benchmark                              Time           CPU Iterations
----------------------------------------------------------------------
-BM_empty                               4 ns          4 ns  169534855
-BM_empty/threads:8                     1 ns          7 ns  104188776
-BM_empty_stop_start                    0 ns          0 ns 1000000000
-BM_empty_stop_start/threads:8          0 ns          0 ns 1404159424
-Comparing ../build/test/basic_test to ../build/test/basic_test
-Benchmark                                Time             CPU      Time Old      Time New       CPU Old       CPU New
----------------------------------------------------------------------------------------------------------------------
-BM_empty                              -0.0048         -0.0049             4             4             4             4
-BM_empty/threads:8                    -0.0123         -0.0054             1             1             7             7
-BM_empty_stop_start                   -0.0000         -0.0000             0             0             0             0
-BM_empty_stop_start/threads:8         -0.0029         +0.0001             0             0             0             0
 
-```
+### Displaying aggregates only
 
-As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
-Obviously this example doesn't give any useful output, but it's intended to show the output format when 'compare_bench.py' needs to run benchmarks.
+The switch `-a` / `--display_aggregates_only` can be used to control the
+displayment of the normal iterations vs the aggregates. When passed, it will
+be passthrough to the benchmark binaries to be run, and will be accounted for
+in the tool itself; only the aggregates will be displayed, but not normal runs.
+It only affects the display, the separate runs will still be used to calculate
+the U test.
 
-## compare.py
+### Modes of operation
 
-The `compare.py` can be used to compare the result of benchmarks.
 There are three modes of operation:
 
-1. Just compare two benchmarks, what `compare_bench.py` did.
+1. Just compare two benchmarks
 The program is invoked like:
 
 ``` bash
@@ -240,3 +185,19 @@ Benchmark                               Time             CPU      Time Old
 ```
 This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
 As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+### U test
+
+If there is a sufficient repetition count of the benchmarks, the tool can do
+a [U Test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), of the
+null hypothesis that it is equally likely that a randomly selected value from
+one sample will be less than or greater than a randomly selected value from a
+second sample.
+
+If the calculated p-value is below this value is lower than the significance
+level alpha, then the result is said to be statistically significant and the
+null hypothesis is rejected. Which in other words means that the two benchmarks
+aren't identical.
+
+**WARNING**: requires **LARGE** (no less than 9) number of repetitions to be
+meaningful!
diff --git a/mingw.py b/mingw.py
deleted file mode 100644
index 706ad559db..0000000000
--- a/mingw.py
+++ /dev/null
@@ -1,320 +0,0 @@
-#! /usr/bin/env python
-# encoding: utf-8
-
-import argparse
-import errno
-import logging
-import os
-import platform
-import re
-import sys
-import subprocess
-import tempfile
-
-try:
-    import winreg
-except ImportError:
-    import _winreg as winreg
-try:
-    import urllib.request as request
-except ImportError:
-    import urllib as request
-try:
-    import urllib.parse as parse
-except ImportError:
-    import urlparse as parse
-
-class EmptyLogger(object):
-    '''
-    Provides an implementation that performs no logging
-    '''
-    def debug(self, *k, **kw):
-        pass
-    def info(self, *k, **kw):
-        pass
-    def warn(self, *k, **kw):
-        pass
-    def error(self, *k, **kw):
-        pass
-    def critical(self, *k, **kw):
-        pass
-    def setLevel(self, *k, **kw):
-        pass
-
-urls = (
-    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
-        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
-        'repository.txt',
-    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
-        'repository.txt'
-)
-'''
-A list of mingw-build repositories
-'''
-
-def repository(urls = urls, log = EmptyLogger()):
-    '''
-    Downloads and parse mingw-build repository files and parses them
-    '''
-    log.info('getting mingw-builds repository')
-    versions = {}
-    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
-    re_sub = r'http://downloads.sourceforge.net/project/\1'
-    for url in urls:
-        log.debug(' - requesting: %s', url)
-        socket = request.urlopen(url)
-        repo = socket.read()
-        if not isinstance(repo, str):
-            repo = repo.decode();
-        socket.close()
-        for entry in repo.split('\n')[:-1]:
-            value = entry.split('|')
-            version = tuple([int(n) for n in value[0].strip().split('.')])
-            version = versions.setdefault(version, {})
-            arch = value[1].strip()
-            if arch == 'x32':
-                arch = 'i686'
-            elif arch == 'x64':
-                arch = 'x86_64'
-            arch = version.setdefault(arch, {})
-            threading = arch.setdefault(value[2].strip(), {})
-            exceptions = threading.setdefault(value[3].strip(), {})
-            revision = exceptions.setdefault(int(value[4].strip()[3:]),
-                re_sourceforge.sub(re_sub, value[5].strip()))
-    return versions
-
-def find_in_path(file, path=None):
-    '''
-    Attempts to find an executable in the path
-    '''
-    if platform.system() == 'Windows':
-        file += '.exe'
-    if path is None:
-        path = os.environ.get('PATH', '')
-    if type(path) is type(''):
-        path = path.split(os.pathsep)
-    return list(filter(os.path.exists,
-        map(lambda dir, file=file: os.path.join(dir, file), path)))
-
-def find_7zip(log = EmptyLogger()):
-    '''
-    Attempts to find 7zip for unpacking the mingw-build archives
-    '''
-    log.info('finding 7zip')
-    path = find_in_path('7z')
-    if not path:
-        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
-        path, _ = winreg.QueryValueEx(key, 'Path')
-        path = [os.path.join(path, '7z.exe')]
-    log.debug('found \'%s\'', path[0])
-    return path[0]
-
-find_7zip()
-
-def unpack(archive, location, log = EmptyLogger()):
-    '''
-    Unpacks a mingw-builds archive
-    '''
-    sevenzip = find_7zip(log)
-    log.info('unpacking %s', os.path.basename(archive))
-    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
-    log.debug(' - %r', cmd)
-    with open(os.devnull, 'w') as devnull:
-        subprocess.check_call(cmd, stdout = devnull)
-
-def download(url, location, log = EmptyLogger()):
-    '''
-    Downloads and unpacks a mingw-builds archive
-    '''
-    log.info('downloading MinGW')
-    log.debug(' - url: %s', url)
-    log.debug(' - location: %s', location)
-
-    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
-
-    stream = request.urlopen(url)
-    try:
-        content = stream.getheader('Content-Disposition') or ''
-    except AttributeError:
-        content = stream.headers.getheader('Content-Disposition') or ''
-    matches = re_content.match(content)
-    if matches:
-        filename = matches.group(2)
-    else:
-        parsed = parse.urlparse(stream.geturl())
-        filename = os.path.basename(parsed.path)
-
-    try:
-        os.makedirs(location)
-    except OSError as e:
-        if e.errno == errno.EEXIST and os.path.isdir(location):
-            pass
-        else:
-            raise
-
-    archive = os.path.join(location, filename)
-    with open(archive, 'wb') as out:
-        while True:
-            buf = stream.read(1024)
-            if not buf:
-                break
-            out.write(buf)
-    unpack(archive, location, log = log)
-    os.remove(archive)
-
-    possible = os.path.join(location, 'mingw64')
-    if not os.path.exists(possible):
-        possible = os.path.join(location, 'mingw32')
-        if not os.path.exists(possible):
-            raise ValueError('Failed to find unpacked MinGW: ' + possible)
-    return possible
-
-def root(location = None, arch = None, version = None, threading = None,
-        exceptions = None, revision = None, log = EmptyLogger()):
-    '''
-    Returns the root folder of a specific version of the mingw-builds variant
-    of gcc. Will download the compiler if needed
-    '''
-
-    # Get the repository if we don't have all the information
-    if not (arch and version and threading and exceptions and revision):
-        versions = repository(log = log)
-
-    # Determine some defaults
-    version = version or max(versions.keys())
-    if not arch:
-        arch = platform.machine().lower()
-        if arch == 'x86':
-            arch = 'i686'
-        elif arch == 'amd64':
-            arch = 'x86_64'
-    if not threading:
-        keys = versions[version][arch].keys()
-        if 'posix' in keys:
-            threading = 'posix'
-        elif 'win32' in keys:
-            threading = 'win32'
-        else:
-            threading = keys[0]
-    if not exceptions:
-        keys = versions[version][arch][threading].keys()
-        if 'seh' in keys:
-            exceptions = 'seh'
-        elif 'sjlj' in keys:
-            exceptions = 'sjlj'
-        else:
-            exceptions = keys[0]
-    if revision == None:
-        revision = max(versions[version][arch][threading][exceptions].keys())
-    if not location:
-        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
-
-    # Get the download url
-    url = versions[version][arch][threading][exceptions][revision]
-
-    # Tell the user whatzzup
-    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
-    log.debug(' - arch: %s', arch)
-    log.debug(' - threading: %s', threading)
-    log.debug(' - exceptions: %s', exceptions)
-    log.debug(' - revision: %s', revision)
-    log.debug(' - url: %s', url)
-
-    # Store each specific revision differently
-    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
-    slug = slug.format(
-        version = '.'.join(str(v) for v in version),
-        arch = arch,
-        threading = threading,
-        exceptions = exceptions,
-        revision = revision
-    )
-    if arch == 'x86_64':
-        root_dir = os.path.join(location, slug, 'mingw64')
-    elif arch == 'i686':
-        root_dir = os.path.join(location, slug, 'mingw32')
-    else:
-        raise ValueError('Unknown MinGW arch: ' + arch)
-
-    # Download if needed
-    if not os.path.exists(root_dir):
-        downloaded = download(url, os.path.join(location, slug), log = log)
-        if downloaded != root_dir:
-            raise ValueError('The location of mingw did not match\n%s\n%s'
-                % (downloaded, root_dir))
-
-    return root_dir
-
-def str2ver(string):
-    '''
-    Converts a version string into a tuple
-    '''
-    try:
-        version = tuple(int(v) for v in string.split('.'))
-        if len(version) is not 3:
-            raise ValueError()
-    except ValueError:
-        raise argparse.ArgumentTypeError(
-            'please provide a three digit version string')
-    return version
-
-def main():
-    '''
-    Invoked when the script is run directly by the python interpreter
-    '''
-    parser = argparse.ArgumentParser(
-        description = 'Downloads a specific version of MinGW',
-        formatter_class = argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('--location',
-        help = 'the location to download the compiler to',
-        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
-    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
-        help = 'the target MinGW architecture string')
-    parser.add_argument('--version', type = str2ver,
-        help = 'the version of GCC to download')
-    parser.add_argument('--threading', choices = ['posix', 'win32'],
-        help = 'the threading type of the compiler')
-    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
-        help = 'the method to throw exceptions')
-    parser.add_argument('--revision', type=int,
-        help = 'the revision of the MinGW release')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('-v', '--verbose', action='store_true',
-        help='increase the script output verbosity')
-    group.add_argument('-q', '--quiet', action='store_true',
-        help='only print errors and warning')
-    args = parser.parse_args()
-
-    # Create the logger
-    logger = logging.getLogger('mingw')
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter('%(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.setLevel(logging.INFO)
-    if args.quiet:
-        logger.setLevel(logging.WARN)
-    if args.verbose:
-        logger.setLevel(logging.DEBUG)
-
-    # Get MinGW
-    root_dir = root(location = args.location, arch = args.arch,
-        version = args.version, threading = args.threading,
-        exceptions = args.exceptions, revision = args.revision,
-        log = logger)
-
-    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
-
-if __name__ == '__main__':
-    try:
-        main()
-    except IOError as e:
-        sys.stderr.write('IO error: %s\n' % e)
-        sys.exit(1)
-    except OSError as e:
-        sys.stderr.write('OS error: %s\n' % e)
-        sys.exit(1)
-    except KeyboardInterrupt as e:
-        sys.stderr.write('Killed\n')
-        sys.exit(1)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000..85e8986040
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+numpy == 1.19.4
+scipy == 1.5.4
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000..5cdab10cf7
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,140 @@
+import os
+import posixpath
+import re
+import shutil
+import sys
+
+from distutils import sysconfig
+import setuptools
+from setuptools.command import build_ext
+
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+
+
+IS_WINDOWS = sys.platform.startswith("win")
+
+
+def _get_version():
+    """Parse the version string from __init__.py."""
+    with open(
+        os.path.join(HERE, "bindings", "python", "google_benchmark", "__init__.py")
+    ) as init_file:
+        try:
+            version_line = next(
+                line for line in init_file if line.startswith("__version__")
+            )
+        except StopIteration:
+            raise ValueError("__version__ not defined in __init__.py")
+        else:
+            namespace = {}
+            exec(version_line, namespace)  # pylint: disable=exec-used
+            return namespace["__version__"]
+
+
+def _parse_requirements(path):
+    with open(os.path.join(HERE, path)) as requirements:
+        return [
+            line.rstrip()
+            for line in requirements
+            if not (line.isspace() or line.startswith("#"))
+        ]
+
+
+class BazelExtension(setuptools.Extension):
+    """A C/C++ extension that is defined as a Bazel BUILD target."""
+
+    def __init__(self, name, bazel_target):
+        self.bazel_target = bazel_target
+        self.relpath, self.target_name = posixpath.relpath(bazel_target, "//").split(
+            ":"
+        )
+        setuptools.Extension.__init__(self, name, sources=[])
+
+
+class BuildBazelExtension(build_ext.build_ext):
+    """A command that runs Bazel to build a C/C++ extension."""
+
+    def run(self):
+        for ext in self.extensions:
+            self.bazel_build(ext)
+        build_ext.build_ext.run(self)
+
+    def bazel_build(self, ext):
+        """Runs the bazel build to create the package."""
+        with open("WORKSPACE", "r") as workspace:
+            workspace_contents = workspace.read()
+
+        with open("WORKSPACE", "w") as workspace:
+            workspace.write(
+                re.sub(
+                    r'(?<=path = ").*(?=",  # May be overwritten by setup\.py\.)',
+                    sysconfig.get_python_inc().replace(os.path.sep, posixpath.sep),
+                    workspace_contents,
+                )
+            )
+
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+
+        bazel_argv = [
+            "bazel",
+            "build",
+            ext.bazel_target,
+            "--symlink_prefix=" + os.path.join(self.build_temp, "bazel-"),
+            "--compilation_mode=" + ("dbg" if self.debug else "opt"),
+        ]
+
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            for library_dir in self.library_dirs:
+                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+
+        self.spawn(bazel_argv)
+
+        shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
+        ext_bazel_bin_path = os.path.join(
+            self.build_temp, 'bazel-bin',
+            ext.relpath, ext.target_name + shared_lib_suffix)
+
+        ext_dest_path = self.get_ext_fullpath(ext.name)
+        ext_dest_dir = os.path.dirname(ext_dest_path)
+        if not os.path.exists(ext_dest_dir):
+            os.makedirs(ext_dest_dir)
+        shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+
+
+setuptools.setup(
+    name="google_benchmark",
+    version=_get_version(),
+    url="https://github.com/google/benchmark",
+    description="A library to benchmark code snippets.",
+    author="Google",
+    author_email="benchmark-py@google.com",
+    # Contained modules and scripts.
+    package_dir={"": "bindings/python"},
+    packages=setuptools.find_packages("bindings/python"),
+    install_requires=_parse_requirements("bindings/python/requirements.txt"),
+    cmdclass=dict(build_ext=BuildBazelExtension),
+    ext_modules=[
+        BazelExtension(
+            "google_benchmark._benchmark",
+            "//bindings/python/google_benchmark:_benchmark",
+        )
+    ],
+    zip_safe=False,
+    # PyPI package information.
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Topic :: Software Development :: Testing",
+        "Topic :: System :: Benchmark",
+    ],
+    license="Apache 2.0",
+    keywords="benchmark",
+)
diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc
new file mode 100644
index 0000000000..89da519afc
--- /dev/null
+++ b/src/benchmark_api_internal.cc
@@ -0,0 +1,94 @@
+#include "benchmark_api_internal.h"
+
+#include <cinttypes>
+
+#include "string_util.h"
+
+namespace benchmark {
+namespace internal {
+
+BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                                     int per_family_instance_idx,
+                                     const std::vector<int64_t>& args,
+                                     int thread_count)
+    : benchmark_(*benchmark),
+      family_index_(family_idx),
+      per_family_instance_index_(per_family_instance_idx),
+      aggregation_report_mode_(benchmark_.aggregation_report_mode_),
+      args_(args),
+      time_unit_(benchmark_.time_unit_),
+      measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
+      use_real_time_(benchmark_.use_real_time_),
+      use_manual_time_(benchmark_.use_manual_time_),
+      complexity_(benchmark_.complexity_),
+      complexity_lambda_(benchmark_.complexity_lambda_),
+      statistics_(benchmark_.statistics_),
+      repetitions_(benchmark_.repetitions_),
+      min_time_(benchmark_.min_time_),
+      iterations_(benchmark_.iterations_),
+      threads_(thread_count) {
+  name_.function_name = benchmark_.name_;
+
+  size_t arg_i = 0;
+  for (const auto& arg : args) {
+    if (!name_.args.empty()) {
+      name_.args += '/';
+    }
+
+    if (arg_i < benchmark->arg_names_.size()) {
+      const auto& arg_name = benchmark_.arg_names_[arg_i];
+      if (!arg_name.empty()) {
+        name_.args += StrFormat("%s:", arg_name.c_str());
+      }
+    }
+
+    name_.args += StrFormat("%" PRId64, arg);
+    ++arg_i;
+  }
+
+  if (!IsZero(benchmark->min_time_)) {
+    name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
+  }
+
+  if (benchmark_.iterations_ != 0) {
+    name_.iterations = StrFormat(
+        "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
+  }
+
+  if (benchmark_.repetitions_ != 0) {
+    name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
+  }
+
+  if (benchmark_.measure_process_cpu_time_) {
+    name_.time_type = "process_time";
+  }
+
+  if (benchmark_.use_manual_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "manual_time";
+  } else if (benchmark_.use_real_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "real_time";
+  }
+
+  if (!benchmark_.thread_counts_.empty()) {
+    name_.threads = StrFormat("threads:%d", threads_);
+  }
+}
+
+State BenchmarkInstance::Run(
+    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+    internal::ThreadManager* manager,
+    internal::PerfCountersMeasurement* perf_counters_measurement) const {
+  State st(iters, args_, thread_id, threads_, timer, manager,
+           perf_counters_measurement);
+  benchmark_.Run(st);
+  return st;
+}
+
+}  // namespace internal
+}  // namespace benchmark
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 4770d23aa0..9296b7d2c8 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -1,41 +1,71 @@
 #ifndef BENCHMARK_API_INTERNAL_H
 #define BENCHMARK_API_INTERNAL_H
 
-#include "benchmark/benchmark.h"
-
 #include <cmath>
 #include <iosfwd>
 #include <limits>
+#include <memory>
 #include <string>
 #include <vector>
 
+#include "benchmark/benchmark.h"
+#include "commandlineflags.h"
+
 namespace benchmark {
 namespace internal {
 
 // Information kept per benchmark we may want to run
-struct Benchmark::Instance {
-  std::string name;
-  Benchmark* benchmark;
-  ReportMode report_mode;
-  std::vector<int> arg;
-  json json_arg;
-  TimeUnit time_unit;
-  int range_multiplier;
-  bool use_real_time;
-  bool use_manual_time;
-  BigO complexity;
-  BigOFunc* complexity_lambda;
-  UserCounters counters;
-  const std::vector<Statistics>* statistics;
-  bool last_benchmark_instance;
-  int repetitions;
-  double min_time;
-  size_t iterations;
-  int threads;  // Number of concurrent threads to us
+class BenchmarkInstance {
+ public:
+  BenchmarkInstance(Benchmark* benchmark, int family_index,
+                    int per_family_instance_index,
+                    const std::vector<int64_t>& args, int threads);
+
+  const BenchmarkName& name() const { return name_; }
+  int family_index() const { return family_index_; }
+  int per_family_instance_index() const { return per_family_instance_index_; }
+  AggregationReportMode aggregation_report_mode() const {
+    return aggregation_report_mode_;
+  }
+  TimeUnit time_unit() const { return time_unit_; }
+  bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
+  bool use_real_time() const { return use_real_time_; }
+  bool use_manual_time() const { return use_manual_time_; }
+  BigO complexity() const { return complexity_; }
+  BigOFunc& complexity_lambda() const { return *complexity_lambda_; }
+  const std::vector<Statistics>& statistics() const { return statistics_; }
+  int repetitions() const { return repetitions_; }
+  double min_time() const { return min_time_; }
+  IterationCount iterations() const { return iterations_; }
+  int threads() const { return threads_; }
+
+  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+            internal::ThreadManager* manager,
+            internal::PerfCountersMeasurement* perf_counters_measurement) const;
+
+ private:
+  BenchmarkName name_;
+  Benchmark& benchmark_;
+  const int family_index_;
+  const int per_family_instance_index_;
+  AggregationReportMode aggregation_report_mode_;
+  const std::vector<int64_t>& args_;
+  TimeUnit time_unit_;
+  bool measure_process_cpu_time_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  UserCounters counters_;
+  const std::vector<Statistics>& statistics_;
+  int repetitions_;
+  double min_time_;
+  IterationCount iterations_;
+  int threads_;  // Number of concurrent threads to us
 };
 
 bool FindBenchmarksInternal(const std::string& re,
-                            std::vector<Benchmark::Instance>* benchmarks,
+                            std::vector<BenchmarkInstance>* benchmarks,
                             std::ostream* Err);
 
 bool IsZero(double n);
diff --git a/src/benchmark_main.cc b/src/benchmark_main.cc
new file mode 100644
index 0000000000..b3b2478314
--- /dev/null
+++ b/src/benchmark_main.cc
@@ -0,0 +1,17 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+
+BENCHMARK_MAIN();
diff --git a/src/benchmark_name.cc b/src/benchmark_name.cc
new file mode 100644
index 0000000000..2a17ebce27
--- /dev/null
+++ b/src/benchmark_name.cc
@@ -0,0 +1,58 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <benchmark/benchmark.h>
+
+namespace benchmark {
+
+namespace {
+
+// Compute the total size of a pack of std::strings
+size_t size_impl() { return 0; }
+
+template <typename Head, typename... Tail>
+size_t size_impl(const Head& head, const Tail&... tail) {
+  return head.size() + size_impl(tail...);
+}
+
+// Join a pack of std::strings using a delimiter
+// TODO: use absl::StrJoin
+void join_impl(std::string&, char) {}
+
+template <typename Head, typename... Tail>
+void join_impl(std::string& s, const char delimiter, const Head& head,
+               const Tail&... tail) {
+  if (!s.empty() && !head.empty()) {
+    s += delimiter;
+  }
+
+  s += head;
+
+  join_impl(s, delimiter, tail...);
+}
+
+template <typename... Ts>
+std::string join(char delimiter, const Ts&... ts) {
+  std::string s;
+  s.reserve(sizeof...(Ts) + size_impl(ts...));
+  join_impl(s, delimiter, ts...);
+  return s;
+}
+}  // namespace
+
+std::string BenchmarkName::str() const {
+  return join('/', function_name, args, min_time, iterations, repetitions,
+              time_type, threads);
+}
+}  // namespace benchmark
diff --git a/src/benchmark_register.h b/src/benchmark_register.h
new file mode 100644
index 0000000000..7033dbf622
--- /dev/null
+++ b/src/benchmark_register.h
@@ -0,0 +1,108 @@
+#ifndef BENCHMARK_REGISTER_H
+#define BENCHMARK_REGISTER_H
+
+#include <limits>
+#include <vector>
+
+#include "check.h"
+
+namespace benchmark {
+namespace internal {
+
+// Append the powers of 'mult' in the closed interval [lo, hi].
+// Returns iterator to the start of the inserted range.
+template <typename T>
+typename std::vector<T>::iterator
+AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
+  BM_CHECK_GE(lo, 0);
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
+
+  const size_t start_offset = dst->size();
+
+  static const T kmax = std::numeric_limits<T>::max();
+
+  // Space out the values in multiples of "mult"
+  for (T i = static_cast<T>(1); i <= hi; i *= mult) {
+    if (i >= lo) {
+      dst->push_back(i);
+    }
+    // Break the loop here since multiplying by
+    // 'mult' would move outside of the range of T
+    if (i > kmax / mult) break;
+  }
+
+  return dst->begin() + start_offset;
+}
+
+template <typename T>
+void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
+  // We negate lo and hi so we require that they cannot be equal to 'min'.
+  BM_CHECK_GT(lo, std::numeric_limits<T>::min());
+  BM_CHECK_GT(hi, std::numeric_limits<T>::min());
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_LE(hi, 0);
+
+  // Add positive powers, then negate and reverse.
+  // Casts necessary since small integers get promoted
+  // to 'int' when negating.
+  const auto lo_complement = static_cast<T>(-lo);
+  const auto hi_complement = static_cast<T>(-hi);
+
+  const auto it = AddPowers(dst, hi_complement, lo_complement, mult);
+
+  std::for_each(it, dst->end(), [](T& t) { t *= -1; });
+  std::reverse(it, dst->end());
+}
+
+template <typename T>
+void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
+  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
+                "Args type must be a signed integer");
+
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
+
+  // Add "lo"
+  dst->push_back(lo);
+
+  // Handle lo == hi as a special case, so we then know
+  // lo < hi and so it is safe to add 1 to lo and subtract 1
+  // from hi without falling outside of the range of T.
+  if (lo == hi) return;
+
+  // Ensure that lo_inner <= hi_inner below.
+  if (lo + 1 == hi) {
+    dst->push_back(hi);
+    return;
+  }
+
+  // Add all powers of 'mult' in the range [lo+1, hi-1] (inclusive).
+  const auto lo_inner = static_cast<T>(lo + 1);
+  const auto hi_inner = static_cast<T>(hi - 1);
+
+  // Insert negative values
+  if (lo_inner < 0) {
+    AddNegatedPowers(dst, lo_inner, std::min(hi_inner, T{-1}), mult);
+  }
+
+  // Treat 0 as a special case (see discussion on #762).
+  if (lo < 0 && hi >= 0) {
+    dst->push_back(0);
+  }
+
+  // Insert positive values
+  if (hi_inner > 0) {
+    AddPowers(dst, std::max(lo_inner, T{1}), hi_inner, mult);
+  }
+
+  // Add "hi" (if different from last value).
+  if (hi != dst->back()) {
+    dst->push_back(hi);
+  }
+}
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_REGISTER_H
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
new file mode 100644
index 0000000000..485bbc6e85
--- /dev/null
+++ b/src/benchmark_runner.cc
@@ -0,0 +1,349 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark_runner.h"
+
+#include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#ifndef BENCHMARK_OS_FUCHSIA
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include "check.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "mutex.h"
+#include "perf_counters.h"
+#include "re.h"
+#include "statistics.h"
+#include "string_util.h"
+#include "thread_manager.h"
+#include "thread_timer.h"
+
+namespace benchmark {
+
+namespace internal {
+
+MemoryManager* memory_manager = nullptr;
+
+namespace {
+
+static constexpr IterationCount kMaxIterations = 1000000000;
+
+BenchmarkReporter::Run CreateRunReport(
+    const benchmark::internal::BenchmarkInstance& b,
+    const internal::ThreadManager::Result& results,
+    IterationCount memory_iterations,
+    const MemoryManager::Result& memory_result, double seconds,
+    int64_t repetition_index, int64_t repeats) {
+  // Create report about this benchmark run.
+  BenchmarkReporter::Run report;
+
+  report.run_name = b.name();
+  report.family_index = b.family_index();
+  report.per_family_instance_index = b.per_family_instance_index();
+  report.error_occurred = results.has_error_;
+  report.error_message = results.error_message_;
+  report.report_label = results.report_label_;
+  // This is the total iterations across all threads.
+  report.iterations = results.iterations;
+  report.time_unit = b.time_unit();
+  report.threads = b.threads();
+  report.repetition_index = repetition_index;
+  report.repetitions = repeats;
+
+  if (!report.error_occurred) {
+    if (b.use_manual_time()) {
+      report.real_accumulated_time = results.manual_time_used;
+    } else {
+      report.real_accumulated_time = results.real_time_used;
+    }
+    report.cpu_accumulated_time = results.cpu_time_used;
+    report.complexity_n = results.complexity_n;
+    report.complexity = b.complexity();
+    report.complexity_lambda = b.complexity_lambda();
+    report.statistics = &b.statistics();
+    report.counters = results.counters;
+
+    if (memory_iterations > 0) {
+      report.has_memory_result = true;
+      report.allocs_per_iter =
+          memory_iterations ? static_cast<double>(memory_result.num_allocs) /
+                                  memory_iterations
+                            : 0;
+      report.max_bytes_used = memory_result.max_bytes_used;
+    }
+
+    internal::Finish(&report.counters, results.iterations, seconds,
+                     b.threads());
+  }
+  return report;
+}
+
+// Execute one thread of benchmark b for the specified number of iterations.
+// Adds the stats collected for the thread into manager->results.
+void RunInThread(const BenchmarkInstance* b, IterationCount iters,
+                 int thread_id, ThreadManager* manager,
+                 PerfCountersMeasurement* perf_counters_measurement) {
+  internal::ThreadTimer timer(
+      b->measure_process_cpu_time()
+          ? internal::ThreadTimer::CreateProcessCpuTime()
+          : internal::ThreadTimer::Create());
+  State st =
+      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
+  BM_CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
+      << "Benchmark returned before State::KeepRunning() returned false!";
+  {
+    MutexLock l(manager->GetBenchmarkMutex());
+    internal::ThreadManager::Result& results = manager->results;
+    results.iterations += st.iterations();
+    results.cpu_time_used += timer.cpu_time_used();
+    results.real_time_used += timer.real_time_used();
+    results.manual_time_used += timer.manual_time_used();
+    results.complexity_n += st.complexity_length_n();
+    internal::Increment(&results.counters, st.counters);
+  }
+  manager->NotifyThreadComplete();
+}
+
+}  // end namespace
+
+BenchmarkRunner::BenchmarkRunner(
+    const benchmark::internal::BenchmarkInstance& b_,
+    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
+    : b(b_),
+      reports_for_family(reports_for_family_),
+      min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
+      repeats(b.repetitions() != 0 ? b.repetitions()
+                                   : FLAGS_benchmark_repetitions),
+      has_explicit_iteration_count(b.iterations() != 0),
+      pool(b.threads() - 1),
+      iters(has_explicit_iteration_count ? b.iterations() : 1),
+      perf_counters_measurement(
+          PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
+      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
+                                        ? &perf_counters_measurement
+                                        : nullptr) {
+  run_results.display_report_aggregates_only =
+      (FLAGS_benchmark_report_aggregates_only ||
+       FLAGS_benchmark_display_aggregates_only);
+  run_results.file_report_aggregates_only =
+      FLAGS_benchmark_report_aggregates_only;
+  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
+    run_results.display_report_aggregates_only =
+        (b.aggregation_report_mode() &
+         internal::ARM_DisplayReportAggregatesOnly);
+    run_results.file_report_aggregates_only =
+        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
+    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
+             perf_counters_measurement.IsValid())
+        << "Perf counters were requested but could not be set up.";
+  }
+}
+
+BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
+  BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
+
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(b.threads()));
+
+  // Run all but one thread in separate threads
+  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
+                           manager.get(), perf_counters_measurement_ptr);
+  }
+  // And run one thread here directly.
+  // (If we were asked to run just one thread, we don't create new threads.)
+  // Yes, we need to do this here *after* we start the separate threads.
+  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
+
+  // The main thread has finished. Now let's wait for the other threads.
+  manager->WaitForAllThreads();
+  for (std::thread& thread : pool) thread.join();
+
+  IterationResults i;
+  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+  {
+    MutexLock l(manager->GetBenchmarkMutex());
+    i.results = manager->results;
+  }
+
+  // And get rid of the manager.
+  manager.reset();
+
+  // Adjust real/manual time stats since they were reported per thread.
+  i.results.real_time_used /= b.threads();
+  i.results.manual_time_used /= b.threads();
+  // If we were measuring whole-process CPU usage, adjust the CPU time too.
+  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
+
+  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+             << i.results.real_time_used << "\n";
+
+  // By using KeepRunningBatch a benchmark can iterate more times than
+  // requested, so take the iteration count from i.results.
+  i.iters = i.results.iterations / b.threads();
+
+  // Base decisions off of real time if requested by this benchmark.
+  i.seconds = i.results.cpu_time_used;
+  if (b.use_manual_time()) {
+    i.seconds = i.results.manual_time_used;
+  } else if (b.use_real_time()) {
+    i.seconds = i.results.real_time_used;
+  }
+
+  return i;
+}
+
+IterationCount BenchmarkRunner::PredictNumItersNeeded(
+    const IterationResults& i) const {
+  // See how much iterations should be increased by.
+  // Note: Avoid division by zero with max(seconds, 1ns).
+  double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
+  // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+  // use the multiplier directly.
+  // Otherwise we use at most 10 times expansion.
+  // NOTE: When the last run was at least 10% of the min time the max
+  // expansion should be 14x.
+  bool is_significant = (i.seconds / min_time) > 0.1;
+  multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+  if (multiplier <= 1.0) multiplier = 2.0;
+
+  // So what seems to be the sufficiently-large iteration count? Round up.
+  const IterationCount max_next_iters = static_cast<IterationCount>(
+      std::lround(std::max(multiplier * static_cast<double>(i.iters),
+                           static_cast<double>(i.iters) + 1.0)));
+  // But we do have *some* sanity limits though..
+  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
+
+  BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+  return next_iters;  // round up before conversion to integer.
+}
+
+bool BenchmarkRunner::ShouldReportIterationResults(
+    const IterationResults& i) const {
+  // Determine if this run should be reported;
+  // Either it has run for a sufficient amount of time
+  // or because an error was reported.
+  return i.results.has_error_ ||
+         i.iters >= kMaxIterations ||  // Too many iterations already.
+         i.seconds >= min_time ||      // The elapsed time is large enough.
+         // CPU time is specified but the elapsed real time greatly exceeds
+         // the minimum time.
+         // Note that user provided timers are except from this sanity check.
+         ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
+}
+
+void BenchmarkRunner::DoOneRepetition() {
+  assert(HasRepeatsRemaining() && "Already done all repetitions?");
+
+  const bool is_the_first_repetition = num_repetitions_done == 0;
+  IterationResults i;
+
+  // We *may* be gradually increasing the length (iteration count)
+  // of the benchmark until we decide the results are significant.
+  // And once we do, we report those last results and exit.
+  // Please do note that the if there are repetitions, the iteration count
+  // is *only* calculated for the *first* repetition, and other repetitions
+  // simply use that precomputed iteration count.
+  for (;;) {
+    i = DoNIterations();
+
+    // Do we consider the results to be significant?
+    // If we are doing repetitions, and the first repetition was already done,
+    // it has calculated the correct iteration time, so we have run that very
+    // iteration count just now. No need to calculate anything. Just report.
+    // Else, the normal rules apply.
+    const bool results_are_significant = !is_the_first_repetition ||
+                                         has_explicit_iteration_count ||
+                                         ShouldReportIterationResults(i);
+
+    if (results_are_significant) break;  // Good, let's report them!
+
+    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+    // iteration count, and run the benchmark again...
+
+    iters = PredictNumItersNeeded(i);
+    assert(iters > i.iters &&
+           "if we did more iterations than we want to do the next time, "
+           "then we should have accepted the current iteration run.");
+  }
+
+  // Oh, one last thing, we need to also produce the 'memory measurements'..
+  MemoryManager::Result memory_result;
+  IterationCount memory_iterations = 0;
+  if (memory_manager != nullptr) {
+    // Only run a few iterations to reduce the impact of one-time
+    // allocations in benchmarks that are not properly managed.
+    memory_iterations = std::min<IterationCount>(16, iters);
+    memory_manager->Start();
+    std::unique_ptr<internal::ThreadManager> manager;
+    manager.reset(new internal::ThreadManager(1));
+    RunInThread(&b, memory_iterations, 0, manager.get(),
+                perf_counters_measurement_ptr);
+    manager->WaitForAllThreads();
+    manager.reset();
+
+    memory_manager->Stop(&memory_result);
+  }
+
+  // Ok, now actualy report.
+  BenchmarkReporter::Run report =
+      CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
+                      num_repetitions_done, repeats);
+
+  if (reports_for_family) {
+    ++reports_for_family->num_runs_done;
+    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
+  }
+
+  run_results.non_aggregates.push_back(report);
+
+  ++num_repetitions_done;
+}
+
+RunResults&& BenchmarkRunner::GetResults() {
+  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
+
+  // Calculate additional statistics over the repetitions of this instance.
+  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+
+  return std::move(run_results);
+}
+
+}  // end namespace internal
+
+}  // end namespace benchmark
diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h
new file mode 100644
index 0000000000..8427ce6a26
--- /dev/null
+++ b/src/benchmark_runner.h
@@ -0,0 +1,102 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_RUNNER_H_
+#define BENCHMARK_RUNNER_H_
+
+#include <thread>
+#include <vector>
+
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+#include "perf_counters.h"
+#include "thread_manager.h"
+
+namespace benchmark {
+
+BM_DECLARE_double(benchmark_min_time);
+BM_DECLARE_int32(benchmark_repetitions);
+BM_DECLARE_bool(benchmark_report_aggregates_only);
+BM_DECLARE_bool(benchmark_display_aggregates_only);
+BM_DECLARE_string(benchmark_perf_counters);
+
+namespace internal {
+
+extern MemoryManager* memory_manager;
+
+struct RunResults {
+  std::vector<BenchmarkReporter::Run> non_aggregates;
+  std::vector<BenchmarkReporter::Run> aggregates_only;
+
+  bool display_report_aggregates_only = false;
+  bool file_report_aggregates_only = false;
+};
+
+class BenchmarkRunner {
+ public:
+  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);
+
+  int GetNumRepeats() const { return repeats; }
+
+  bool HasRepeatsRemaining() const {
+    return GetNumRepeats() != num_repetitions_done;
+  }
+
+  void DoOneRepetition();
+
+  RunResults&& GetResults();
+
+  BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
+    return reports_for_family;
+  }
+
+ private:
+  RunResults run_results;
+
+  const benchmark::internal::BenchmarkInstance& b;
+  BenchmarkReporter::PerFamilyRunReports* reports_for_family;
+
+  const double min_time;
+  const int repeats;
+  const bool has_explicit_iteration_count;
+
+  int num_repetitions_done = 0;
+
+  std::vector<std::thread> pool;
+
+  IterationCount iters;  // preserved between repetitions!
+  // So only the first repetition has to find/calculate it,
+  // the other repetitions will just use that precomputed iteration count.
+
+  PerfCountersMeasurement perf_counters_measurement;
+  PerfCountersMeasurement* const perf_counters_measurement_ptr;
+
+  struct IterationResults {
+    internal::ThreadManager::Result results;
+    IterationCount iters;
+    double seconds;
+  };
+  IterationResults DoNIterations();
+
+  IterationCount PredictNumItersNeeded(const IterationResults& i) const;
+
+  bool ShouldReportIterationResults(const IterationResults& i) const;
+};
+
+}  // namespace internal
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_RUNNER_H_
diff --git a/src/check.h b/src/check.h
index 73bead2fb5..0efd13ff4d 100644
--- a/src/check.h
+++ b/src/check.h
@@ -1,9 +1,9 @@
 #ifndef CHECK_H_
 #define CHECK_H_
 
+#include <cmath>
 #include <cstdlib>
 #include <ostream>
-#include <cmath>
 
 #include "internal_macros.h"
 #include "log.h"
@@ -23,8 +23,9 @@ BENCHMARK_NORETURN inline void CallAbortHandler() {
   std::abort();  // fallback to enforce noreturn
 }
 
-// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
-// will log information about the failures and abort when it is destructed.
+// CheckHandler is the class constructed by failing BM_CHECK macros.
+// CheckHandler will log information about the failures and abort when it is
+// destructed.
 class CheckHandler {
  public:
   CheckHandler(const char* check, const char* file, const char* func, int line)
@@ -51,29 +52,32 @@ class CheckHandler {
 }  // end namespace internal
 }  // end namespace benchmark
 
-// The CHECK macro returns a std::ostream object that can have extra information
-// written to it.
+// The BM_CHECK macro returns a std::ostream object that can have extra
+// information written to it.
 #ifndef NDEBUG
-#define CHECK(b)                                                             \
+#define BM_CHECK(b)                                                          \
   (b ? ::benchmark::internal::GetNullLogInstance()                           \
      : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
            .GetLog())
 #else
-#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
 #endif
 
-#define CHECK_EQ(a, b) CHECK((a) == (b))
-#define CHECK_NE(a, b) CHECK((a) != (b))
-#define CHECK_GE(a, b) CHECK((a) >= (b))
-#define CHECK_LE(a, b) CHECK((a) <= (b))
-#define CHECK_GT(a, b) CHECK((a) > (b))
-#define CHECK_LT(a, b) CHECK((a) < (b))
-
-#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
-#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
-#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
-#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
-#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
-#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
+// clang-format off
+// preserve whitespacing between operators for alignment
+#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
+#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
+#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
+#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
+#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
+#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))
+
+#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) <  (eps))
+#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
+#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
+#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
+#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) >  (eps))
+#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) >  (eps))
+//clang-format on
 
 #endif  // CHECK_H_
diff --git a/src/colorprint.cc b/src/colorprint.cc
index fff6a98818..afaa55dd54 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -94,7 +94,7 @@ std::string FormatString(const char* msg, va_list args) {
   va_end(args_cp);
 
   // currently there is no error handling for failure, so this is hack.
-  CHECK(ret >= 0);
+  BM_CHECK(ret >= 0);
 
   if (ret == 0)  // handle empty expansion
     return {};
@@ -105,7 +105,7 @@ std::string FormatString(const char* msg, va_list args) {
     size = (size_t)ret + 1;  // + 1 for the null byte
     std::unique_ptr<char[]> buff(new char[size]);
     ret = vsnprintf(buff.get(), size, msg, args);
-    CHECK(ret > 0 && ((size_t)ret) < size);
+    BM_CHECK(ret > 0 && ((size_t)ret) < size);
     return buff.get();
   }
 }
diff --git a/src/commandlineflags.cc b/src/commandlineflags.cc
index 2fc92517a3..5724aaa294 100644
--- a/src/commandlineflags.cc
+++ b/src/commandlineflags.cc
@@ -14,13 +14,20 @@
 
 #include "commandlineflags.h"
 
+#include <algorithm>
 #include <cctype>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <limits>
+#include <map>
+#include <utility>
+
+#include "../src/string_util.h"
 
 namespace benchmark {
+namespace {
+
 // Parses 'str' for a 32-bit signed integer.  If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
@@ -45,7 +52,7 @@ bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) {
       // LONG_MAX or LONG_MIN when the input overflows.)
       result != long_value
       // The parsed value overflows as an Int32.
-      ) {
+  ) {
     std::cerr << src_text << " is expected to be a 32-bit integer, "
               << "but actually has value \"" << str << "\", "
               << "which overflows.\n";
@@ -75,6 +82,30 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
   return true;
 }
 
+// Parses 'str' into KV pairs. If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+bool ParseKvPairs(const std::string& src_text, const char* str,
+                  std::map<std::string, std::string>* value) {
+  std::map<std::string, std::string> kvs;
+  for (const auto& kvpair : StrSplit(str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) {
+      std::cerr << src_text << " is expected to be a comma-separated list of "
+                << "<key>=<value> strings, but actually has value \"" << str
+                << "\".\n";
+      return false;
+    }
+    if (!kvs.emplace(kv[0], kv[1]).second) {
+      std::cerr << src_text << " is expected to contain unique keys but key \""
+                << kv[0] << "\" was repeated.\n";
+      return false;
+    }
+  }
+
+  *value = kvs;
+  return true;
+}
+
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
@@ -85,47 +116,59 @@ static std::string FlagToEnvVar(const char* flag) {
   for (size_t i = 0; i != flag_str.length(); ++i)
     env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
 
-  return "BENCHMARK_" + env_var;
+  return env_var;
 }
 
-// Reads and returns the Boolean environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-//
-// The value is considered true iff it's not "0".
-bool BoolFromEnv(const char* flag, bool default_value) {
+}  // namespace
+
+bool BoolFromEnv(const char* flag, bool default_val) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = getenv(env_var.c_str());
-  return string_value == nullptr ? default_value
-                                 : strcmp(string_value, "0") != 0;
+  const char* const value_str = getenv(env_var.c_str());
+  return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
 }
 
-// Reads and returns a 32-bit integer stored in the environment
-// variable corresponding to the given flag; if it isn't set or
-// doesn't represent a valid 32-bit integer, returns default_value.
-int32_t Int32FromEnv(const char* flag, int32_t default_value) {
+int32_t Int32FromEnv(const char* flag, int32_t default_val) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = getenv(env_var.c_str());
-  if (string_value == nullptr) {
-    // The environment variable is not set.
-    return default_value;
+  const char* const value_str = getenv(env_var.c_str());
+  int32_t value = default_val;
+  if (value_str == nullptr ||
+      !ParseInt32(std::string("Environment variable ") + env_var, value_str,
+                  &value)) {
+    return default_val;
   }
+  return value;
+}
 
-  int32_t result = default_value;
-  if (!ParseInt32(std::string("Environment variable ") + env_var, string_value,
-                  &result)) {
-    std::cout << "The default value " << default_value << " is used.\n";
-    return default_value;
+double DoubleFromEnv(const char* flag, double default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+  double value = default_val;
+  if (value_str == nullptr ||
+      !ParseDouble(std::string("Environment variable ") + env_var, value_str,
+                   &value)) {
+    return default_val;
   }
-
-  return result;
+  return value;
 }
 
-// Reads and returns the string environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-const char* StringFromEnv(const char* flag, const char* default_value) {
+const char* StringFromEnv(const char* flag, const char* default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value = getenv(env_var.c_str());
-  return value == nullptr ? default_value : value;
+  return value == nullptr ? default_val : value;
+}
+
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+
+  if (value_str == nullptr) return default_val;
+
+  std::map<std::string, std::string> value;
+  if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
+    return default_val;
+  }
+  return value;
 }
 
 // Parses a string as a command line flag.  The string should have
@@ -205,14 +248,39 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
   return true;
 }
 
+bool ParseKeyValueFlag(
+    const char* str, const char* flag,
+    std::map<std::string, std::string>* value) {
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  if (value_str == nullptr) return false;
+
+  for (const auto& kvpair : StrSplit(value_str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) return false;
+    value->emplace(kv[0], kv[1]);
+  }
+
+  return true;
+}
+
 bool IsFlag(const char* str, const char* flag) {
   return (ParseFlagValue(str, flag, true) != nullptr);
 }
 
 bool IsTruthyFlagValue(const std::string& value) {
-  if (value.empty()) return true;
-  char ch = value[0];
-  return isalnum(ch) &&
-         !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N');
+  if (value.size() == 1) {
+    char v = value[0];
+    return isalnum(v) &&
+           !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
+  } else if (!value.empty()) {
+    std::string value_lower(value);
+    std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
+                   [](char c) { return static_cast<char>(::tolower(c)); });
+    return !(value_lower == "false" || value_lower == "no" ||
+             value_lower == "off");
+  } else
+    return true;
 }
+
 }  // end namespace benchmark
diff --git a/src/commandlineflags.h b/src/commandlineflags.h
index 945c9a9fc4..5baaf11784 100644
--- a/src/commandlineflags.h
+++ b/src/commandlineflags.h
@@ -2,39 +2,70 @@
 #define BENCHMARK_COMMANDLINEFLAGS_H_
 
 #include <cstdint>
+#include <map>
 #include <string>
 
 // Macro for referencing flags.
 #define FLAG(name) FLAGS_##name
 
 // Macros for declaring flags.
-#define DECLARE_bool(name) extern bool FLAG(name)
-#define DECLARE_int32(name) extern int32_t FLAG(name)
-#define DECLARE_int64(name) extern int64_t FLAG(name)
-#define DECLARE_double(name) extern double FLAG(name)
-#define DECLARE_string(name) extern std::string FLAG(name)
+#define BM_DECLARE_bool(name) extern bool FLAG(name)
+#define BM_DECLARE_int32(name) extern int32_t FLAG(name)
+#define BM_DECLARE_double(name) extern double FLAG(name)
+#define BM_DECLARE_string(name) extern std::string FLAG(name)
+#define BM_DECLARE_kvpairs(name) \
+  extern std::map<std::string, std::string> FLAG(name)
 
 // Macros for defining flags.
-#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
-#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
-#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
-#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
-#define DEFINE_string(name, default_val, doc) \
-  std::string FLAG(name) = (default_val)
+#define BM_DEFINE_bool(name, default_val) \
+  bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
+#define BM_DEFINE_int32(name, default_val) \
+  int32_t FLAG(name) = benchmark::Int32FromEnv(#name, default_val)
+#define BM_DEFINE_double(name, default_val) \
+  double FLAG(name) = benchmark::DoubleFromEnv(#name, default_val)
+#define BM_DEFINE_string(name, default_val) \
+  std::string FLAG(name) = benchmark::StringFromEnv(#name, default_val)
+#define BM_DEFINE_kvpairs(name, default_val)      \
+  std::map<std::string, std::string> FLAG(name) = \
+      benchmark::KvPairsFromEnv(#name, default_val)
 
 namespace benchmark {
-// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
-// to *value and returns true; otherwise leaves *value unchanged and returns
-// false.
-bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);
 
-// Parses a bool/Int32/string from the environment variable
-// corresponding to the given Google Test flag.
+// Parses a bool from the environment variable corresponding to the given flag.
+//
+// If the variable exists, returns IsTruthyFlagValue() value;  if not,
+// returns the given default value.
 bool BoolFromEnv(const char* flag, bool default_val);
+
+// Parses an Int32 from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns ParseInt32() value;  if not, returns
+// the given default value.
 int32_t Int32FromEnv(const char* flag, int32_t default_val);
+
+// Parses an Double from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns ParseDouble();  if not, returns
+// the given default value.
 double DoubleFromEnv(const char* flag, double default_val);
+
+// Parses a string from the environment variable corresponding to the given
+// flag.
+//
+// If variable exists, returns its value;  if not, returns
+// the given default value.
 const char* StringFromEnv(const char* flag, const char* default_val);
 
+// Parses a set of kvpairs from the environment variable corresponding to the
+// given flag.
+//
+// If variable exists, returns its value;  if not, returns
+// the given default value.
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val);
+
 // Parses a string for a bool flag, in the form of either
 // "--flag=value" or "--flag".
 //
@@ -46,34 +77,40 @@ const char* StringFromEnv(const char* flag, const char* default_val);
 // true.  On failure, returns false without changing *value.
 bool ParseBoolFlag(const char* str, const char* flag, bool* value);
 
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
+// Parses a string for an Int32 flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
 
-// Parses a string for a Double flag, in the form of
-// "--flag=value".
+// Parses a string for a Double flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 bool ParseDoubleFlag(const char* str, const char* flag, double* value);
 
-// Parses a string for a string flag, in the form of
-// "--flag=value".
+// Parses a string for a string flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 bool ParseStringFlag(const char* str, const char* flag, std::string* value);
 
+// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
+//
+// On success, stores the value of the flag in *value and returns true. On
+// failure returns false, though *value may have been mutated.
+bool ParseKeyValueFlag(const char* str, const char* flag,
+                       std::map<std::string, std::string>* value);
+
 // Returns true if the string matches the flag.
 bool IsFlag(const char* str, const char* flag);
 
 // Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
-// some non-alphanumeric character. As a special case, also returns true if
-// value is the empty string.
+// some non-alphanumeric character. Also returns false if the value matches
+// one of 'no', 'false', 'off' (case-insensitive). As a special case, also
+// returns true if value is the empty string.
 bool IsTruthyFlagValue(const std::string& value);
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_COMMANDLINEFLAGS_H_
diff --git a/src/complexity.cc b/src/complexity.cc
index 88832698ef..7757315dc3 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -26,20 +26,26 @@ namespace benchmark {
 
 // Internal function to calculate the different scalability forms
 BigOFunc* FittingCurve(BigO complexity) {
+  static const double kLog2E = 1.44269504088896340736;
   switch (complexity) {
     case oN:
-      return [](int n) -> double { return n; };
+      return [](IterationCount n) -> double { return static_cast<double>(n); };
     case oNSquared:
-      return [](int n) -> double { return std::pow(n, 2); };
+      return [](IterationCount n) -> double { return std::pow(n, 2); };
     case oNCubed:
-      return [](int n) -> double { return std::pow(n, 3); };
+      return [](IterationCount n) -> double { return std::pow(n, 3); };
     case oLogN:
-      return [](int n) { return log2(n); };
+      /* Note: can't use log2 because Android's GNU STL lacks it */
+      return
+          [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
     case oNLogN:
-      return [](int n) { return n * log2(n); };
+      /* Note: can't use log2 because Android's GNU STL lacks it */
+      return [](IterationCount n) {
+        return kLog2E * n * log(static_cast<double>(n));
+      };
     case o1:
     default:
-      return [](int) { return 1.0; };
+      return [](IterationCount) { return 1.0; };
   }
 }
 
@@ -65,18 +71,17 @@ std::string GetBigOString(BigO complexity) {
 
 // Find the coefficient for the high-order term in the running time, by
 // minimizing the sum of squares of relative error, for the fitting curve
-// given by the lambda expresion.
+// given by the lambda expression.
 //   - n             : Vector containing the size of the benchmark tests.
 //   - time          : Vector containing the times for the benchmark tests.
-//   - fitting_curve : lambda expresion (e.g. [](int n) {return n; };).
+//   - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
 
-// For a deeper explanation on the algorithm logic, look the README file at
-// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit
+// For a deeper explanation on the algorithm logic, please refer to
+// https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics
 
-LeastSq MinimalLeastSq(const std::vector<int>& n,
+LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
                        const std::vector<double>& time,
                        BigOFunc* fitting_curve) {
-  double sigma_gn = 0.0;
   double sigma_gn_squared = 0.0;
   double sigma_time = 0.0;
   double sigma_time_gn = 0.0;
@@ -84,7 +89,6 @@ LeastSq MinimalLeastSq(const std::vector<int>& n,
   // Calculate least square fitting parameter
   for (size_t i = 0; i < n.size(); ++i) {
     double gn_i = fitting_curve(n[i]);
-    sigma_gn += gn_i;
     sigma_gn_squared += gn_i * gn_i;
     sigma_time += time[i];
     sigma_time_gn += time[i] * gn_i;
@@ -117,12 +121,12 @@ LeastSq MinimalLeastSq(const std::vector<int>& n,
 //   - complexity : If different than oAuto, the fitting curve will stick to
 //                  this one. If it is oAuto, it will be calculated the best
 //                  fitting curve.
-LeastSq MinimalLeastSq(const std::vector<int>& n,
+LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
                        const std::vector<double>& time, const BigO complexity) {
-  CHECK_EQ(n.size(), time.size());
-  CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
-                          // benchmark runs are given
-  CHECK_NE(complexity, oNone);
+  BM_CHECK_EQ(n.size(), time.size());
+  BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
+                             // benchmark runs are given
+  BM_CHECK_NE(complexity, oNone);
 
   LeastSq best_fit;
 
@@ -157,13 +161,14 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
   if (reports.size() < 2) return results;
 
   // Accumulators.
-  std::vector<int> n;
+  std::vector<int64_t> n;
   std::vector<double> real_time;
   std::vector<double> cpu_time;
 
   // Populate the accumulators.
   for (const Run& run : reports) {
-    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+    BM_CHECK_GT(run.complexity_n, 0)
+        << "Did you forget to call SetComplexityN?";
     n.push_back(run.complexity_n);
     real_time.push_back(run.real_accumulated_time / run.iterations);
     cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
@@ -179,12 +184,22 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
     result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
     result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
   }
-  std::string benchmark_name =
-      reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/'));
+
+  // Drop the 'args' when reporting complexity.
+  auto run_name = reports[0].run_name;
+  run_name.args.clear();
 
   // Get the data from the accumulator to BenchmarkReporter::Run's.
   Run big_o;
-  big_o.benchmark_name = benchmark_name + "_BigO";
+  big_o.run_name = run_name;
+  big_o.family_index = reports[0].family_index;
+  big_o.per_family_instance_index = reports[0].per_family_instance_index;
+  big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  big_o.repetitions = reports[0].repetitions;
+  big_o.repetition_index = Run::no_repetition_index;
+  big_o.threads = reports[0].threads;
+  big_o.aggregate_name = "BigO";
+  big_o.report_label = reports[0].report_label;
   big_o.iterations = 0;
   big_o.real_accumulated_time = result_real.coef;
   big_o.cpu_accumulated_time = result_cpu.coef;
@@ -200,10 +215,16 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
 
   // Only add label to mean/stddev if it is same for all runs
   Run rms;
-  big_o.report_label = reports[0].report_label;
-  rms.benchmark_name = benchmark_name + "_RMS";
+  rms.run_name = run_name;
+  rms.family_index = reports[0].family_index;
+  rms.per_family_instance_index = reports[0].per_family_instance_index;
+  rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  rms.aggregate_name = "RMS";
   rms.report_label = big_o.report_label;
   rms.iterations = 0;
+  rms.repetition_index = Run::no_repetition_index;
+  rms.repetitions = reports[0].repetitions;
+  rms.threads = reports[0].threads;
   rms.real_accumulated_time = result_real.rms / multiplier;
   rms.cpu_accumulated_time = result_cpu.rms / multiplier;
   rms.report_rms = true;
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 48920ca782..6fd764525e 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -12,21 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-#include "counter.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
+#include <cstring>
 #include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
 #include "colorprint.h"
 #include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
 #include "internal_macros.h"
 #include "string_util.h"
 #include "timers.h"
@@ -53,7 +53,7 @@ bool ConsoleReporter::ReportContext(const Context& context) {
 }
 
 void ConsoleReporter::PrintHeader(const Run& run) {
-  std::string str = FormatString("%-*s %13s %13s %10s", static_cast<int>(name_field_width_),
+  std::string str = FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
                                  "Benchmark", "Time", "CPU", "Iterations");
   if(!run.counters.empty()) {
     if(output_options_ & OO_Tabular) {
@@ -64,9 +64,8 @@ void ConsoleReporter::PrintHeader(const Run& run) {
       str += " UserCounters...";
     }
   }
-  str += "\n";
   std::string line = std::string(str.length(), '-');
-  GetOutputStream() << line << "\n" << str << line << "\n";
+  GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
 }
 
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
@@ -98,6 +97,21 @@ static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
   va_end(args);
 }
 
+
+static std::string FormatTime(double time) {
+  // Align decimal places...
+  if (time < 1.0) {
+    return FormatString("%10.3f", time);
+  }
+  if (time < 10.0) {
+    return FormatString("%10.2f", time);
+  }
+  if (time < 100.0) {
+    return FormatString("%10.1f", time);
+  }
+  return FormatString("%10.0f", time);
+}
+
 void ConsoleReporter::PrintRunData(const Run& result) {
   typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
@@ -106,7 +120,7 @@ void ConsoleReporter::PrintRunData(const Run& result) {
   auto name_color =
       (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
   printer(Out, name_color, "%-*s ", name_field_width_,
-          result.benchmark_name.c_str());
+          result.benchmark_name().c_str());
 
   if (result.error_occurred) {
     printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
@@ -114,33 +128,24 @@ void ConsoleReporter::PrintRunData(const Run& result) {
     printer(Out, COLOR_DEFAULT, "\n");
     return;
   }
-  // Format bytes per second
-  std::string rate;
-  if (result.bytes_per_second > 0) {
-    rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
-  }
-
-  // Format items per second
-  std::string items;
-  if (result.items_per_second > 0) {
-    items =
-        StrCat(" ", HumanReadableNumber(result.items_per_second), " items/s");
-  }
 
   const double real_time = result.GetAdjustedRealTime();
   const double cpu_time = result.GetAdjustedCPUTime();
+  const std::string real_time_str = FormatTime(real_time);
+  const std::string cpu_time_str = FormatTime(cpu_time);
+
 
   if (result.report_big_o) {
     std::string big_o = GetBigOString(result.complexity);
-    printer(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time, big_o.c_str(),
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
             cpu_time, big_o.c_str());
   } else if (result.report_rms) {
-    printer(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100,
-            cpu_time * 100);
+    printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
+            cpu_time * 100, "%");
   } else {
     const char* timeLabel = GetTimeUnitString(result.time_unit);
-    printer(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel,
-            cpu_time, timeLabel);
+    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
+            cpu_time_str.c_str(), timeLabel);
   }
 
   if (!result.report_big_o && !result.report_rms) {
@@ -150,28 +155,18 @@ void ConsoleReporter::PrintRunData(const Run& result) {
   for (auto& c : result.counters) {
     const std::size_t cNameLen = std::max(std::string::size_type(10),
                                           c.first.length());
-    auto const& s = HumanReadableNumber(c.second.value, 1000);
+    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
+    const char* unit = "";
+    if (c.second.flags & Counter::kIsRate)
+      unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
     if (output_options_ & OO_Tabular) {
-      if (c.second.flags & Counter::kIsRate) {
-        printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str());
-      } else {
-        printer(Out, COLOR_DEFAULT, " %*s", cNameLen, s.c_str());
-      }
-    } else {
-      const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : "";
-      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(),
+      printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
               unit);
+    } else {
+      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), unit);
     }
   }
 
-  if (!rate.empty()) {
-    printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
-  }
-
-  if (!items.empty()) {
-    printer(Out, COLOR_DEFAULT, " %*s", 18, items.c_str());
-  }
-
   if (!result.report_label.empty()) {
     printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
   }
diff --git a/src/counter.cc b/src/counter.cc
index ed1aa044ee..cf5b78ee3a 100644
--- a/src/counter.cc
+++ b/src/counter.cc
@@ -17,7 +17,8 @@
 namespace benchmark {
 namespace internal {
 
-double Finish(Counter const& c, double cpu_time, double num_threads) {
+double Finish(Counter const& c, IterationCount iterations, double cpu_time,
+              double num_threads) {
   double v = c.value;
   if (c.flags & Counter::kIsRate) {
     v /= cpu_time;
@@ -25,25 +26,36 @@ double Finish(Counter const& c, double cpu_time, double num_threads) {
   if (c.flags & Counter::kAvgThreads) {
     v /= num_threads;
   }
+  if (c.flags & Counter::kIsIterationInvariant) {
+    v *= iterations;
+  }
+  if (c.flags & Counter::kAvgIterations) {
+    v /= iterations;
+  }
+
+  if (c.flags & Counter::kInvert) {  // Invert is *always* last.
+    v = 1.0 / v;
+  }
   return v;
 }
 
-void Finish(UserCounters *l, double cpu_time, double num_threads) {
-  for (auto &c : *l) {
-    c.second.value = Finish(c.second, cpu_time, num_threads);
+void Finish(UserCounters* l, IterationCount iterations, double cpu_time,
+            double num_threads) {
+  for (auto& c : *l) {
+    c.second.value = Finish(c.second, iterations, cpu_time, num_threads);
   }
 }
 
-void Increment(UserCounters *l, UserCounters const& r) {
+void Increment(UserCounters* l, UserCounters const& r) {
   // add counters present in both or just in *l
-  for (auto &c : *l) {
+  for (auto& c : *l) {
     auto it = r.find(c.first);
     if (it != r.end()) {
       c.second.value = c.second + it->second;
     }
   }
   // add counters present in r, but not in *l
-  for (auto const &tc : r) {
+  for (auto const& tc : r) {
     auto it = l->find(tc.first);
     if (it == l->end()) {
       (*l)[tc.first] = tc.second;
@@ -64,5 +76,5 @@ bool SameNames(UserCounters const& l, UserCounters const& r) {
   return true;
 }
 
-} // end namespace internal
-} // end namespace benchmark
+}  // end namespace internal
+}  // end namespace benchmark
diff --git a/src/counter.h b/src/counter.h
index dd6865a31d..1f5a58e31f 100644
--- a/src/counter.h
+++ b/src/counter.h
@@ -12,15 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef BENCHMARK_COUNTER_H_
+#define BENCHMARK_COUNTER_H_
+
 #include "benchmark/benchmark.h"
 
 namespace benchmark {
 
 // these counter-related functions are hidden to reduce API surface.
 namespace internal {
-void Finish(UserCounters *l, double time, double num_threads);
-void Increment(UserCounters *l, UserCounters const& r);
+void Finish(UserCounters* l, IterationCount iterations, double time,
+            double num_threads);
+void Increment(UserCounters* l, UserCounters const& r);
 bool SameNames(UserCounters const& l, UserCounters const& r);
-} // end namespace internal
+}  // end namespace internal
+
+}  // end namespace benchmark
 
-} //end namespace benchmark
+#endif  // BENCHMARK_COUNTER_H_
diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc
index 35510645b0..9bd7121daf 100644
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -22,9 +22,9 @@
 #include <tuple>
 #include <vector>
 
+#include "check.h"
 #include "string_util.h"
 #include "timers.h"
-#include "check.h"
 
 // File format reference: http://edoceo.com/utilitas/csv-file-format.
 
@@ -37,18 +37,32 @@ std::vector<std::string> elements = {
     "error_occurred", "error_message"};
 }  // namespace
 
+std::string CsvEscape(const std::string & s) {
+  std::string tmp;
+  tmp.reserve(s.size() + 2);
+  for (char c : s) {
+    switch (c) {
+    case '"' : tmp += "\"\""; break;
+    default  : tmp += c; break;
+    }
+  }
+  return '"' + tmp + '"';
+}
+
 bool CSVReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
   return true;
 }
 
-void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
+void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
   std::ostream& Out = GetOutputStream();
 
   if (!printed_header_) {
     // save the names of all the user counters
     for (const auto& run : reports) {
       for (const auto& cnt : run.counters) {
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
         user_counter_names_.insert(cnt.first);
       }
     }
@@ -58,7 +72,8 @@ void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
       Out << *B++;
       if (B != elements.end()) Out << ",";
     }
-    for (auto B = user_counter_names_.begin(); B != user_counter_names_.end();) {
+    for (auto B = user_counter_names_.begin();
+         B != user_counter_names_.end();) {
       Out << ",\"" << *B++ << "\"";
     }
     Out << "\n";
@@ -68,10 +83,13 @@ void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
     // check that all the current counters are saved in the name set
     for (const auto& run : reports) {
       for (const auto& cnt : run.counters) {
-        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
-              << "All counters must be present in each run. "
-              << "Counter named \"" << cnt.first
-              << "\" was not in a run after being added to the header";
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
+        BM_CHECK(user_counter_names_.find(cnt.first) !=
+                 user_counter_names_.end())
+            << "All counters must be present in each run. "
+            << "Counter named \"" << cnt.first
+            << "\" was not in a run after being added to the header";
       }
     }
   }
@@ -80,23 +98,15 @@ void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
   for (const auto& run : reports) {
     PrintRunData(run);
   }
-
 }
 
-void CSVReporter::PrintRunData(const Run & run) {
+void CSVReporter::PrintRunData(const Run& run) {
   std::ostream& Out = GetOutputStream();
-
-  // Field with embedded double-quote characters must be doubled and the field
-  // delimited with double-quotes.
-  std::string name = run.benchmark_name;
-  ReplaceAll(&name, "\"", "\"\"");
-  Out << '"' << name << "\",";
+  Out << CsvEscape(run.benchmark_name()) << ",";
   if (run.error_occurred) {
     Out << std::string(elements.size() - 3, ',');
     Out << "true,";
-    std::string msg = run.error_message;
-    ReplaceAll(&msg, "\"", "\"\"");
-    Out << '"' << msg << "\"\n";
+    Out << CsvEscape(run.error_message) << "\n";
     return;
   }
 
@@ -117,27 +127,23 @@ void CSVReporter::PrintRunData(const Run & run) {
   }
   Out << ",";
 
-  if (run.bytes_per_second > 0.0) {
-    Out << run.bytes_per_second;
+  if (run.counters.find("bytes_per_second") != run.counters.end()) {
+    Out << run.counters.at("bytes_per_second");
   }
   Out << ",";
-  if (run.items_per_second > 0.0) {
-    Out << run.items_per_second;
+  if (run.counters.find("items_per_second") != run.counters.end()) {
+    Out << run.counters.at("items_per_second");
   }
   Out << ",";
   if (!run.report_label.empty()) {
-    // Field with embedded double-quote characters must be doubled and the field
-    // delimited with double-quotes.
-    std::string label = run.report_label;
-    ReplaceAll(&label, "\"", "\"\"");
-    Out << "\"" << label << "\"";
+    Out << CsvEscape(run.report_label);
   }
   Out << ",,";  // for error_occurred and error_message
 
   // Print user counters
-  for (const auto &ucn : user_counter_names_) {
+  for (const auto& ucn : user_counter_names_) {
     auto it = run.counters.find(ucn);
-    if(it == run.counters.end()) {
+    if (it == run.counters.end()) {
       Out << ",";
     } else {
       Out << "," << it->second;
diff --git a/src/cycleclock.h b/src/cycleclock.h
index 4251fe4c32..f22ca9f7d2 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -36,12 +36,12 @@
 // declarations of some other intrinsics, breaking compilation.
 // Therefore, we simply declare __rdtsc ourselves. See also
 // http://connect.microsoft.com/VisualStudio/feedback/details/262047
-#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
 extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
 
-#ifndef BENCHMARK_OS_WINDOWS
+#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW)
 #include <sys/time.h>
 #include <time.h>
 #endif
@@ -84,13 +84,21 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   return (high << 32) | low;
 #elif defined(__powerpc__) || defined(__ppc__)
   // This returns a time-base, which is not always precisely a cycle-count.
-  int64_t tbl, tbu0, tbu1;
-  asm("mftbu %0" : "=r"(tbu0));
-  asm("mftb  %0" : "=r"(tbl));
-  asm("mftbu %0" : "=r"(tbu1));
-  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
-  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
-  return (tbu1 << 32) | tbl;
+#if defined(__powerpc64__) || defined(__ppc64__)
+  int64_t tb;
+  asm volatile("mfspr %0, 268" : "=r"(tb));
+  return tb;
+#else
+  uint32_t tbl, tbu0, tbu1;
+  asm volatile(
+      "mftbu %0\n"
+      "mftb %1\n"
+      "mftbu %2"
+      : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
+  tbl &= -static_cast<int32_t>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is no longer needed)
+  return (static_cast<uint64_t>(tbu1) << 32) | tbl;
+#endif
 #elif defined(__sparc__)
   int64_t tick;
   asm(".byte 0x83, 0x41, 0x00, 0x00");
@@ -106,6 +114,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // when I know it will work.  Otherwise, I'll use __rdtsc and hope
   // the code is being compiled with a non-ancient compiler.
   _asm rdtsc
+#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
+  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019
+  // and https://reviews.llvm.org/D53115
+  int64_t virtual_timer_value;
+  virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
+  return virtual_timer_value;
 #elif defined(COMPILER_MSVC)
   return __rdtsc();
 #elif defined(BENCHMARK_OS_NACL)
@@ -121,7 +135,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // because is provides nanosecond resolution (which is noticable at
   // least for PNaCl modules running on x86 Mac & Linux).
   // Initialize to always return 0 if clock_gettime fails.
-  struct timespec ts = { 0, 0 };
+  struct timespec ts = {0, 0};
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
 #elif defined(__aarch64__)
@@ -153,12 +167,51 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-#elif defined(__mips__)
+#elif defined(__mips__) || defined(__m68k__)
   // mips apparently only allows rdtsc for superusers, so we fall
   // back to gettimeofday.  It's possible clock_gettime would be better.
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__loongarch__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__s390__)  // Covers both s390 and s390x.
+  // Return the CPU clock.
+  uint64_t tsc;
+#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
+  // z/OS XL compiler HLASM syntax.
+  asm(" stck %0" : "=m"(tsc) : : "cc");
+#else
+  asm("stck %0" : "=Q"(tsc) : : "cc");
+#endif
+  return tsc;
+#elif defined(__riscv) // RISC-V
+  // Use RDCYCLE (and RDCYCLEH on riscv32)
+#if __riscv_xlen == 32
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // This asm also includes the PowerPC overflow handling strategy, as above.
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdcycleh %0\n"
+      "rdcycle %1\n"
+      "rdcycleh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+#else
+  uint64_t cycles;
+  asm volatile("rdcycle %0" : "=r"(cycles));
+  return cycles;
+#endif
+#elif defined(__e2k__) || defined(__elbrus__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 #else
 // The soft failover to a generic implementation is automatic only for ARM.
 // For other platforms the developer is expected to make an attempt to create
diff --git a/src/internal_macros.h b/src/internal_macros.h
index c34f5716e6..91f367b894 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -3,15 +3,21 @@
 
 #include "benchmark/benchmark.h"
 
+/* Needed to detect STL */
+#include <cstdlib>
+
+// clang-format off
+
 #ifndef __has_feature
 #define __has_feature(x) 0
 #endif
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
 
 #if defined(__clang__)
-  #if !defined(COMPILER_CLANG)
+  #if defined(__ibmxl__)
+    #if !defined(COMPILER_IBMXL)
+      #define COMPILER_IBMXL
+    #endif
+  #elif !defined(COMPILER_CLANG)
     #define COMPILER_CLANG
   #endif
 #elif defined(_MSC_VER)
@@ -38,7 +44,11 @@
   #define BENCHMARK_OS_CYGWIN 1
 #elif defined(_WIN32)
   #define BENCHMARK_OS_WINDOWS 1
+  #if defined(__MINGW32__)
+    #define BENCHMARK_OS_MINGW 1
+  #endif
 #elif defined(__APPLE__)
+  #define BENCHMARK_OS_APPLE 1
   #include "TargetConditionals.h"
   #if defined(TARGET_OS_MAC)
     #define BENCHMARK_OS_MACOSX 1
@@ -50,14 +60,30 @@
   #define BENCHMARK_OS_FREEBSD 1
 #elif defined(__NetBSD__)
   #define BENCHMARK_OS_NETBSD 1
+#elif defined(__OpenBSD__)
+  #define BENCHMARK_OS_OPENBSD 1
+#elif defined(__DragonFly__)
+  #define BENCHMARK_OS_DRAGONFLY 1
 #elif defined(__linux__)
   #define BENCHMARK_OS_LINUX 1
 #elif defined(__native_client__)
   #define BENCHMARK_OS_NACL 1
-#elif defined(EMSCRIPTEN)
+#elif defined(__EMSCRIPTEN__)
   #define BENCHMARK_OS_EMSCRIPTEN 1
 #elif defined(__rtems__)
   #define BENCHMARK_OS_RTEMS 1
+#elif defined(__Fuchsia__)
+#define BENCHMARK_OS_FUCHSIA 1
+#elif defined (__SVR4) && defined (__sun)
+#define BENCHMARK_OS_SOLARIS 1
+#elif defined(__QNX__)
+#define BENCHMARK_OS_QNX 1
+#elif defined(__MVS__)
+#define BENCHMARK_OS_ZOS 1
+#endif
+
+#if defined(__ANDROID__) && defined(__GLIBCXX__)
+#define BENCHMARK_STL_ANDROID_GNUSTL 1
 #endif
 
 #if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \
@@ -71,12 +97,6 @@
   #define BENCHMARK_MAYBE_UNUSED
 #endif
 
-#if defined(COMPILER_GCC) || __has_builtin(__builtin_unreachable)
-  #define BENCHMARK_UNREACHABLE() __builtin_unreachable()
-#elif defined(COMPILER_MSVC)
-  #define BENCHMARK_UNREACHABLE() __assume(false)
-#else
-  #define BENCHMARK_UNREACHABLE() ((void)0)
-#endif
+// clang-format on
 
 #endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index 94e057d861..e65e93d85f 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -12,55 +12,108 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
+#include <iomanip>  // for setprecision
 #include <iostream>
+#include <limits>
 #include <string>
 #include <tuple>
 #include <vector>
-#include <iomanip> // for setprecision
-#include <limits>
 
+#include "benchmark/benchmark.h"
+#include "complexity.h"
 #include "string_util.h"
 #include "timers.h"
 
 namespace benchmark {
+namespace internal {
+extern std::map<std::string, std::string>* global_context;
+}
 
 namespace {
 
+std::string StrEscape(const std::string& s) {
+  std::string tmp;
+  tmp.reserve(s.size());
+  for (char c : s) {
+    switch (c) {
+      case '\b':
+        tmp += "\\b";
+        break;
+      case '\f':
+        tmp += "\\f";
+        break;
+      case '\n':
+        tmp += "\\n";
+        break;
+      case '\r':
+        tmp += "\\r";
+        break;
+      case '\t':
+        tmp += "\\t";
+        break;
+      case '\\':
+        tmp += "\\\\";
+        break;
+      case '"':
+        tmp += "\\\"";
+        break;
+      default:
+        tmp += c;
+        break;
+    }
+  }
+  return tmp;
+}
+
 std::string FormatKV(std::string const& key, std::string const& value) {
-  return StringPrintF("\"%s\": \"%s\"", key.c_str(), value.c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, const char* value) {
-  return StringPrintF("\"%s\": \"%s\"", key.c_str(), value);
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, bool value) {
-  return StringPrintF("\"%s\": %s", key.c_str(), value ? "true" : "false");
+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
+                   value ? "true" : "false");
 }
 
 std::string FormatKV(std::string const& key, int64_t value) {
   std::stringstream ss;
-  ss << '"' << key << "\": " << value;
+  ss << '"' << StrEscape(key) << "\": " << value;
   return ss.str();
 }
 
-std::string FormatKV(std::string const& key, double value) {
+std::string FormatKV(std::string const& key, IterationCount value) {
   std::stringstream ss;
-  ss << '"' << key << "\": ";
+  ss << '"' << StrEscape(key) << "\": " << value;
+  return ss.str();
+}
 
-  const auto max_digits10 = std::numeric_limits<decltype (value)>::max_digits10;
-  const auto max_fractional_digits10 = max_digits10 - 1;
+std::string FormatKV(std::string const& key, double value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": ";
 
-  ss << std::scientific << std::setprecision(max_fractional_digits10) << value;
+  if (std::isnan(value))
+    ss << (value < 0 ? "-" : "") << "NaN";
+  else if (std::isinf(value))
+    ss << (value < 0 ? "-" : "") << "Infinity";
+  else {
+    const auto max_digits10 =
+        std::numeric_limits<decltype(value)>::max_digits10;
+    const auto max_fractional_digits10 = max_digits10 - 1;
+    ss << std::scientific << std::setprecision(max_fractional_digits10)
+       << value;
+  }
   return ss.str();
 }
 
-int64_t RoundDouble(double v) { return static_cast<int64_t>(v + 0.5); }
+int64_t RoundDouble(double v) { return std::lround(v); }
 
 }  // end namespace
 
@@ -77,6 +130,12 @@ bool JSONReporter::ReportContext(const Context& context) {
   std::string walltime_value = LocalDateTimeString();
   out << indent << FormatKV("date", walltime_value) << ",\n";
 
+  out << indent << FormatKV("host_name", context.sys_info.name) << ",\n";
+
+  if (Context::executable_name) {
+    out << indent << FormatKV("executable", Context::executable_name) << ",\n";
+  }
+
   CPUInfo const& info = context.cpu_info;
   out << indent << FormatKV("num_cpus", static_cast<int64_t>(info.num_cpus))
       << ",\n";
@@ -84,8 +143,12 @@ bool JSONReporter::ReportContext(const Context& context) {
       << FormatKV("mhz_per_cpu",
                   RoundDouble(info.cycles_per_second / 1000000.0))
       << ",\n";
-  out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled)
-      << ",\n";
+  if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
+    out << indent
+        << FormatKV("cpu_scaling_enabled",
+                    info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+        << ",\n";
+  }
 
   out << indent << "\"caches\": [\n";
   indent = std::string(6, ' ');
@@ -96,8 +159,8 @@ bool JSONReporter::ReportContext(const Context& context) {
     out << cache_indent << FormatKV("type", CI.type) << ",\n";
     out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
         << ",\n";
-    out << cache_indent
-        << FormatKV("size", static_cast<int64_t>(CI.size) * 1000u) << ",\n";
+    out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
+        << ",\n";
     out << cache_indent
         << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
         << "\n";
@@ -107,13 +170,28 @@ bool JSONReporter::ReportContext(const Context& context) {
   }
   indent = std::string(4, ' ');
   out << indent << "],\n";
+  out << indent << "\"load_avg\": [";
+  for (auto it = info.load_avg.begin(); it != info.load_avg.end();) {
+    out << *it++;
+    if (it != info.load_avg.end()) out << ",";
+  }
+  out << "],\n";
 
 #if defined(NDEBUG)
   const char build_type[] = "release";
 #else
   const char build_type[] = "debug";
 #endif
-  out << indent << FormatKV("library_build_type", build_type) << "\n";
+  out << indent << FormatKV("library_build_type", build_type);
+
+  if (internal::global_context != nullptr) {
+    for (const auto& kv : *internal::global_context) {
+      out << ",\n";
+      out << indent << FormatKV(kv.first, kv.second);
+    }
+  }
+  out << "\n";
+
   // Close context block and open the list of benchmarks.
   out << inner_indent << "},\n";
   out << inner_indent << "\"benchmarks\": [\n";
@@ -150,48 +228,60 @@ void JSONReporter::Finalize() {
 void JSONReporter::PrintRunData(Run const& run) {
   std::string indent(6, ' ');
   std::ostream& out = GetOutputStream();
-  out << indent << FormatKV("name", run.benchmark_name) << ",\n";
+  out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
+  out << indent << FormatKV("family_index", run.family_index) << ",\n";
+  out << indent
+      << FormatKV("per_family_instance_index", run.per_family_instance_index)
+      << ",\n";
+  out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
+  out << indent << FormatKV("run_type", [&run]() -> const char* {
+    switch (run.run_type) {
+      case BenchmarkReporter::Run::RT_Iteration:
+        return "iteration";
+      case BenchmarkReporter::Run::RT_Aggregate:
+        return "aggregate";
+    }
+    BENCHMARK_UNREACHABLE();
+  }()) << ",\n";
+  out << indent << FormatKV("repetitions", run.repetitions) << ",\n";
+  if (run.run_type != BenchmarkReporter::Run::RT_Aggregate) {
+    out << indent << FormatKV("repetition_index", run.repetition_index)
+        << ",\n";
+  }
+  out << indent << FormatKV("threads", run.threads) << ",\n";
+  if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
+    out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
+  }
   if (run.error_occurred) {
     out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
     out << indent << FormatKV("error_message", run.error_message) << ",\n";
   }
   if (!run.report_big_o && !run.report_rms) {
     out << indent << FormatKV("iterations", run.iterations) << ",\n";
-    out << indent
-        << FormatKV("real_time", run.GetAdjustedRealTime())
-        << ",\n";
-    out << indent
-        << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
+    out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
     out << ",\n"
         << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
   } else if (run.report_big_o) {
-    out << indent
-        << FormatKV("cpu_coefficient", run.GetAdjustedCPUTime())
+    out << indent << FormatKV("cpu_coefficient", run.GetAdjustedCPUTime())
         << ",\n";
-    out << indent
-        << FormatKV("real_coefficient", run.GetAdjustedRealTime())
+    out << indent << FormatKV("real_coefficient", run.GetAdjustedRealTime())
         << ",\n";
     out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n";
     out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
   } else if (run.report_rms) {
-    out << indent
-        << FormatKV("rms", run.GetAdjustedCPUTime());
+    out << indent << FormatKV("rms", run.GetAdjustedCPUTime());
   }
-  if (run.bytes_per_second > 0.0) {
-    out << ",\n"
-        << indent
-        << FormatKV("bytes_per_second", run.bytes_per_second);
-  }
-  if (run.items_per_second > 0.0) {
-    out << ",\n"
-        << indent
-        << FormatKV("items_per_second", run.items_per_second);
+
+  for (auto& c : run.counters) {
+    out << ",\n" << indent << FormatKV(c.first, c.second);
   }
-  for(auto &c : run.counters) {
-    out << ",\n"
-        << indent
-        << FormatKV(c.first, c.second);
+
+  if (run.has_memory_result) {
+    out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
+    out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
   }
+
   if (!run.report_label.empty()) {
     out << ",\n" << indent << FormatKV("label", run.report_label);
   }
@@ -204,4 +294,4 @@ void JSONReporter::PrintRunData(Run const& run) {
   out << '\n';
 }
 
-} // end namespace benchmark
+}  // end namespace benchmark
diff --git a/src/log.h b/src/log.h
index d06e1031db..48c071aded 100644
--- a/src/log.h
+++ b/src/log.h
@@ -66,8 +66,9 @@ inline LogType& GetLogInstanceForLevel(int level) {
 }  // end namespace internal
 }  // end namespace benchmark
 
-#define VLOG(x)                                                               \
+// clang-format off
+#define BM_VLOG(x)                                                               \
   (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
                                                                          " ")
-
+// clang-format on
 #endif
diff --git a/src/mutex.h b/src/mutex.h
index 5f461d05a0..bec78d9e5f 100644
--- a/src/mutex.h
+++ b/src/mutex.h
@@ -9,60 +9,60 @@
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
 #else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
+#define THREAD_ANNOTATION_ATTRIBUTE_(x)  // no-op
 #endif
 
-#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
 
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
 
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
 
-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
 
 #define ACQUIRED_BEFORE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
 
 #define ACQUIRED_AFTER(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
 
 #define REQUIRES(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
 
 #define REQUIRES_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
 
 #define ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
 
 #define ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
 
 #define RELEASE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
 
 #define RELEASE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
 
 #define TRY_ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
 
 #define TRY_ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
 
-#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
 
-#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
 
 #define ASSERT_SHARED_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+  THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
 
-#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
 
 #define NO_THREAD_SAFETY_ANALYSIS \
-  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+  THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
 
 namespace benchmark {
 
@@ -71,7 +71,7 @@ typedef std::condition_variable Condition;
 // NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
 // we can annotate them with thread safety attributes and use the
 // -Wthread-safety warning with clang. The standard library types cannot be
-// used directly because they do not provided the required annotations.
+// used directly because they do not provide the required annotations.
 class CAPABILITY("mutex") Mutex {
  public:
   Mutex() {}
@@ -130,7 +130,7 @@ class Barrier {
   // entered the barrier.  Returns iff this is the last thread to
   // enter the barrier.
   bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, running_threads_);
+    BM_CHECK_LT(entered_, running_threads_);
     entered_++;
     if (entered_ < running_threads_) {
       // Wait for all threads to enter
diff --git a/src/perf_counters.cc b/src/perf_counters.cc
new file mode 100644
index 0000000000..4ddf0de250
--- /dev/null
+++ b/src/perf_counters.cc
@@ -0,0 +1,132 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "perf_counters.h"
+
+#include <cstring>
+#include <vector>
+
+#if defined HAVE_LIBPFM
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif
+
+namespace benchmark {
+namespace internal {
+
+constexpr size_t PerfCounterValues::kMaxCounters;
+
+#if defined HAVE_LIBPFM
+const bool PerfCounters::kSupported = true;
+
+bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; }
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  if (counter_names.empty()) {
+    return NoCounters();
+  }
+  if (counter_names.size() > PerfCounterValues::kMaxCounters) {
+    GetErrorLogInstance()
+        << counter_names.size()
+        << " counters were requested. The minimum is 1, the maximum is "
+        << PerfCounterValues::kMaxCounters << "\n";
+    return NoCounters();
+  }
+  std::vector<int> counter_ids(counter_names.size());
+
+  const int mode = PFM_PLM3;  // user mode only
+  for (size_t i = 0; i < counter_names.size(); ++i) {
+    const bool is_first = i == 0;
+    struct perf_event_attr attr{};
+    attr.size = sizeof(attr);
+    const int group_id = !is_first ? counter_ids[0] : -1;
+    const auto& name = counter_names[i];
+    if (name.empty()) {
+      GetErrorLogInstance() << "A counter name was the empty string\n";
+      return NoCounters();
+    }
+    pfm_perf_encode_arg_t arg{};
+    arg.attr = &attr;
+
+    const int pfm_get =
+        pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
+    if (pfm_get != PFM_SUCCESS) {
+      GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
+      return NoCounters();
+    }
+    attr.disabled = is_first;
+    // Note: the man page for perf_event_create suggests inerit = true and
+    // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
+    // case.
+    attr.inherit = true;
+    attr.pinned = is_first;
+    attr.exclude_kernel = true;
+    attr.exclude_user = false;
+    attr.exclude_hv = true;
+    // Read all counters in one read.
+    attr.read_format = PERF_FORMAT_GROUP;
+
+    int id = -1;
+    static constexpr size_t kNrOfSyscallRetries = 5;
+    // Retry syscall as it was interrupted often (b/64774091).
+    for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+         ++num_retries) {
+      id = perf_event_open(&attr, 0, -1, group_id, 0);
+      if (id >= 0 || errno != EINTR) {
+        break;
+      }
+    }
+    if (id < 0) {
+      GetErrorLogInstance()
+          << "Failed to get a file descriptor for " << name << "\n";
+      return NoCounters();
+    }
+
+    counter_ids[i] = id;
+  }
+  if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) {
+    GetErrorLogInstance() << "Failed to start counters\n";
+    return NoCounters();
+  }
+
+  return PerfCounters(counter_names, std::move(counter_ids));
+}
+
+PerfCounters::~PerfCounters() {
+  if (counter_ids_.empty()) {
+    return;
+  }
+  ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE);
+  for (int fd : counter_ids_) {
+    close(fd);
+  }
+}
+#else   // defined HAVE_LIBPFM
+const bool PerfCounters::kSupported = false;
+
+bool PerfCounters::Initialize() { return false; }
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  if (!counter_names.empty()) {
+    GetErrorLogInstance() << "Performance counters not supported.";
+  }
+  return NoCounters();
+}
+
+PerfCounters::~PerfCounters() = default;
+#endif  // defined HAVE_LIBPFM
+}  // namespace internal
+}  // namespace benchmark
diff --git a/src/perf_counters.h b/src/perf_counters.h
new file mode 100644
index 0000000000..47ca1385e2
--- /dev/null
+++ b/src/perf_counters.h
@@ -0,0 +1,172 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_PERF_COUNTERS_H
+#define BENCHMARK_PERF_COUNTERS_H
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "check.h"
+#include "log.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <unistd.h>
+#endif
+
+namespace benchmark {
+namespace internal {
+
+// Typically, we can only read a small number of counters. There is also a
+// padding preceding counter values, when reading multiple counters with one
+// syscall (which is desirable). PerfCounterValues abstracts these details.
+// The implementation ensures the storage is inlined, and allows 0-based
+// indexing into the counter values.
+// The object is used in conjunction with a PerfCounters object, by passing it
+// to Snapshot(). The values are populated such that
+// perfCounters->names()[i]'s value is obtained at position i (as given by
+// operator[]) of this object.
+class PerfCounterValues {
+ public:
+  explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
+    BM_CHECK_LE(nr_counters_, kMaxCounters);
+  }
+
+  uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; }
+
+  static constexpr size_t kMaxCounters = 3;
+
+ private:
+  friend class PerfCounters;
+  // Get the byte buffer in which perf counters can be captured.
+  // This is used by PerfCounters::Read
+  std::pair<char*, size_t> get_data_buffer() {
+    return {reinterpret_cast<char*>(values_.data()),
+            sizeof(uint64_t) * (kPadding + nr_counters_)};
+  }
+
+  static constexpr size_t kPadding = 1;
+  std::array<uint64_t, kPadding + kMaxCounters> values_;
+  const size_t nr_counters_;
+};
+
+// Collect PMU counters. The object, once constructed, is ready to be used by
+// calling read(). PMU counter collection is enabled from the time create() is
+// called, to obtain the object, until the object's destructor is called.
+class PerfCounters final {
+ public:
+  // True iff this platform supports performance counters.
+  static const bool kSupported;
+
+  bool IsValid() const { return is_valid_; }
+  static PerfCounters NoCounters() { return PerfCounters(); }
+
+  ~PerfCounters();
+  PerfCounters(PerfCounters&&) = default;
+  PerfCounters(const PerfCounters&) = delete;
+
+  // Platform-specific implementations may choose to do some library
+  // initialization here.
+  static bool Initialize();
+
+  // Return a PerfCounters object ready to read the counters with the names
+  // specified. The values are user-mode only. The counter name format is
+  // implementation and OS specific.
+  // TODO: once we move to C++-17, this should be a std::optional, and then the
+  // IsValid() boolean can be dropped.
+  static PerfCounters Create(const std::vector<std::string>& counter_names);
+
+  // Take a snapshot of the current value of the counters into the provided
+  // valid PerfCounterValues storage. The values are populated such that:
+  // names()[i]'s value is (*values)[i]
+  BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
+#ifndef BENCHMARK_OS_WINDOWS
+    assert(values != nullptr);
+    assert(IsValid());
+    auto buffer = values->get_data_buffer();
+    auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second);
+    return static_cast<size_t>(read_bytes) == buffer.second;
+#else
+    (void)values;
+    return false;
+#endif
+  }
+
+  const std::vector<std::string>& names() const { return counter_names_; }
+  size_t num_counters() const { return counter_names_.size(); }
+
+ private:
+  PerfCounters(const std::vector<std::string>& counter_names,
+               std::vector<int>&& counter_ids)
+      : counter_ids_(std::move(counter_ids)),
+        counter_names_(counter_names),
+        is_valid_(true) {}
+  PerfCounters() : is_valid_(false) {}
+
+  std::vector<int> counter_ids_;
+  const std::vector<std::string> counter_names_;
+  const bool is_valid_;
+};
+
+// Typical usage of the above primitives.
+class PerfCountersMeasurement final {
+ public:
+  PerfCountersMeasurement(PerfCounters&& c)
+      : counters_(std::move(c)),
+        start_values_(counters_.IsValid() ? counters_.names().size() : 0),
+        end_values_(counters_.IsValid() ? counters_.names().size() : 0) {}
+
+  bool IsValid() const { return counters_.IsValid(); }
+
+  BENCHMARK_ALWAYS_INLINE void Start() {
+    assert(IsValid());
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    counters_.Snapshot(&start_values_);
+    ClobberMemory();
+  }
+
+  BENCHMARK_ALWAYS_INLINE std::vector<std::pair<std::string, double>>
+  StopAndGetMeasurements() {
+    assert(IsValid());
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    counters_.Snapshot(&end_values_);
+    ClobberMemory();
+
+    std::vector<std::pair<std::string, double>> ret;
+    for (size_t i = 0; i < counters_.names().size(); ++i) {
+      double measurement = static_cast<double>(end_values_[i]) -
+                           static_cast<double>(start_values_[i]);
+      ret.push_back({counters_.names()[i], measurement});
+    }
+    return ret;
+  }
+
+ private:
+  PerfCounters counters_;
+  PerfCounterValues start_values_;
+  PerfCounterValues end_values_;
+};
+
+BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_PERF_COUNTERS_H
diff --git a/src/re.h b/src/re.h
index 01e9736505..630046782d 100644
--- a/src/re.h
+++ b/src/re.h
@@ -17,22 +17,39 @@
 
 #include "internal_macros.h"
 
+// clang-format off
+
+#if !defined(HAVE_STD_REGEX) && \
+    !defined(HAVE_GNU_POSIX_REGEX) && \
+    !defined(HAVE_POSIX_REGEX)
+  // No explicit regex selection; detect based on builtin hints.
+  #if defined(BENCHMARK_OS_LINUX) || defined(BENCHMARK_OS_APPLE)
+    #define HAVE_POSIX_REGEX 1
+  #elif __cplusplus >= 199711L
+    #define HAVE_STD_REGEX 1
+  #endif
+#endif
+
 // Prefer C regex libraries when compiling w/o exceptions so that we can
 // correctly report errors.
-#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && defined(HAVE_STD_REGEX) && \
+#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \
+    defined(BENCHMARK_HAVE_STD_REGEX) && \
     (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
-#undef HAVE_STD_REGEX
+  #undef HAVE_STD_REGEX
 #endif
 
 #if defined(HAVE_STD_REGEX)
-#include <regex>
+  #include <regex>
 #elif defined(HAVE_GNU_POSIX_REGEX)
-#include <gnuregex.h>
+  #include <gnuregex.h>
 #elif defined(HAVE_POSIX_REGEX)
-#include <regex.h>
+  #include <regex.h>
 #else
 #error No regular expression backend was found!
 #endif
+
+// clang-format on
+
 #include <string>
 
 #include "check.h"
@@ -72,20 +89,21 @@ class Regex {
 
 inline bool Regex::Init(const std::string& spec, std::string* error) {
 #ifdef BENCHMARK_HAS_NO_EXCEPTIONS
-  ((void)error); // suppress unused warning
+  ((void)error);  // suppress unused warning
 #else
   try {
 #endif
-    re_ = std::regex(spec, std::regex_constants::extended);
-    init_ = true;
+  re_ = std::regex(spec, std::regex_constants::extended);
+  init_ = true;
 #ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  } catch (const std::regex_error& e) {
-    if (error) {
-      *error = e.what();
-    }
+}
+catch (const std::regex_error& e) {
+  if (error) {
+    *error = e.what();
   }
+}
 #endif
-  return init_;
+return init_;
 }
 
 inline Regex::~Regex() {}
@@ -108,7 +126,7 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
 
       // regerror returns the number of bytes necessary to null terminate
       // the string, so we move that when assigning to error.
-      CHECK_NE(needed, 0);
+      BM_CHECK_NE(needed, 0);
       error->assign(errbuf, needed - 1);
 
       delete[] errbuf;
diff --git a/src/reporter.cc b/src/reporter.cc
index 5d2fa05a2b..c720a9df1d 100644
--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -18,12 +18,18 @@
 #include <cstdlib>
 
 #include <iostream>
+#include <map>
+#include <string>
 #include <tuple>
 #include <vector>
 
 #include "check.h"
+#include "string_util.h"
 
 namespace benchmark {
+namespace internal {
+extern std::map<std::string, std::string>* global_context;
+}
 
 BenchmarkReporter::BenchmarkReporter()
     : output_stream_(&std::cout), error_stream_(&std::cerr) {}
@@ -32,11 +38,14 @@ BenchmarkReporter::~BenchmarkReporter() {}
 
 void BenchmarkReporter::PrintBasicContext(std::ostream *out,
                                           Context const &context) {
-  CHECK(out) << "cannot be null";
+  BM_CHECK(out) << "cannot be null";
   auto &Out = *out;
 
   Out << LocalDateTimeString() << "\n";
 
+  if (context.executable_name)
+    Out << "Running " << context.executable_name << "\n";
+
   const CPUInfo &info = context.cpu_info;
   Out << "Run on (" << info.num_cpus << " X "
       << (info.cycles_per_second / 1000000.0) << " MHz CPU "
@@ -45,14 +54,28 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
     Out << "CPU Caches:\n";
     for (auto &CInfo : info.caches) {
       Out << "  L" << CInfo.level << " " << CInfo.type << " "
-          << (CInfo.size / 1000) << "K";
+          << (CInfo.size / 1024) << " KiB";
       if (CInfo.num_sharing != 0)
         Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
       Out << "\n";
     }
   }
+  if (!info.load_avg.empty()) {
+    Out << "Load Average: ";
+    for (auto It = info.load_avg.begin(); It != info.load_avg.end();) {
+      Out << StrFormat("%.2f", *It++);
+      if (It != info.load_avg.end()) Out << ", ";
+    }
+    Out << "\n";
+  }
 
-  if (info.scaling_enabled) {
+  if (internal::global_context != nullptr) {
+    for (const auto& kv: *internal::global_context) {
+      Out << kv.first << ": " << kv.second << "\n";
+    }
+  }
+
+  if (CPUInfo::Scaling::ENABLED == info.scaling) {
     Out << "***WARNING*** CPU scaling is enabled, the benchmark "
            "real time measurements may be noisy and will incur extra "
            "overhead.\n";
@@ -64,7 +87,19 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
 #endif
 }
 
-BenchmarkReporter::Context::Context() : cpu_info(CPUInfo::Get()) {}
+// No initializer because it's already initialized to NULL.
+const char *BenchmarkReporter::Context::executable_name;
+
+BenchmarkReporter::Context::Context()
+    : cpu_info(CPUInfo::Get()), sys_info(SystemInfo::Get()) {}
+
+std::string BenchmarkReporter::Run::benchmark_name() const {
+  std::string name = run_name.str();
+  if (run_type == RT_Aggregate) {
+    name += "_" + aggregate_name;
+  }
+  return name;
+}
 
 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
   double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
diff --git a/src/statistics.cc b/src/statistics.cc
index d2843679d0..5f82c27372 100644
--- a/src/statistics.cc
+++ b/src/statistics.cc
@@ -17,9 +17,9 @@
 
 #include <algorithm>
 #include <cmath>
+#include <numeric>
 #include <string>
 #include <vector>
-#include <numeric>
 #include "check.h"
 #include "statistics.h"
 
@@ -36,16 +36,19 @@ double StatisticsMean(const std::vector<double>& v) {
 
 double StatisticsMedian(const std::vector<double>& v) {
   if (v.size() < 3) return StatisticsMean(v);
-  std::vector<double> partial;
-  // we need roundDown(count/2)+1 slots
-  partial.resize(1 + (v.size() / 2));
-  std::partial_sort_copy(v.begin(), v.end(), partial.begin(), partial.end());
-  // did we have odd number of samples?
-  // if yes, then the last element of partially-sorted vector is the median
-  // it no, then the average of the last two elements is the median
-  if(v.size() % 2 == 1)
-    return partial.back();
-  return (partial[partial.size() - 2] + partial[partial.size() - 1]) / 2.0;
+  std::vector<double> copy(v);
+
+  auto center = copy.begin() + v.size() / 2;
+  std::nth_element(copy.begin(), center, copy.end());
+
+  // did we have an odd number of samples?
+  // if yes, then center is the median
+  // it no, then we are looking for the average between center and the value
+  // before
+  if (v.size() % 2 == 1) return *center;
+  auto center2 = copy.begin() + v.size() / 2 - 1;
+  std::nth_element(copy.begin(), center2, copy.end());
+  return (*center + *center2) / 2.0;
 }
 
 // Return the sum of the squares of this sample set
@@ -65,8 +68,7 @@ double StatisticsStdDev(const std::vector<double>& v) {
   if (v.empty()) return mean;
 
   // Sample standard deviation is undefined for n = 1
-  if (v.size() == 1)
-    return 0.0;
+  if (v.size() == 1) return 0.0;
 
   const double avg_squares = SumSquares(v) * (1.0 / v.size());
   return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
@@ -89,49 +91,43 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   // Accumulators.
   std::vector<double> real_accumulated_time_stat;
   std::vector<double> cpu_accumulated_time_stat;
-  std::vector<double> bytes_per_second_stat;
-  std::vector<double> items_per_second_stat;
 
   real_accumulated_time_stat.reserve(reports.size());
   cpu_accumulated_time_stat.reserve(reports.size());
-  bytes_per_second_stat.reserve(reports.size());
-  items_per_second_stat.reserve(reports.size());
 
   // All repetitions should be run with the same number of iterations so we
   // can take this information from the first benchmark.
-  int64_t const run_iterations = reports.front().iterations;
+  const IterationCount run_iterations = reports.front().iterations;
   // create stats for user counters
   struct CounterStat {
     Counter c;
     std::vector<double> s;
   };
-  std::map< std::string, CounterStat > counter_stats;
-  for(Run const& r : reports) {
-    for(auto const& cnt : r.counters) {
+  std::map<std::string, CounterStat> counter_stats;
+  for (Run const& r : reports) {
+    for (auto const& cnt : r.counters) {
       auto it = counter_stats.find(cnt.first);
-      if(it == counter_stats.end()) {
+      if (it == counter_stats.end()) {
         counter_stats.insert({cnt.first, {cnt.second, std::vector<double>{}}});
         it = counter_stats.find(cnt.first);
         it->second.s.reserve(reports.size());
       } else {
-        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+        BM_CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
       }
     }
   }
 
   // Populate the accumulators.
   for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
-    CHECK_EQ(run_iterations, run.iterations);
+    BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
+    BM_CHECK_EQ(run_iterations, run.iterations);
     if (run.error_occurred) continue;
     real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
     cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
-    items_per_second_stat.emplace_back(run.items_per_second);
-    bytes_per_second_stat.emplace_back(run.bytes_per_second);
     // user counters
-    for(auto const& cnt : run.counters) {
+    for (auto const& cnt : run.counters) {
       auto it = counter_stats.find(cnt.first);
-      CHECK_NE(it, counter_stats.end());
+      BM_CHECK_NE(it, counter_stats.end());
       it->second.s.emplace_back(cnt.second);
     }
   }
@@ -145,24 +141,48 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
     }
   }
 
-  for(const auto& Stat : *reports[0].statistics) {
+  const double iteration_rescale_factor =
+      double(reports.size()) / double(run_iterations);
+
+  for (const auto& Stat : *reports[0].statistics) {
     // Get the data from the accumulator to BenchmarkReporter::Run's.
     Run data;
-    data.benchmark_name = reports[0].benchmark_name + "_" + Stat.name_;
+    data.run_name = reports[0].run_name;
+    data.family_index = reports[0].family_index;
+    data.per_family_instance_index = reports[0].per_family_instance_index;
+    data.run_type = BenchmarkReporter::Run::RT_Aggregate;
+    data.threads = reports[0].threads;
+    data.repetitions = reports[0].repetitions;
+    data.repetition_index = Run::no_repetition_index;
+    data.aggregate_name = Stat.name_;
     data.report_label = report_label;
-    data.iterations = run_iterations;
+
+    // It is incorrect to say that an aggregate is computed over
+    // run's iterations, because those iterations already got averaged.
+    // Similarly, if there are N repetitions with 1 iterations each,
+    // an aggregate will be computed over N measurements, not 1.
+    // Thus it is best to simply use the count of separate reports.
+    data.iterations = reports.size();
 
     data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
     data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
-    data.bytes_per_second = Stat.compute_(bytes_per_second_stat);
-    data.items_per_second = Stat.compute_(items_per_second_stat);
+
+    // We will divide these times by data.iterations when reporting, but the
+    // data.iterations is not nessesairly the scale of these measurements,
+    // because in each repetition, these timers are sum over all the iterations.
+    // And if we want to say that the stats are over N repetitions and not
+    // M iterations, we need to multiply these by (N/M).
+    data.real_accumulated_time *= iteration_rescale_factor;
+    data.cpu_accumulated_time *= iteration_rescale_factor;
 
     data.time_unit = reports[0].time_unit;
 
     // user counters
-    for(auto const& kv : counter_stats) {
+    for (auto const& kv : counter_stats) {
+      // Do *NOT* rescale the custom counters. They are already properly scaled.
       const auto uc_stat = Stat.compute_(kv.second.s);
-      auto c = Counter(uc_stat, counter_stats[kv.first].c.flags);
+      auto c = Counter(uc_stat, counter_stats[kv.first].c.flags,
+                       counter_stats[kv.first].c.oneK);
       data.counters[kv.first] = c;
     }
 
diff --git a/src/string_util.cc b/src/string_util.cc
index 29edb2a468..3551418174 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -1,6 +1,9 @@
 #include "string_util.h"
 
 #include <array>
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+#include <cerrno>
+#endif
 #include <cmath>
 #include <cstdarg>
 #include <cstdio>
@@ -122,7 +125,7 @@ std::string HumanReadableNumber(double n, double one_k) {
   return ToBinaryStringFullySpecified(n, 1.1, 1, one_k);
 }
 
-std::string StringPrintFImp(const char* msg, va_list args) {
+std::string StrFormatImp(const char* msg, va_list args) {
   // we might need a second shot at this, so pre-emptivly make a copy
   va_list args_cp;
   va_copy(args_cp, args);
@@ -152,21 +155,114 @@ std::string StringPrintFImp(const char* msg, va_list args) {
   return std::string(buff_ptr.get());
 }
 
-std::string StringPrintF(const char* format, ...) {
+std::string StrFormat(const char* format, ...) {
   va_list args;
   va_start(args, format);
-  std::string tmp = StringPrintFImp(format, args);
+  std::string tmp = StrFormatImp(format, args);
   va_end(args);
   return tmp;
 }
 
-void ReplaceAll(std::string* str, const std::string& from,
-                const std::string& to) {
-  std::size_t start = 0;
-  while ((start = str->find(from, start)) != std::string::npos) {
-    str->replace(start, from.length(), to);
-    start += to.length();
+std::vector<std::string> StrSplit(const std::string& str, char delim) {
+  if (str.empty()) return {};
+  std::vector<std::string> ret;
+  size_t first = 0;
+  size_t next = str.find(delim);
+  for (; next != std::string::npos;
+       first = next + 1, next = str.find(delim, first)) {
+    ret.push_back(str.substr(first, next - first));
   }
+  ret.push_back(str.substr(first));
+  return ret;
 }
 
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+/*
+ * GNU STL in Android NDK lacks support for some C++11 functions, including
+ * stoul, stoi, stod. We reimplement them here using C functions strtoul,
+ * strtol, strtod. Note that reimplemented functions are in benchmark::
+ * namespace, not std:: namespace.
+ */
+unsigned long stoul(const std::string& str, size_t* pos, int base) {
+  /* Record previous errno */
+  const int oldErrno = errno;
+  errno = 0;
+
+  const char* strStart = str.c_str();
+  char* strEnd = const_cast<char*>(strStart);
+  const unsigned long result = strtoul(strStart, &strEnd, base);
+
+  const int strtoulErrno = errno;
+  /* Restore previous errno */
+  errno = oldErrno;
+
+  /* Check for errors and return */
+  if (strtoulErrno == ERANGE) {
+    throw std::out_of_range(
+      "stoul failed: " + str + " is outside of range of unsigned long");
+  } else if (strEnd == strStart || strtoulErrno != 0) {
+    throw std::invalid_argument(
+      "stoul failed: " + str + " is not an integer");
+  }
+  if (pos != nullptr) {
+    *pos = static_cast<size_t>(strEnd - strStart);
+  }
+  return result;
+}
+
+int stoi(const std::string& str, size_t* pos, int base) {
+  /* Record previous errno */
+  const int oldErrno = errno;
+  errno = 0;
+
+  const char* strStart = str.c_str();
+  char* strEnd = const_cast<char*>(strStart);
+  const long result = strtol(strStart, &strEnd, base);
+
+  const int strtolErrno = errno;
+  /* Restore previous errno */
+  errno = oldErrno;
+
+  /* Check for errors and return */
+  if (strtolErrno == ERANGE || long(int(result)) != result) {
+    throw std::out_of_range(
+      "stoul failed: " + str + " is outside of range of int");
+  } else if (strEnd == strStart || strtolErrno != 0) {
+    throw std::invalid_argument(
+      "stoul failed: " + str + " is not an integer");
+  }
+  if (pos != nullptr) {
+    *pos = static_cast<size_t>(strEnd - strStart);
+  }
+  return int(result);
+}
+
+double stod(const std::string& str, size_t* pos) {
+  /* Record previous errno */
+  const int oldErrno = errno;
+  errno = 0;
+
+  const char* strStart = str.c_str();
+  char* strEnd = const_cast<char*>(strStart);
+  const double result = strtod(strStart, &strEnd);
+
+  /* Restore previous errno */
+  const int strtodErrno = errno;
+  errno = oldErrno;
+
+  /* Check for errors and return */
+  if (strtodErrno == ERANGE) {
+    throw std::out_of_range(
+      "stoul failed: " + str + " is outside of range of int");
+  } else if (strEnd == strStart || strtodErrno != 0) {
+    throw std::invalid_argument(
+      "stoul failed: " + str + " is not an integer");
+  }
+  if (pos != nullptr) {
+    *pos = static_cast<size_t>(strEnd - strStart);
+  }
+  return result;
+}
+#endif
+
 }  // end namespace benchmark
diff --git a/src/string_util.h b/src/string_util.h
index c3d53bfd33..6bc28b6912 100644
--- a/src/string_util.h
+++ b/src/string_util.h
@@ -12,28 +12,49 @@ void AppendHumanReadable(int n, std::string* str);
 
 std::string HumanReadableNumber(double n, double one_k = 1024.0);
 
-std::string StringPrintF(const char* format, ...);
-
-inline std::ostream& StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT {
+#if defined(__MINGW32__)
+__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
+#elif defined(__GNUC__)
+__attribute__((format(printf, 1, 2)))
+#endif
+std::string
+StrFormat(const char* format, ...);
+
+inline std::ostream& StrCatImp(std::ostream& out) BENCHMARK_NOEXCEPT {
   return out;
 }
 
 template <class First, class... Rest>
-inline std::ostream& StringCatImp(std::ostream& out, First&& f,
-                                  Rest&&... rest) {
+inline std::ostream& StrCatImp(std::ostream& out, First&& f, Rest&&... rest) {
   out << std::forward<First>(f);
-  return StringCatImp(out, std::forward<Rest>(rest)...);
+  return StrCatImp(out, std::forward<Rest>(rest)...);
 }
 
 template <class... Args>
 inline std::string StrCat(Args&&... args) {
   std::ostringstream ss;
-  StringCatImp(ss, std::forward<Args>(args)...);
+  StrCatImp(ss, std::forward<Args>(args)...);
   return ss.str();
 }
 
-void ReplaceAll(std::string* str, const std::string& from,
-                const std::string& to);
+std::vector<std::string> StrSplit(const std::string& str, char delim);
+
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+/*
+ * GNU STL in Android NDK lacks support for some C++11 functions, including
+ * stoul, stoi, stod. We reimplement them here using C functions strtoul,
+ * strtol, strtod. Note that reimplemented functions are in benchmark::
+ * namespace, not std:: namespace.
+ */
+unsigned long stoul(const std::string& str, size_t* pos = nullptr,
+                           int base = 10);
+int stoi(const std::string& str, size_t* pos = nullptr, int base = 10);
+double stod(const std::string& str, size_t* pos = nullptr);
+#else
+using std::stoul;
+using std::stoi;
+using std::stod;
+#endif
 
 }  // end namespace benchmark
 
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index 7b2285bcc7..1d0daaa2af 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -122,7 +122,7 @@ struct ValueUnion {
   template <class T, int N>
   std::array<T, N> GetAsArray() {
     const int ArrSize = sizeof(T) * N;
-    CHECK_LE(ArrSize, Size);
+    BM_CHECK_LE(ArrSize, Size);
     std::array<T, N> Arr;
     std::memcpy(Arr.data(), data(), ArrSize);
     return Arr;
diff --git a/src/thread_manager.h b/src/thread_manager.h
new file mode 100644
index 0000000000..28e2dd53af
--- /dev/null
+++ b/src/thread_manager.h
@@ -0,0 +1,64 @@
+#ifndef BENCHMARK_THREAD_MANAGER_H
+#define BENCHMARK_THREAD_MANAGER_H
+
+#include <atomic>
+
+#include "benchmark/benchmark.h"
+#include "mutex.h"
+
+namespace benchmark {
+namespace internal {
+
+class ThreadManager {
+ public:
+  explicit ThreadManager(int num_threads)
+      : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}
+
+  Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) {
+    return benchmark_mutex_;
+  }
+
+  bool StartStopBarrier() EXCLUDES(end_cond_mutex_) {
+    return start_stop_barrier_.wait();
+  }
+
+  void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) {
+    start_stop_barrier_.removeThread();
+    if (--alive_threads_ == 0) {
+      MutexLock lock(end_cond_mutex_);
+      end_condition_.notify_all();
+    }
+  }
+
+  void WaitForAllThreads() EXCLUDES(end_cond_mutex_) {
+    MutexLock lock(end_cond_mutex_);
+    end_condition_.wait(lock.native_handle(),
+                        [this]() { return alive_threads_ == 0; });
+  }
+
+ public:
+  struct Result {
+    IterationCount iterations = 0;
+    double real_time_used = 0;
+    double cpu_time_used = 0;
+    double manual_time_used = 0;
+    int64_t complexity_n = 0;
+    std::string report_label_;
+    std::string error_message_;
+    bool has_error_ = false;
+    UserCounters counters;
+  };
+  GUARDED_BY(GetBenchmarkMutex()) Result results;
+
+ private:
+  mutable Mutex benchmark_mutex_;
+  std::atomic<int> alive_threads_;
+  Barrier start_stop_barrier_;
+  Mutex end_cond_mutex_;
+  Condition end_condition_;
+};
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_THREAD_MANAGER_H
diff --git a/src/thread_timer.h b/src/thread_timer.h
new file mode 100644
index 0000000000..eb23f59561
--- /dev/null
+++ b/src/thread_timer.h
@@ -0,0 +1,86 @@
+#ifndef BENCHMARK_THREAD_TIMER_H
+#define BENCHMARK_THREAD_TIMER_H
+
+#include "check.h"
+#include "timers.h"
+
+namespace benchmark {
+namespace internal {
+
+class ThreadTimer {
+  explicit ThreadTimer(bool measure_process_cpu_time_)
+      : measure_process_cpu_time(measure_process_cpu_time_) {}
+
+ public:
+  static ThreadTimer Create() {
+    return ThreadTimer(/*measure_process_cpu_time_=*/false);
+  }
+  static ThreadTimer CreateProcessCpuTime() {
+    return ThreadTimer(/*measure_process_cpu_time_=*/true);
+  }
+
+  // Called by each thread
+  void StartTimer() {
+    running_ = true;
+    start_real_time_ = ChronoClockNow();
+    start_cpu_time_ = ReadCpuTimerOfChoice();
+  }
+
+  // Called by each thread
+  void StopTimer() {
+    BM_CHECK(running_);
+    running_ = false;
+    real_time_used_ += ChronoClockNow() - start_real_time_;
+    // Floating point error can result in the subtraction producing a negative
+    // time. Guard against that.
+    cpu_time_used_ +=
+        std::max<double>(ReadCpuTimerOfChoice() - start_cpu_time_, 0);
+  }
+
+  // Called by each thread
+  void SetIterationTime(double seconds) { manual_time_used_ += seconds; }
+
+  bool running() const { return running_; }
+
+  // REQUIRES: timer is not running
+  double real_time_used() const {
+    BM_CHECK(!running_);
+    return real_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double cpu_time_used() const {
+    BM_CHECK(!running_);
+    return cpu_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double manual_time_used() const {
+    BM_CHECK(!running_);
+    return manual_time_used_;
+  }
+
+ private:
+  double ReadCpuTimerOfChoice() const {
+    if (measure_process_cpu_time) return ProcessCPUUsage();
+    return ThreadCPUUsage();
+  }
+
+  // should the thread, or the process, time be measured?
+  const bool measure_process_cpu_time;
+
+  bool running_ = false;        // Is the timer running
+  double start_real_time_ = 0;  // If running_
+  double start_cpu_time_ = 0;   // If running_
+
+  // Accumulated time so far (does not contain current slice if running_)
+  double real_time_used_ = 0;
+  double cpu_time_used_ = 0;
+  // Manually set iteration time. User sets this with SetIterationTime(seconds).
+  double manual_time_used_ = 0;
+};
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_THREAD_TIMER_H
diff --git a/test/AssemblyTests.cmake b/test/AssemblyTests.cmake
new file mode 100644
index 0000000000..3d078586f1
--- /dev/null
+++ b/test/AssemblyTests.cmake
@@ -0,0 +1,46 @@
+
+include(split_list)
+
+set(ASM_TEST_FLAGS "")
+check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
+if (BENCHMARK_HAS_O3_FLAG)
+  list(APPEND ASM_TEST_FLAGS -O3)
+endif()
+
+check_cxx_compiler_flag(-g0 BENCHMARK_HAS_G0_FLAG)
+if (BENCHMARK_HAS_G0_FLAG)
+  list(APPEND ASM_TEST_FLAGS -g0)
+endif()
+
+check_cxx_compiler_flag(-fno-stack-protector BENCHMARK_HAS_FNO_STACK_PROTECTOR_FLAG)
+if (BENCHMARK_HAS_FNO_STACK_PROTECTOR_FLAG)
+  list(APPEND ASM_TEST_FLAGS -fno-stack-protector)
+endif()
+
+split_list(ASM_TEST_FLAGS)
+string(TOUPPER "${CMAKE_CXX_COMPILER_ID}" ASM_TEST_COMPILER)
+
+macro(add_filecheck_test name)
+  cmake_parse_arguments(ARG "" "" "CHECK_PREFIXES" ${ARGV})
+  add_library(${name} OBJECT ${name}.cc)
+  set_target_properties(${name} PROPERTIES COMPILE_FLAGS "-S ${ASM_TEST_FLAGS}")
+  set(ASM_OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${name}.s")
+  add_custom_target(copy_${name} ALL
+      COMMAND ${PROJECT_SOURCE_DIR}/tools/strip_asm.py
+        $<TARGET_OBJECTS:${name}>
+        ${ASM_OUTPUT_FILE}
+      BYPRODUCTS ${ASM_OUTPUT_FILE})
+  add_dependencies(copy_${name} ${name})
+  if (NOT ARG_CHECK_PREFIXES)
+    set(ARG_CHECK_PREFIXES "CHECK")
+  endif()
+  foreach(prefix ${ARG_CHECK_PREFIXES})
+    add_test(NAME run_${name}_${prefix}
+        COMMAND
+          ${LLVM_FILECHECK_EXE} ${name}.cc
+          --input-file=${ASM_OUTPUT_FILE}
+          --check-prefixes=CHECK,CHECK-${ASM_TEST_COMPILER}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endforeach()
+endmacro()
+
diff --git a/test/BUILD b/test/BUILD
new file mode 100644
index 0000000000..1f27f99ede
--- /dev/null
+++ b/test/BUILD
@@ -0,0 +1,74 @@
+TEST_COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++11",
+    "-Wall",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+]
+
+PER_SRC_COPTS = ({
+    "cxx03_test.cc": ["-std=c++03"],
+    # Some of the issues with DoNotOptimize only occur when optimization is enabled
+    "donotoptimize_test.cc": ["-O3"],
+})
+
+TEST_ARGS = ["--benchmark_min_time=0.01"]
+
+PER_SRC_TEST_ARGS = ({
+    "user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
+    "repetitions_test.cc": [" --benchmark_repetitions=3"],
+})
+
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+
+cc_library(
+    name = "output_test_helper",
+    testonly = 1,
+    srcs = ["output_test_helper.cc"],
+    hdrs = ["output_test.h"],
+    copts = TEST_COPTS,
+    deps = [
+        "//:benchmark",
+        "//:benchmark_internal_headers",
+    ],
+)
+
+[
+    cc_test(
+        name = test_src[:-len(".cc")],
+        size = "small",
+        srcs = [test_src],
+        args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []),
+        copts = TEST_COPTS + PER_SRC_COPTS.get(test_src, []),
+        deps = [
+            ":output_test_helper",
+            "//:benchmark",
+            "//:benchmark_internal_headers",
+            "@com_google_googletest//:gtest",
+        ] + (
+            ["@com_google_googletest//:gtest_main"] if (test_src[-len("gtest.cc"):] == "gtest.cc") else []
+        ),
+        # FIXME: Add support for assembly tests to bazel.
+        # See Issue #556
+        # https://github.com/google/benchmark/issues/556
+    )
+    for test_src in glob(
+        ["*test.cc"],
+        exclude = [
+            "*_assembly_test.cc",
+            "link_main_test.cc",
+        ],
+    )
+]
+
+cc_test(
+    name = "link_main_test",
+    size = "small",
+    srcs = ["link_main_test.cc"],
+    copts = TEST_COPTS,
+    deps = ["//:benchmark_main"],
+)
diff --git a/test/args_product_test.cc b/test/args_product_test.cc
new file mode 100644
index 0000000000..32a75d50dd
--- /dev/null
+++ b/test/args_product_test.cc
@@ -0,0 +1,77 @@
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <iostream>
+#include <set>
+#include <vector>
+
+class ArgsProductFixture : public ::benchmark::Fixture {
+ public:
+  ArgsProductFixture()
+      : expectedValues({{0, 100, 2000, 30000},
+                        {1, 15, 3, 8},
+                        {1, 15, 3, 9},
+                        {1, 15, 7, 8},
+                        {1, 15, 7, 9},
+                        {1, 15, 10, 8},
+                        {1, 15, 10, 9},
+                        {2, 15, 3, 8},
+                        {2, 15, 3, 9},
+                        {2, 15, 7, 8},
+                        {2, 15, 7, 9},
+                        {2, 15, 10, 8},
+                        {2, 15, 10, 9},
+                        {4, 5, 6, 11}}) {}
+
+  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+    std::vector<int64_t> ranges = {state.range(0), state.range(1),
+                                   state.range(2), state.range(3)};
+
+    assert(expectedValues.find(ranges) != expectedValues.end());
+
+    actualValues.insert(ranges);
+  }
+
+  // NOTE: This is not TearDown as we want to check after _all_ runs are
+  // complete.
+  virtual ~ArgsProductFixture() {
+    if (actualValues != expectedValues) {
+      std::cout << "EXPECTED\n";
+      for (auto v : expectedValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+      std::cout << "ACTUAL\n";
+      for (auto v : actualValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+    }
+  }
+
+  std::set<std::vector<int64_t>> expectedValues;
+  std::set<std::vector<int64_t>> actualValues;
+};
+
+BENCHMARK_DEFINE_F(ArgsProductFixture, Empty)(benchmark::State& state) {
+  for (auto _ : state) {
+    int64_t product =
+        state.range(0) * state.range(1) * state.range(2) * state.range(3);
+    for (int64_t x = 0; x < product; x++) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK_REGISTER_F(ArgsProductFixture, Empty)
+    ->Args({0, 100, 2000, 30000})
+    ->ArgsProduct({{1, 2}, {15}, {3, 7, 10}, {8, 9}})
+    ->Args({4, 5, 6, 11});
+
+BENCHMARK_MAIN();
diff --git a/test/basic_test.cc b/test/basic_test.cc
index 12579c0945..33642211e2 100644
--- a/test/basic_test.cc
+++ b/test/basic_test.cc
@@ -98,7 +98,8 @@ BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
 
 
 void BM_KeepRunning(benchmark::State& state) {
-  size_t iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
+  assert(iter_count == state.iterations());
   while (state.KeepRunning()) {
     ++iter_count;
   }
@@ -107,18 +108,33 @@ void BM_KeepRunning(benchmark::State& state) {
 BENCHMARK(BM_KeepRunning);
 
 void BM_KeepRunningBatch(benchmark::State& state) {
-  // Choose a prime batch size to avoid evenly dividing max_iterations.
-  const size_t batch_size = 101;
-  size_t iter_count = 0;
+  // Choose a batch size >1000 to skip the typical runs with iteration
+  // targets of 10, 100 and 1000.  If these are not actually skipped the
+  // bug would be detectable as consecutive runs with the same iteration
+  // count.  Below we assert that this does not happen.
+  const benchmark::IterationCount batch_size = 1009;
+
+  static benchmark::IterationCount prior_iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
   while (state.KeepRunningBatch(batch_size)) {
     iter_count += batch_size;
   }
   assert(state.iterations() == iter_count);
+
+  // Verify that the iteration count always increases across runs (see
+  // comment above).
+  assert(iter_count == batch_size            // max_iterations == 1
+         || iter_count > prior_iter_count);  // max_iterations > batch_size
+  prior_iter_count = iter_count;
 }
-BENCHMARK(BM_KeepRunningBatch);
+// Register with a fixed repetition count to establish the invariant that
+// the iteration count should always change across runs.  This overrides
+// the --benchmark_repetitions command line flag, which would otherwise
+// cause this test to fail if set > 1.
+BENCHMARK(BM_KeepRunningBatch)->Repetitions(1);
 
 void BM_RangedFor(benchmark::State& state) {
-  size_t iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
   for (auto _ : state) {
     ++iter_count;
   }
@@ -126,4 +142,10 @@ void BM_RangedFor(benchmark::State& state) {
 }
 BENCHMARK(BM_RangedFor);
 
+// Ensure that StateIterator provides all the necessary typedefs required to
+// instantiate std::iterator_traits.
+static_assert(std::is_same<
+  typename std::iterator_traits<benchmark::State::StateIterator>::value_type,
+  typename benchmark::State::StateIterator::value_type>::value, "");
+
 BENCHMARK_MAIN();
diff --git a/test/benchmark_gtest.cc b/test/benchmark_gtest.cc
new file mode 100644
index 0000000000..14a885ba46
--- /dev/null
+++ b/test/benchmark_gtest.cc
@@ -0,0 +1,165 @@
+#include <map>
+#include <string>
+#include <vector>
+
+#include "../src/benchmark_register.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace internal {
+extern std::map<std::string, std::string>* global_context;
+
+namespace {
+
+TEST(AddRangeTest, Simple) {
+  std::vector<int> dst;
+  AddRange(&dst, 1, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2));
+}
+
+TEST(AddRangeTest, Simple64) {
+  std::vector<int64_t> dst;
+  AddRange(&dst, static_cast<int64_t>(1), static_cast<int64_t>(2), 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2));
+}
+
+TEST(AddRangeTest, Advanced) {
+  std::vector<int> dst;
+  AddRange(&dst, 5, 15, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15));
+}
+
+TEST(AddRangeTest, Advanced64) {
+  std::vector<int64_t> dst;
+  AddRange(&dst, static_cast<int64_t>(5), static_cast<int64_t>(15), 2);
+  EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15));
+}
+
+TEST(AddRangeTest, FullRange8) {
+  std::vector<int8_t> dst;
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127));
+}
+
+TEST(AddRangeTest, FullRange64) {
+  std::vector<int64_t> dst;
+  AddRange(&dst, int64_t{1}, std::numeric_limits<int64_t>::max(), 1024);
+  EXPECT_THAT(
+      dst, testing::ElementsAre(1LL, 1024LL, 1048576LL, 1073741824LL,
+                                1099511627776LL, 1125899906842624LL,
+                                1152921504606846976LL, 9223372036854775807LL));
+}
+
+TEST(AddRangeTest, NegativeRanges) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, 0, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0));
+}
+
+TEST(AddRangeTest, StrictlyNegative) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, -1, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1));
+}
+
+TEST(AddRangeTest, SymmetricNegativeRanges) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, 8, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0, 1, 2, 4, 8));
+}
+
+TEST(AddRangeTest, SymmetricNegativeRangesOddMult) {
+  std::vector<int> dst;
+  AddRange(&dst, -30, 32, 5);
+  EXPECT_THAT(dst, testing::ElementsAre(-30, -25, -5, -1, 0, 1, 5, 25, 32));
+}
+
+TEST(AddRangeTest, NegativeRangesAsymmetric) {
+  std::vector<int> dst;
+  AddRange(&dst, -3, 5, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-3, -2, -1, 0, 1, 2, 4, 5));
+}
+
+TEST(AddRangeTest, NegativeRangesLargeStep) {
+  // Always include -1, 0, 1 when crossing zero.
+  std::vector<int> dst;
+  AddRange(&dst, -8, 8, 10);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -1, 0, 1, 8));
+}
+
+TEST(AddRangeTest, ZeroOnlyRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 0, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0));
+}
+
+TEST(AddRangeTest, ZeroStartingRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0, 1, 2));
+}
+
+TEST(AddRangeTest, NegativeRange64) {
+  std::vector<int64_t> dst;
+  AddRange<int64_t>(&dst, -4, 4, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-4, -2, -1, 0, 1, 2, 4));
+}
+
+TEST(AddRangeTest, NegativeRangePreservesExistingOrder) {
+  // If elements already exist in the range, ensure we don't change
+  // their ordering by adding negative values.
+  std::vector<int64_t> dst = {1, 2, 3};
+  AddRange<int64_t>(&dst, -2, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 3, -2, -1, 0, 1, 2));
+}
+
+TEST(AddRangeTest, FullNegativeRange64) {
+  std::vector<int64_t> dst;
+  const auto min = std::numeric_limits<int64_t>::min();
+  const auto max = std::numeric_limits<int64_t>::max();
+  AddRange(&dst, min, max, 1024);
+  EXPECT_THAT(
+      dst, testing::ElementsAreArray(std::vector<int64_t>{
+               min, -1152921504606846976LL, -1125899906842624LL,
+               -1099511627776LL, -1073741824LL, -1048576LL, -1024LL, -1LL, 0LL,
+               1LL, 1024LL, 1048576LL, 1073741824LL, 1099511627776LL,
+               1125899906842624LL, 1152921504606846976LL, max}));
+}
+
+TEST(AddRangeTest, Simple8) {
+  std::vector<int8_t> dst;
+  AddRange<int8_t>(&dst, 1, 8, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
+}
+
+TEST(AddCustomContext, Simple) {
+  EXPECT_THAT(global_context, nullptr);
+
+  AddCustomContext("foo", "bar");
+  AddCustomContext("baz", "qux");
+
+  EXPECT_THAT(*global_context,
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+                                            testing::Pair("baz", "qux")));
+
+  delete global_context;
+  global_context = nullptr;
+}
+
+TEST(AddCustomContext, DuplicateKey) {
+  EXPECT_THAT(global_context, nullptr);
+
+  AddCustomContext("foo", "bar");
+  AddCustomContext("foo", "qux");
+
+  EXPECT_THAT(*global_context,
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar")));
+
+  delete global_context;
+  global_context = nullptr;
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
diff --git a/test/benchmark_name_gtest.cc b/test/benchmark_name_gtest.cc
new file mode 100644
index 0000000000..afb401c1f5
--- /dev/null
+++ b/test/benchmark_name_gtest.cc
@@ -0,0 +1,74 @@
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using namespace benchmark;
+using namespace benchmark::internal;
+
+TEST(BenchmarkNameTest, Empty) {
+  const auto name = BenchmarkName();
+  EXPECT_EQ(name.str(), std::string());
+}
+
+TEST(BenchmarkNameTest, FunctionName) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  EXPECT_EQ(name.str(), "function_name");
+}
+
+TEST(BenchmarkNameTest, FunctionNameAndArgs) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4/5";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/5");
+}
+
+TEST(BenchmarkNameTest, MinTime) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4";
+  name.min_time = "min_time:3.4s";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s");
+}
+
+TEST(BenchmarkNameTest, Iterations) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.iterations = "iterations:42";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/iterations:42");
+}
+
+TEST(BenchmarkNameTest, Repetitions) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.repetitions = "repetitions:24";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/repetitions:24");
+}
+
+TEST(BenchmarkNameTest, TimeType) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.time_type = "hammer_time";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/hammer_time");
+}
+
+TEST(BenchmarkNameTest, Threads) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.threads = "threads:256";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/threads:256");
+}
+
+TEST(BenchmarkNameTest, TestEmptyFunctionName) {
+  auto name = BenchmarkName();
+  name.args = "first:3/second:4";
+  name.threads = "threads:22";
+  EXPECT_EQ(name.str(), "first:3/second:4/threads:22");
+}
+
+}  // end namespace
diff --git a/test/benchmark_random_interleaving_gtest.cc b/test/benchmark_random_interleaving_gtest.cc
new file mode 100644
index 0000000000..f68c680e94
--- /dev/null
+++ b/test/benchmark_random_interleaving_gtest.cc
@@ -0,0 +1,127 @@
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "../src/commandlineflags.h"
+#include "../src/string_util.h"
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+
+BM_DECLARE_bool(benchmark_enable_random_interleaving);
+BM_DECLARE_string(benchmark_filter);
+BM_DECLARE_int32(benchmark_repetitions);
+
+namespace internal {
+namespace {
+
+class EventQueue : public std::queue<std::string> {
+ public:
+  void Put(const std::string& event) { push(event); }
+
+  void Clear() {
+    while (!empty()) {
+      pop();
+    }
+  }
+
+  std::string Get() {
+    std::string event = front();
+    pop();
+    return event;
+  }
+};
+
+static EventQueue* queue = new EventQueue;
+
+class NullReporter : public BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+  static void SetupHook(int /* num_threads */) { queue->push("Setup"); }
+
+  static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }
+
+  void Execute(const std::string& pattern) {
+    queue->Clear();
+
+    BenchmarkReporter* reporter = new NullReporter;
+    FLAGS_benchmark_filter = pattern;
+    RunSpecifiedBenchmarks(reporter);
+    delete reporter;
+
+    queue->Put("DONE");  // End marker
+  }
+};
+
+static void BM_Match1(benchmark::State& state) {
+  const int64_t arg = state.range(0);
+
+  for (auto _ : state) {
+  }
+  queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
+}
+BENCHMARK(BM_Match1)
+    ->Iterations(100)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(3)
+    ->Range(10, 80)
+    ->Args({90})
+    ->Args({100});
+
+TEST_F(BenchmarkTest, Match1) {
+  Execute("BM_Match1");
+  ASSERT_EQ("BM_Match1/1", queue->Get());
+  ASSERT_EQ("BM_Match1/2", queue->Get());
+  ASSERT_EQ("BM_Match1/3", queue->Get());
+  ASSERT_EQ("BM_Match1/10", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/90", queue->Get());
+  ASSERT_EQ("BM_Match1/100", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRepetition) {
+  FLAGS_benchmark_repetitions = 2;
+
+  Execute("BM_Match1/(64|80)");
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
+  FLAGS_benchmark_enable_random_interleaving = true;
+  FLAGS_benchmark_repetitions = 100;
+
+  std::map<std::string, int> element_count;
+  std::map<std::string, int> interleaving_count;
+  Execute("BM_Match1/(64|80)");
+  for (int i = 0; i < 100; ++i) {
+    std::vector<std::string> interleaving;
+    interleaving.push_back(queue->Get());
+    interleaving.push_back(queue->Get());
+    element_count[interleaving[0].c_str()]++;
+    element_count[interleaving[1].c_str()]++;
+    interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
+                                 interleaving[1].c_str())]++;
+  }
+  EXPECT_EQ(element_count["BM_Match1/64"], 100) << "Unexpected repetitions.";
+  EXPECT_EQ(element_count["BM_Match1/80"], 100) << "Unexpected repetitions.";
+  EXPECT_GE(interleaving_count.size(), 2) << "Interleaving was not randomized.";
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index 78802c8da6..3cd4f5565f 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -40,8 +40,8 @@ double CalculatePi(int depth) {
   return (pi - 1.0) * 4;
 }
 
-std::set<int> ConstructRandomSet(int size) {
-  std::set<int> s;
+std::set<int64_t> ConstructRandomSet(int64_t size) {
+  std::set<int64_t> s;
   for (int i = 0; i < size; ++i) s.insert(s.end(), i);
   return s;
 }
@@ -64,7 +64,7 @@ BENCHMARK(BM_Factorial)->UseRealTime();
 
 static void BM_CalculatePiRange(benchmark::State& state) {
   double pi = 0.0;
-  for (auto _ : state) pi = CalculatePi(state.range(0));
+  for (auto _ : state) pi = CalculatePi(static_cast<int>(state.range(0)));
   std::stringstream ss;
   ss << pi;
   state.SetLabel(ss.str());
@@ -74,7 +74,7 @@ BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
 static void BM_CalculatePi(benchmark::State& state) {
   static const int depth = 1024;
   for (auto _ : state) {
-    benchmark::DoNotOptimize(CalculatePi(depth));
+    benchmark::DoNotOptimize(CalculatePi(static_cast<int>(depth)));
   }
 }
 BENCHMARK(BM_CalculatePi)->Threads(8);
@@ -82,7 +82,7 @@ BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
 BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
 
 static void BM_SetInsert(benchmark::State& state) {
-  std::set<int> data;
+  std::set<int64_t> data;
   for (auto _ : state) {
     state.PauseTiming();
     data = ConstructRandomSet(state.range(0));
@@ -103,9 +103,9 @@ static void BM_Sequential(benchmark::State& state) {
   ValueType v = 42;
   for (auto _ : state) {
     Container c;
-    for (int i = state.range(0); --i;) c.push_back(v);
+    for (int64_t i = state.range(0); --i;) c.push_back(v);
   }
-  const size_t items_processed = state.iterations() * state.range(0);
+  const int64_t items_processed = state.iterations() * state.range(0);
   state.SetItemsProcessed(items_processed);
   state.SetBytesProcessed(items_processed * sizeof(v));
 }
@@ -118,8 +118,9 @@ BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
 #endif
 
 static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range(0), '-');
-  std::string s2(state.range(0), '-');
+  size_t len = static_cast<size_t>(state.range(0));
+  std::string s1(len, '-');
+  std::string s2(len, '-');
   for (auto _ : state) benchmark::DoNotOptimize(s1.compare(s2));
 }
 BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
@@ -154,13 +155,13 @@ static void BM_LongTest(benchmark::State& state) {
 BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
 
 static void BM_ParallelMemset(benchmark::State& state) {
-  int size = state.range(0) / static_cast<int>(sizeof(int));
-  int thread_size = size / state.threads;
+  int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
+  int thread_size = static_cast<int>(size) / state.threads;
   int from = thread_size * state.thread_index;
   int to = from + thread_size;
 
   if (state.thread_index == 0) {
-    test_vector = new std::vector<int>(size);
+    test_vector = new std::vector<int>(static_cast<size_t>(size));
   }
 
   for (auto _ : state) {
@@ -178,8 +179,8 @@ static void BM_ParallelMemset(benchmark::State& state) {
 BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
 
 static void BM_ManualTiming(benchmark::State& state) {
-  size_t slept_for = 0;
-  int microseconds = state.range(0);
+  int64_t slept_for = 0;
+  int64_t microseconds = state.range(0);
   std::chrono::duration<double, std::micro> sleep_duration{
       static_cast<double>(microseconds)};
 
diff --git a/test/clobber_memory_assembly_test.cc b/test/clobber_memory_assembly_test.cc
new file mode 100644
index 0000000000..f41911a39c
--- /dev/null
+++ b/test/clobber_memory_assembly_test.cc
@@ -0,0 +1,64 @@
+#include <benchmark/benchmark.h>
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wreturn-type"
+#endif
+
+extern "C" {
+
+extern int ExternInt;
+extern int ExternInt2;
+extern int ExternInt3;
+
+}
+
+// CHECK-LABEL: test_basic:
+extern "C" void test_basic() {
+  int x;
+  benchmark::DoNotOptimize(&x);
+  x = 101;
+  benchmark::ClobberMemory();
+  // CHECK: leaq [[DEST:[^,]+]], %rax
+  // CHECK: movl $101, [[DEST]]
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_redundant_store:
+extern "C" void test_redundant_store() {
+  ExternInt = 3;
+  benchmark::ClobberMemory();
+  ExternInt = 51;
+  // CHECK-DAG: ExternInt
+  // CHECK-DAG: movl $3
+  // CHECK: movl $51
+}
+
+// CHECK-LABEL: test_redundant_read:
+extern "C" void test_redundant_read() {
+  int x;
+  benchmark::DoNotOptimize(&x);
+  x = ExternInt;
+  benchmark::ClobberMemory();
+  x = ExternInt2;
+  // CHECK: leaq [[DEST:[^,]+]], %rax
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, [[DEST]]
+  // CHECK-NOT: ExternInt2
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_redundant_read2:
+extern "C" void test_redundant_read2() {
+  int x;
+  benchmark::DoNotOptimize(&x);
+  x = ExternInt;
+  benchmark::ClobberMemory();
+  x = ExternInt2;
+  benchmark::ClobberMemory();
+  // CHECK: leaq [[DEST:[^,]+]], %rax
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, [[DEST]]
+  // CHECK: ExternInt2(%rip)
+  // CHECK: movl %eax, [[DEST]]
+  // CHECK: ret
+}
diff --git a/test/commandlineflags_gtest.cc b/test/commandlineflags_gtest.cc
new file mode 100644
index 0000000000..8412008ffe
--- /dev/null
+++ b/test/commandlineflags_gtest.cc
@@ -0,0 +1,228 @@
+#include <cstdlib>
+
+#include "../src/commandlineflags.h"
+#include "../src/internal_macros.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace {
+
+#if defined(BENCHMARK_OS_WINDOWS)
+int setenv(const char* name, const char* value, int overwrite) {
+  if (!overwrite) {
+    // NOTE: getenv_s is far superior but not available under mingw.
+    char* env_value = getenv(name);
+    if (env_value == nullptr) {
+      return -1;
+    }
+  }
+  return _putenv_s(name, value);
+}
+
+int unsetenv(const char* name) { return _putenv_s(name, ""); }
+
+#endif  // BENCHMARK_OS_WINDOWS
+
+TEST(BoolFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(BoolFromEnv("not_in_env", true), true);
+}
+
+TEST(BoolFromEnv, False) {
+  ASSERT_EQ(setenv("IN_ENV", "0", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "N", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "n", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "NO", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "No", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "no", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "F", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "f", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "FALSE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "False", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "false", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "OFF", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+}
+
+TEST(BoolFromEnv, True) {
+  ASSERT_EQ(setenv("IN_ENV", "1", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "YES", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "T", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "t", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "TRUE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "True", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "true", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "ON", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "On", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "on", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+#ifndef BENCHMARK_OS_WINDOWS
+  ASSERT_EQ(setenv("IN_ENV", "", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+#endif
+}
+
+TEST(Int32FromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(Int32FromEnv("not_in_env", 42), 42);
+}
+
+TEST(Int32FromEnv, InvalidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 42), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(Int32FromEnv, ValidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "42", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 64), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(DoubleFromEnv("not_in_env", 0.51), 0.51);
+}
+
+TEST(DoubleFromEnv, InvalidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.51), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, ValidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "0.51", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.71), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(StringFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_STREQ(StringFromEnv("not_in_env", "foo"), "foo");
+}
+
+TEST(StringFromEnv, Valid) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_STREQ(StringFromEnv("in_env", "bar"), "foo");
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_THAT(KvPairsFromEnv("not_in_env", {{"foo", "bar"}}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+}
+
+TEST(KvPairsFromEnv, MalformedReturnsDefault) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {{"foo", "bar"}}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Single) {
+  ASSERT_EQ(setenv("IN_ENV", "foo=bar", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Multiple) {
+  ASSERT_EQ(setenv("IN_ENV", "foo=bar,baz=qux", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+                                            testing::Pair("baz", "qux")));
+  unsetenv("IN_ENV");
+}
+
+}  // namespace
+}  // namespace benchmark
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 89dfa580e6..0de73c5722 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -12,9 +12,11 @@ namespace {
 #define ADD_COMPLEXITY_CASES(...) \
   int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
-int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name,
-                      std::string big_o) {
-  SetSubstitutions({{"%bigo_name", big_o_test_name},
+int AddComplexityTest(std::string test_name, std::string big_o_test_name,
+                      std::string rms_test_name, std::string big_o,
+                      int family_index) {
+  SetSubstitutions({{"%name", test_name},
+                    {"%bigo_name", big_o_test_name},
                     {"%rms_name", rms_test_name},
                     {"%bigo_str", "[ ]* %float " + big_o},
                     {"%bigo", big_o},
@@ -24,15 +26,31 @@ int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name,
       {{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
        {"^%bigo_name", MR_Not},  // Assert we we didn't only matched a name.
        {"^%rms_name %rms %rms[ ]*$", MR_Next}});
-  AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
-                        {"\"cpu_coefficient\": %float,$", MR_Next},
-                        {"\"real_coefficient\": %float,$", MR_Next},
-                        {"\"big_o\": \"%bigo\",$", MR_Next},
-                        {"\"time_unit\": \"ns\"$", MR_Next},
-                        {"}", MR_Next},
-                        {"\"name\": \"%rms_name\",$"},
-                        {"\"rms\": %float$", MR_Next},
-                        {"}", MR_Next}});
+  AddCases(
+      TC_JSONOut,
+      {{"\"name\": \"%bigo_name\",$"},
+       {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+       {"\"per_family_instance_index\": 0,$", MR_Next},
+       {"\"run_name\": \"%name\",$", MR_Next},
+       {"\"run_type\": \"aggregate\",$", MR_Next},
+       {"\"repetitions\": %int,$", MR_Next},
+       {"\"threads\": 1,$", MR_Next},
+       {"\"aggregate_name\": \"BigO\",$", MR_Next},
+       {"\"cpu_coefficient\": %float,$", MR_Next},
+       {"\"real_coefficient\": %float,$", MR_Next},
+       {"\"big_o\": \"%bigo\",$", MR_Next},
+       {"\"time_unit\": \"ns\"$", MR_Next},
+       {"}", MR_Next},
+       {"\"name\": \"%rms_name\",$"},
+       {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+       {"\"per_family_instance_index\": 0,$", MR_Next},
+       {"\"run_name\": \"%name\",$", MR_Next},
+       {"\"run_type\": \"aggregate\",$", MR_Next},
+       {"\"repetitions\": %int,$", MR_Next},
+       {"\"threads\": 1,$", MR_Next},
+       {"\"aggregate_name\": \"RMS\",$", MR_Next},
+       {"\"rms\": %float$", MR_Next},
+       {"}", MR_Next}});
   AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
                        {"^\"%bigo_name\"", MR_Not},
                        {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}});
@@ -55,10 +73,11 @@ void BM_Complexity_O1(benchmark::State& state) {
 }
 BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
 BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity();
-BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int) {
-  return 1.0;
-});
+BENCHMARK(BM_Complexity_O1)
+    ->Range(1, 1 << 18)
+    ->Complexity([](benchmark::IterationCount) { return 1.0; });
 
+const char *one_test_name = "BM_Complexity_O1";
 const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
 const char *rms_o_1_test_name = "BM_Complexity_O1_RMS";
 const char *enum_big_o_1 = "\\([0-9]+\\)";
@@ -69,31 +88,34 @@ const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)";
 const char *lambda_big_o_1 = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, enum_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     enum_big_o_1, /*family_index=*/0);
 
 // Add auto enum tests
-ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, auto_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     auto_big_o_1, /*family_index=*/1);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     lambda_big_o_1, /*family_index=*/2);
 
 // ========================================================================= //
 // --------------------------- Testing BigO O(N) --------------------------- //
 // ========================================================================= //
 
-std::vector<int> ConstructRandomVector(int size) {
+std::vector<int> ConstructRandomVector(int64_t size) {
   std::vector<int> v;
-  v.reserve(size);
+  v.reserve(static_cast<int>(size));
   for (int i = 0; i < size; ++i) {
-    v.push_back(std::rand() % size);
+    v.push_back(static_cast<int>(std::rand() % size));
   }
   return v;
 }
 
 void BM_Complexity_O_N(benchmark::State& state) {
   auto v = ConstructRandomVector(state.range(0));
-  const int item_not_in_vector =
-      state.range(0) * 2;  // Test worst case scenario (item not in vector)
+  // Test worst case scenario (item not in vector)
+  const int64_t item_not_in_vector = state.range(0) * 2;
   for (auto _ : state) {
     benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
   }
@@ -106,22 +128,27 @@ BENCHMARK(BM_Complexity_O_N)
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int n) -> double { return n; });
+    ->Complexity([](benchmark::IterationCount n) -> double {
+      return static_cast<double>(n);
+    });
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
     ->Complexity();
 
+const char *n_test_name = "BM_Complexity_O_N";
 const char *big_o_n_test_name = "BM_Complexity_O_N_BigO";
 const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS";
 const char *enum_auto_big_o_n = "N";
 const char *lambda_big_o_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n);
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     enum_auto_big_o_n, /*family_index=*/3);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n);
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     lambda_big_o_n, /*family_index=*/4);
 
 // ========================================================================= //
 // ------------------------- Testing BigO O(N*lgN) ------------------------- //
@@ -134,6 +161,7 @@ static void BM_Complexity_O_N_log_N(benchmark::State& state) {
   }
   state.SetComplexityN(state.range(0));
 }
+static const double kLog2E = 1.44269504088896340736;
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
@@ -141,24 +169,51 @@ BENCHMARK(BM_Complexity_O_N_log_N)
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int n) { return n * log2(n); });
+    ->Complexity([](benchmark::IterationCount n) {
+      return kLog2E * n * log(static_cast<double>(n));
+    });
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
     ->Complexity();
 
+const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N";
 const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
 const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
 const char *enum_auto_big_o_n_lg_n = "NlgN";
 const char *lambda_big_o_n_lg_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name,
-                     enum_auto_big_o_n_lg_n);
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
+                     /*family_index=*/6);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name,
-                     lambda_big_o_n_lg_n);
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
+                     /*family_index=*/7);
+
+// ========================================================================= //
+// -------- Testing formatting of Complexity with captured args ------------ //
+// ========================================================================= //
+
+void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  state.SetComplexityN(n);
+}
+
+BENCHMARK_CAPTURE(BM_ComplexityCaptureArgs, capture_test, 100)
+    ->Complexity(benchmark::oN)
+    ->Ranges({{1, 2}, {3, 4}});
+
+const std::string complexity_capture_name =
+    "BM_ComplexityCaptureArgs/capture_test";
+
+ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
+                     complexity_capture_name + "_RMS", "N", /*family_index=*/9);
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc
index baa9ed9262..c4c9a52273 100644
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@@ -14,7 +14,7 @@
 
 void BM_empty(benchmark::State& state) {
   while (state.KeepRunning()) {
-    volatile std::size_t x = state.iterations();
+    volatile benchmark::IterationCount x = state.iterations();
     ((void)x);
   }
 }
diff --git a/test/display_aggregates_only_test.cc b/test/display_aggregates_only_test.cc
new file mode 100644
index 0000000000..3c36d3f03c
--- /dev/null
+++ b/test/display_aggregates_only_test.cc
@@ -0,0 +1,43 @@
+
+#undef NDEBUG
+#include <cstdio>
+#include <string>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// Ok this test is super ugly. We want to check what happens with the file
+// reporter in the presence of DisplayAggregatesOnly().
+// We do not care about console output, the normal tests check that already.
+
+void BM_SummaryRepeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly();
+
+int main(int argc, char* argv[]) {
+  const std::string output = GetFileReporterOutput(argc, argv);
+
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
+          1) {
+    std::cout << "Precondition mismatch. Expected to only find 6 "
+                 "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "output:\n";
+    std::cout << output;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/test/donotoptimize_assembly_test.cc b/test/donotoptimize_assembly_test.cc
new file mode 100644
index 0000000000..d4b0bab70e
--- /dev/null
+++ b/test/donotoptimize_assembly_test.cc
@@ -0,0 +1,163 @@
+#include <benchmark/benchmark.h>
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wreturn-type"
+#endif
+
+extern "C" {
+
+extern int ExternInt;
+extern int ExternInt2;
+extern int ExternInt3;
+
+inline int Add42(int x) { return x + 42; }
+
+struct NotTriviallyCopyable {
+  NotTriviallyCopyable();
+  explicit NotTriviallyCopyable(int x) : value(x) {}
+  NotTriviallyCopyable(NotTriviallyCopyable const&);
+  int value;
+};
+
+struct Large {
+  int value;
+  int data[2];
+};
+
+}
+// CHECK-LABEL: test_with_rvalue:
+extern "C" void test_with_rvalue() {
+  benchmark::DoNotOptimize(Add42(0));
+  // CHECK: movl $42, %eax
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_large_rvalue:
+extern "C" void test_with_large_rvalue() {
+  benchmark::DoNotOptimize(Large{ExternInt, {ExternInt, ExternInt}});
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]]
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_non_trivial_rvalue:
+extern "C" void test_with_non_trivial_rvalue() {
+  benchmark::DoNotOptimize(NotTriviallyCopyable(ExternInt));
+  // CHECK: mov{{l|q}} ExternInt(%rip)
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_lvalue:
+extern "C" void test_with_lvalue() {
+  int x = 101;
+  benchmark::DoNotOptimize(x);
+  // CHECK-GNU: movl $101, %eax
+  // CHECK-CLANG: movl $101, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_large_lvalue:
+extern "C" void test_with_large_lvalue() {
+  Large L{ExternInt, {ExternInt, ExternInt}};
+  benchmark::DoNotOptimize(L);
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_non_trivial_lvalue:
+extern "C" void test_with_non_trivial_lvalue() {
+  NotTriviallyCopyable NTC(ExternInt);
+  benchmark::DoNotOptimize(NTC);
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_const_lvalue:
+extern "C" void test_with_const_lvalue() {
+  const int x = 123;
+  benchmark::DoNotOptimize(x);
+  // CHECK: movl $123, %eax
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_large_const_lvalue:
+extern "C" void test_with_large_const_lvalue() {
+  const Large L{ExternInt, {ExternInt, ExternInt}};
+  benchmark::DoNotOptimize(L);
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_non_trivial_const_lvalue:
+extern "C" void test_with_non_trivial_const_lvalue() {
+  const NotTriviallyCopyable Obj(ExternInt);
+  benchmark::DoNotOptimize(Obj);
+  // CHECK: mov{{q|l}} ExternInt(%rip)
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_div_by_two:
+extern "C" int test_div_by_two(int input) {
+  int divisor = 2;
+  benchmark::DoNotOptimize(divisor);
+  return input / divisor;
+  // CHECK: movl $2, [[DEST:.*]]
+  // CHECK: idivl [[DEST]]
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_inc_integer:
+extern "C" int test_inc_integer() {
+  int x = 0;
+  for (int i=0; i < 5; ++i)
+    benchmark::DoNotOptimize(++x);
+  // CHECK: movl $1, [[DEST:.*]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK-CLANG: movl [[DEST]], %eax
+  // CHECK: ret
+  return x;
+}
+
+// CHECK-LABEL: test_pointer_rvalue
+extern "C" void test_pointer_rvalue() {
+  // CHECK: movl $42, [[DEST:.*]]
+  // CHECK: leaq [[DEST]], %rax
+  // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+  int x = 42;
+  benchmark::DoNotOptimize(&x);
+}
+
+// CHECK-LABEL: test_pointer_const_lvalue:
+extern "C" void test_pointer_const_lvalue() {
+  // CHECK: movl $42, [[DEST:.*]]
+  // CHECK: leaq [[DEST]], %rax
+  // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+  int x = 42;
+  int * const xp = &x;
+  benchmark::DoNotOptimize(xp);
+}
+
+// CHECK-LABEL: test_pointer_lvalue:
+extern "C" void test_pointer_lvalue() {
+  // CHECK: movl $42, [[DEST:.*]]
+  // CHECK: leaq [[DEST]], %rax
+  // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z+]+]])
+  // CHECK: ret
+  int x = 42;
+  int *xp = &x;
+  benchmark::DoNotOptimize(xp);
+}
diff --git a/test/donotoptimize_test.cc b/test/donotoptimize_test.cc
index a705654a26..2ce92d1c72 100644
--- a/test/donotoptimize_test.cc
+++ b/test/donotoptimize_test.cc
@@ -28,13 +28,13 @@ struct BitRef {
 int main(int, char*[]) {
   // this test verifies compilation of DoNotOptimize() for some types
 
-  char buffer8[8];
+  char buffer8[8] = "";
   benchmark::DoNotOptimize(buffer8);
 
-  char buffer20[20];
+  char buffer20[20] = "";
   benchmark::DoNotOptimize(buffer20);
 
-  char buffer1024[1024];
+  char buffer1024[1024] = "";
   benchmark::DoNotOptimize(buffer1024);
   benchmark::DoNotOptimize(&buffer1024[0]);
 
diff --git a/test/filter_test.cc b/test/filter_test.cc
index 0e27065c15..1c198913b3 100644
--- a/test/filter_test.cc
+++ b/test/filter_test.cc
@@ -1,36 +1,41 @@
-#include "benchmark/benchmark.h"
-
+#include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdint>
 #include <cstdlib>
-
 #include <iostream>
 #include <limits>
 #include <sstream>
 #include <string>
 
+#include "benchmark/benchmark.h"
+
 namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) {
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     ++count_;
+    max_family_index_ =
+        std::max<size_t>(max_family_index_, report[0].family_index);
     ConsoleReporter::ReportRuns(report);
   };
 
-  TestReporter() : count_(0) {}
+  TestReporter() : count_(0), max_family_index_(0) {}
 
   virtual ~TestReporter() {}
 
   size_t GetCount() const { return count_; }
 
+  size_t GetMaxFamilyIndex() const { return max_family_index_; }
+
  private:
   mutable size_t count_;
+  mutable size_t max_family_index_;
 };
 
 }  // end namespace
@@ -98,6 +103,15 @@ int main(int argc, char **argv) {
                 << std::endl;
       return -1;
     }
+
+    const size_t max_family_index = test_reporter.GetMaxFamilyIndex();
+    const size_t num_families = reports_count == 0 ? 0 : 1 + max_family_index;
+    if (num_families != expected_reports) {
+      std::cerr << "ERROR: Expected " << expected_reports
+                << " test families to be run but num_families = "
+                << num_families << std::endl;
+      return -1;
+    }
   }
 
   return 0;
diff --git a/test/fixture_test.cc b/test/fixture_test.cc
index 1462b10f02..eba0a42d9c 100644
--- a/test/fixture_test.cc
+++ b/test/fixture_test.cc
@@ -4,35 +4,37 @@
 #include <cassert>
 #include <memory>
 
-class MyFixture : public ::benchmark::Fixture {
+#define FIXTURE_BECHMARK_NAME MyFixture
+
+class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State& state) {
+  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
     if (state.thread_index == 0) {
       assert(data.get() == nullptr);
       data.reset(new int(42));
     }
   }
 
-  void TearDown(const ::benchmark::State& state) {
+  void TearDown(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
     if (state.thread_index == 0) {
       assert(data.get() != nullptr);
       data.reset();
     }
   }
 
-  ~MyFixture() { assert(data == nullptr); }
+  ~FIXTURE_BECHMARK_NAME() { assert(data == nullptr); }
 
   std::unique_ptr<int> data;
 };
 
-BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) {
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State &st) {
   assert(data.get() != nullptr);
   assert(*data == 42);
   for (auto _ : st) {
   }
 }
 
-BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
+BENCHMARK_DEFINE_F(FIXTURE_BECHMARK_NAME, Bar)(benchmark::State& st) {
   if (st.thread_index == 0) {
     assert(data.get() != nullptr);
     assert(*data == 42);
@@ -43,7 +45,7 @@ BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
   }
   st.SetItemsProcessed(st.range(0));
 }
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42);
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42)->ThreadPerCpu();
 
 BENCHMARK_MAIN();
diff --git a/test/internal_threading_test.cc b/test/internal_threading_test.cc
new file mode 100644
index 0000000000..039d7c14a8
--- /dev/null
+++ b/test/internal_threading_test.cc
@@ -0,0 +1,184 @@
+
+#undef NDEBUG
+
+#include <chrono>
+#include <thread>
+#include "../src/timers.h"
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+static const std::chrono::duration<double, std::milli> time_frame(50);
+static const double time_frame_in_sec(
+    std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(
+        time_frame)
+        .count());
+
+void MyBusySpinwait() {
+  const auto start = benchmark::ChronoClockNow();
+
+  while (true) {
+    const auto now = benchmark::ChronoClockNow();
+    const auto elapsed = now - start;
+
+    if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >=
+        time_frame)
+      return;
+  }
+}
+
+// ========================================================================= //
+// --------------------------- TEST CASES BEGIN ---------------------------- //
+// ========================================================================= //
+
+// ========================================================================= //
+// BM_MainThread
+
+void BM_MainThread(benchmark::State& state) {
+  for (auto _ : state) {
+    MyBusySpinwait();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseRealTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseManualTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseRealTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseManualTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// BM_WorkerThread
+
+void BM_WorkerThread(benchmark::State& state) {
+  for (auto _ : state) {
+    std::thread Worker(&MyBusySpinwait);
+    Worker.join();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseRealTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseManualTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseRealTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseManualTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// BM_MainThreadAndWorkerThread
+
+void BM_MainThreadAndWorkerThread(benchmark::State& state) {
+  for (auto _ : state) {
+    std::thread Worker(&MyBusySpinwait);
+    MyBusySpinwait();
+    Worker.join();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->UseManualTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->UseManualTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// ---------------------------- TEST CASES END ----------------------------- //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/link_main_test.cc b/test/link_main_test.cc
new file mode 100644
index 0000000000..241ad5c390
--- /dev/null
+++ b/test/link_main_test.cc
@@ -0,0 +1,8 @@
+#include "benchmark/benchmark.h"
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_empty);
diff --git a/test/map_test.cc b/test/map_test.cc
index 311d2d22b8..86391b3601 100644
--- a/test/map_test.cc
+++ b/test/map_test.cc
@@ -8,7 +8,7 @@ namespace {
 std::map<int, int> ConstructRandomMap(int size) {
   std::map<int, int> m;
   for (int i = 0; i < size; ++i) {
-    m.insert(std::make_pair(rand() % size, rand() % size));
+    m.insert(std::make_pair(std::rand() % size, std::rand() % size));
   }
   return m;
 }
@@ -17,14 +17,14 @@ std::map<int, int> ConstructRandomMap(int size) {
 
 // Basic version.
 static void BM_MapLookup(benchmark::State& state) {
-  const int size = state.range(0);
+  const int size = static_cast<int>(state.range(0));
   std::map<int, int> m;
   for (auto _ : state) {
     state.PauseTiming();
     m = ConstructRandomMap(size);
     state.ResumeTiming();
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(rand() % size));
+      benchmark::DoNotOptimize(m.find(std::rand() % size));
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
@@ -34,20 +34,20 @@ BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12);
 // Using fixtures.
 class MapFixture : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State& st) {
-    m = ConstructRandomMap(st.range(0));
+  void SetUp(const ::benchmark::State& st) BENCHMARK_OVERRIDE {
+    m = ConstructRandomMap(static_cast<int>(st.range(0)));
   }
 
-  void TearDown(const ::benchmark::State&) { m.clear(); }
+  void TearDown(const ::benchmark::State&) BENCHMARK_OVERRIDE { m.clear(); }
 
   std::map<int, int> m;
 };
 
 BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) {
-  const int size = state.range(0);
+  const int size = static_cast<int>(state.range(0));
   for (auto _ : state) {
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(rand() % size));
+      benchmark::DoNotOptimize(m.find(std::rand() % size));
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
diff --git a/test/memory_manager_test.cc b/test/memory_manager_test.cc
new file mode 100644
index 0000000000..f0c192fcbd
--- /dev/null
+++ b/test/memory_manager_test.cc
@@ -0,0 +1,46 @@
+#include <memory>
+
+#include "../src/check.h"
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+class TestMemoryManager : public benchmark::MemoryManager {
+  void Start() BENCHMARK_OVERRIDE {}
+  void Stop(Result* result) BENCHMARK_OVERRIDE {
+    result->num_allocs = 42;
+    result->max_bytes_used = 42000;
+  }
+};
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_empty);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_empty\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"allocs_per_iter\": %float,$", MR_Next},
+                       {"\"max_bytes_used\": 42000$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_empty\",%csv_report$"}});
+
+int main(int argc, char* argv[]) {
+  std::unique_ptr<benchmark::MemoryManager> mm(new TestMemoryManager());
+
+  benchmark::RegisterMemoryManager(mm.get());
+  RunOutputTests(argc, argv);
+  benchmark::RegisterMemoryManager(nullptr);
+}
diff --git a/test/multiple_ranges_test.cc b/test/multiple_ranges_test.cc
index 0a82382f3c..6b61f3af47 100644
--- a/test/multiple_ranges_test.cc
+++ b/test/multiple_ranges_test.cc
@@ -1,7 +1,9 @@
 #include "benchmark/benchmark.h"
 
 #include <cassert>
+#include <iostream>
 #include <set>
+#include <vector>
 
 class MultipleRangesFixture : public ::benchmark::Fixture {
  public:
@@ -26,26 +28,46 @@ class MultipleRangesFixture : public ::benchmark::Fixture {
                         {2, 7, 15},
                         {7, 6, 3}}) {}
 
-  void SetUp(const ::benchmark::State& state) {
-    std::vector<int> ranges = {state.range(0), state.range(1), state.range(2)};
+  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+    std::vector<int64_t> ranges = {state.range(0), state.range(1),
+                                   state.range(2)};
 
     assert(expectedValues.find(ranges) != expectedValues.end());
 
     actualValues.insert(ranges);
   }
 
+  // NOTE: This is not TearDown as we want to check after _all_ runs are
+  // complete.
   virtual ~MultipleRangesFixture() {
-    assert(actualValues.size() == expectedValues.size());
+    if (actualValues != expectedValues) {
+      std::cout << "EXPECTED\n";
+      for (auto v : expectedValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+      std::cout << "ACTUAL\n";
+      for (auto v : actualValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+    }
   }
 
-  std::set<std::vector<int>> expectedValues;
-  std::set<std::vector<int>> actualValues;
+  std::set<std::vector<int64_t>> expectedValues;
+  std::set<std::vector<int64_t>> actualValues;
 };
 
 BENCHMARK_DEFINE_F(MultipleRangesFixture, Empty)(benchmark::State& state) {
   for (auto _ : state) {
-    int product = state.range(0) * state.range(1) * state.range(2);
-    for (int x = 0; x < product; x++) {
+    int64_t product = state.range(0) * state.range(1) * state.range(2);
+    for (int64_t x = 0; x < product; x++) {
       benchmark::DoNotOptimize(x);
     }
   }
diff --git a/test/options_test.cc b/test/options_test.cc
index fdec69174e..9f9a78667c 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -25,6 +25,7 @@ BENCHMARK(BM_basic)->Arg(42);
 BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond);
 BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kSecond);
 BENCHMARK(BM_basic)->Range(1, 8);
 BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8);
 BENCHMARK(BM_basic)->DenseRange(10, 15);
@@ -35,6 +36,16 @@ BENCHMARK(BM_basic)->UseRealTime();
 BENCHMARK(BM_basic)->ThreadRange(2, 4);
 BENCHMARK(BM_basic)->ThreadPerCpu();
 BENCHMARK(BM_basic)->Repetitions(3);
+BENCHMARK(BM_basic)
+    ->RangeMultiplier(std::numeric_limits<int>::max())
+    ->Range(std::numeric_limits<int64_t>::min(),
+            std::numeric_limits<int64_t>::max());
+
+// Negative ranges
+BENCHMARK(BM_basic)->Range(-64, -1);
+BENCHMARK(BM_basic)->RangeMultiplier(4)->Range(-8, 8);
+BENCHMARK(BM_basic)->DenseRange(-2, 2, 1);
+BENCHMARK(BM_basic)->Ranges({{-64, 1}, {-8, -1}});
 
 void CustomArgs(benchmark::internal::Benchmark* b) {
   for (int i = 0; i < 10; ++i) {
diff --git a/test/output_test.h b/test/output_test.h
index 897a13866b..3b12f3777d 100644
--- a/test/output_test.h
+++ b/test/output_test.h
@@ -2,13 +2,13 @@
 #define TEST_OUTPUT_TEST_H
 
 #undef NDEBUG
+#include <functional>
 #include <initializer_list>
 #include <memory>
+#include <sstream>
 #include <string>
 #include <utility>
 #include <vector>
-#include <functional>
-#include <sstream>
 
 #include "../src/re.h"
 #include "benchmark/benchmark.h"
@@ -60,6 +60,13 @@ int SetSubstitutions(
 // Run all output tests.
 void RunOutputTests(int argc, char* argv[]);
 
+// Count the number of 'pat' substrings in the 'haystack' string.
+int SubstrCnt(const std::string& haystack, const std::string& pat);
+
+// Run registered benchmarks with file reporter enabled, and return the content
+// outputted by the file reporter.
+std::string GetFileReporterOutput(int argc, char* argv[]);
+
 // ========================================================================= //
 // ------------------------- Results checking ------------------------------ //
 // ========================================================================= //
@@ -73,26 +80,27 @@ void RunOutputTests(int argc, char* argv[]);
 //                  will be the subject of a call to checker_function
 // checker_function: should be of type ResultsCheckFn (see below)
 #define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \
-    size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function)
+  size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function)
 
 struct Results;
-typedef std::function< void(Results const&) > ResultsCheckFn;
+typedef std::function<void(Results const&)> ResultsCheckFn;
 
 size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn);
 
 // Class holding the results of a benchmark.
 // It is passed in calls to checker functions.
 struct Results {
-
   // the benchmark name
   std::string name;
   // the benchmark fields
-  std::map< std::string, std::string > values;
+  std::map<std::string, std::string> values;
 
   Results(const std::string& n) : name(n) {}
 
   int NumThreads() const;
 
+  double NumIterations() const;
+
   typedef enum { kCpuTime, kRealTime } BenchmarkTime;
 
   // get cpu_time or real_time in seconds
@@ -102,18 +110,18 @@ struct Results {
   // it is better to use fuzzy float checks for this, as the float
   // ASCII formatting is lossy.
   double DurationRealTime() const {
-    return GetAs< double >("iterations") * GetTime(kRealTime);
+    return NumIterations() * GetTime(kRealTime);
   }
   // get the cpu_time duration of the benchmark in seconds
   double DurationCPUTime() const {
-    return GetAs< double >("iterations") * GetTime(kCpuTime);
+    return NumIterations() * GetTime(kCpuTime);
   }
 
   // get the string for a result by name, or nullptr if the name
   // is not found
   const std::string* Get(const char* entry_name) const {
     auto it = values.find(entry_name);
-    if(it == values.end()) return nullptr;
+    if (it == values.end()) return nullptr;
     return &it->second;
   }
 
@@ -126,21 +134,21 @@ struct Results {
   // as a double, and only then converted to the asked type.
   template <class T>
   T GetCounterAs(const char* entry_name) const {
-    double dval = GetAs< double >(entry_name);
-    T tval = static_cast< T >(dval);
+    double dval = GetAs<double>(entry_name);
+    T tval = static_cast<T>(dval);
     return tval;
   }
 };
 
 template <class T>
 T Results::GetAs(const char* entry_name) const {
-  auto *sv = Get(entry_name);
-  CHECK(sv != nullptr && !sv->empty());
+  auto* sv = Get(entry_name);
+  BM_CHECK(sv != nullptr && !sv->empty());
   std::stringstream ss;
   ss << *sv;
   T out;
   ss >> out;
-  CHECK(!ss.fail());
+  BM_CHECK(!ss.fail());
   return out;
 }
 
@@ -148,8 +156,10 @@ T Results::GetAs(const char* entry_name) const {
 // Macros to help in result checking. Do not use them with arguments causing
 // side-effects.
 
-#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \
-    CONCAT(CHECK_, relationship)                                        \
+// clang-format off
+
+#define CHECK_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value) \
+    CONCAT(BM_CHECK_, relationship)                                        \
     (entry.getfn< var_type >(var_name), (value)) << "\n"                \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
     << __FILE__ << ":" << __LINE__ << ": "                              \
@@ -159,8 +169,8 @@ T Results::GetAs(const char* entry_name) const {
 
 // check with tolerance. eps_factor is the tolerance window, which is
 // interpreted relative to value (eg, 0.1 means 10% of value).
-#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
-    CONCAT(CHECK_FLOAT_, relationship)                                  \
+#define CHECK_FLOAT_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
+    CONCAT(BM_CHECK_FLOAT_, relationship)                                  \
     (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
     << __FILE__ << ":" << __LINE__ << ": "                              \
@@ -177,16 +187,18 @@ T Results::GetAs(const char* entry_name) const {
     << "%)"
 
 #define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \
-    _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value)
+    CHECK_RESULT_VALUE_IMPL(entry, GetAs, var_type, var_name, relationship, value)
 
 #define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \
-    _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value)
+    CHECK_RESULT_VALUE_IMPL(entry, GetCounterAs, var_type, var_name, relationship, value)
 
 #define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \
-    _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor)
+    CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetAs, double, var_name, relationship, value, eps_factor)
 
 #define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \
-    _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+    CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+
+// clang-format on
 
 // ========================================================================= //
 // --------------------------- Misc Utilities ------------------------------ //
diff --git a/test/output_test_helper.cc b/test/output_test_helper.cc
index 24746f6d27..4603459987 100644
--- a/test/output_test_helper.cc
+++ b/test/output_test_helper.cc
@@ -1,13 +1,18 @@
+#include <cstdio>
+#include <cstring>
+#include <fstream>
 #include <iostream>
 #include <map>
 #include <memory>
+#include <random>
 #include <sstream>
-#include <cstring>
+#include <streambuf>
 
+#include "../src/benchmark_api_internal.h"
 #include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "../src/log.h"    // NOTE: log.h is for internal use only
 #include "../src/re.h"     // NOTE: re.h is for internal use only
 #include "output_test.h"
-#include "../src/benchmark_api_internal.h"
 
 // ========================================================================= //
 // ------------------------------ Internals -------------------------------- //
@@ -33,21 +38,30 @@ TestCaseList& GetTestCaseList(TestCaseID ID) {
 
 SubMap& GetSubstitutions() {
   // Don't use 'dec_re' from header because it may not yet be initialized.
+  // clang-format off
   static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
+  static std::string time_re = "([0-9]+[.])?[0-9]+";
   static SubMap map = {
       {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
       // human-readable float
       {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
       {"%int", "[ ]*[0-9]+"},
       {" %s ", "[ ]+"},
-      {"%time", "[ ]*[0-9]{1,5} ns"},
-      {"%console_report", "[ ]*[0-9]{1,5} ns [ ]*[0-9]{1,5} ns [ ]*[0-9]+"},
-      {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"},
+      {"%time", "[ ]*" + time_re + "[ ]+ns"},
+      {"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"},
+      {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
+      {"%console_ms_report", "[ ]*" + time_re + "[ ]+ms [ ]*" + time_re + "[ ]+ms [ ]*[0-9]+"},
+      {"%console_s_report", "[ ]*" + time_re + "[ ]+s [ ]*" + time_re + "[ ]+s [ ]*[0-9]+"},
+      {"%console_time_only_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns"},
+      {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
+      {"%console_us_time_only_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us"},
       {"%csv_header",
        "name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
        "items_per_second,label,error_occurred,error_message"},
       {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"},
       {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
+      {"%csv_ms_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ms,,,,,"},
+      {"%csv_s_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",s,,,,,"},
       {"%csv_bytes_report",
        "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
       {"%csv_items_report",
@@ -57,6 +71,7 @@ SubMap& GetSubstitutions() {
        "," + safe_dec_re + ",,,"},
       {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"},
       {"%csv_label_report_end", ",,"}};
+  // clang-format on
   return map;
 }
 
@@ -80,27 +95,27 @@ void CheckCase(std::stringstream& remaining_output, TestCase const& TC,
   bool on_first = true;
   std::string line;
   while (remaining_output.eof() == false) {
-    CHECK(remaining_output.good());
+    BM_CHECK(remaining_output.good());
     std::getline(remaining_output, line);
     if (on_first) {
       first_line = line;
       on_first = false;
     }
     for (const auto& NC : not_checks) {
-      CHECK(!NC.regex->Match(line))
+      BM_CHECK(!NC.regex->Match(line))
           << "Unexpected match for line \"" << line << "\" for MR_Not regex \""
           << NC.regex_str << "\""
           << "\n    actual regex string \"" << TC.substituted_regex << "\""
           << "\n    started matching near: " << first_line;
     }
     if (TC.regex->Match(line)) return;
-    CHECK(TC.match_rule != MR_Next)
+    BM_CHECK(TC.match_rule != MR_Next)
         << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str
         << "\""
         << "\n    actual regex string \"" << TC.substituted_regex << "\""
         << "\n    started matching near: " << first_line;
   }
-  CHECK(remaining_output.eof() == false)
+  BM_CHECK(remaining_output.eof() == false)
       << "End of output reached before match for regex \"" << TC.regex_str
       << "\" was found"
       << "\n    actual regex string \"" << TC.substituted_regex << "\""
@@ -125,12 +140,12 @@ class TestReporter : public benchmark::BenchmarkReporter {
   TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
       : reporters_(reps) {}
 
-  virtual bool ReportContext(const Context& context) {
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
     bool last_ret = false;
     bool first = true;
     for (auto rep : reporters_) {
       bool new_ret = rep->ReportContext(context);
-      CHECK(first || new_ret == last_ret)
+      BM_CHECK(first || new_ret == last_ret)
           << "Reports return different values for ReportContext";
       first = false;
       last_ret = new_ret;
@@ -139,17 +154,17 @@ class TestReporter : public benchmark::BenchmarkReporter {
     return last_ret;
   }
 
-  void ReportRuns(const std::vector<Run>& report) {
+  void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     for (auto rep : reporters_) rep->ReportRuns(report);
   }
-  void Finalize() {
+  void Finalize() BENCHMARK_OVERRIDE {
     for (auto rep : reporters_) rep->Finalize();
   }
 
  private:
-  std::vector<benchmark::BenchmarkReporter *> reporters_;
+  std::vector<benchmark::BenchmarkReporter*> reporters_;
 };
-}
+}  // namespace
 
 }  // end namespace internal
 
@@ -163,28 +178,25 @@ namespace internal {
 // It works by parsing the CSV output to read the results.
 class ResultsChecker {
  public:
-
-  struct PatternAndFn : public TestCase { // reusing TestCase for its regexes
+  struct PatternAndFn : public TestCase {  // reusing TestCase for its regexes
     PatternAndFn(const std::string& rx, ResultsCheckFn fn_)
-    : TestCase(rx), fn(fn_) {}
+        : TestCase(rx), fn(fn_) {}
     ResultsCheckFn fn;
   };
 
-  std::vector< PatternAndFn > check_patterns;
-  std::vector< Results > results;
-  std::vector< std::string > field_names;
+  std::vector<PatternAndFn> check_patterns;
+  std::vector<Results> results;
+  std::vector<std::string> field_names;
 
   void Add(const std::string& entry_pattern, ResultsCheckFn fn);
 
   void CheckResults(std::stringstream& output);
 
  private:
-
   void SetHeader_(const std::string& csv_header);
   void SetValues_(const std::string& entry_csv_line);
 
-  std::vector< std::string > SplitCsv_(const std::string& line);
-
+  std::vector<std::string> SplitCsv_(const std::string& line);
 };
 
 // store the static ResultsChecker in a function to prevent initialization
@@ -203,11 +215,11 @@ void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) {
 void ResultsChecker::CheckResults(std::stringstream& output) {
   // first reset the stream to the start
   {
-    auto start = std::ios::streampos(0);
+    auto start = std::stringstream::pos_type(0);
     // clear before calling tellg()
     output.clear();
     // seek to zero only when needed
-    if(output.tellg() > start) output.seekg(start);
+    if (output.tellg() > start) output.seekg(start);
     // and just in case
     output.clear();
   }
@@ -215,29 +227,29 @@ void ResultsChecker::CheckResults(std::stringstream& output) {
   std::string line;
   bool on_first = true;
   while (output.eof() == false) {
-    CHECK(output.good());
+    BM_CHECK(output.good());
     std::getline(output, line);
     if (on_first) {
-      SetHeader_(line); // this is important
+      SetHeader_(line);  // this is important
       on_first = false;
       continue;
     }
     SetValues_(line);
   }
   // finally we can call the subscribed check functions
-  for(const auto& p : check_patterns) {
-    VLOG(2) << "--------------------------------\n";
-    VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
-    for(const auto& r : results) {
-      if(!p.regex->Match(r.name)) {
-        VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
+  for (const auto& p : check_patterns) {
+    BM_VLOG(2) << "--------------------------------\n";
+    BM_VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
+    for (const auto& r : results) {
+      if (!p.regex->Match(r.name)) {
+        BM_VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
         continue;
       } else {
-        VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
+        BM_VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
       }
-      VLOG(1) << "Checking results of " << r.name << ": ... \n";
+      BM_VLOG(1) << "Checking results of " << r.name << ": ... \n";
       p.fn(r);
-      VLOG(1) << "Checking results of " << r.name << ": OK.\n";
+      BM_VLOG(1) << "Checking results of " << r.name << ": OK.\n";
     }
   }
 }
@@ -249,76 +261,79 @@ void ResultsChecker::SetHeader_(const std::string& csv_header) {
 
 // set the values for a benchmark
 void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
-  if(entry_csv_line.empty()) return; // some lines are empty
-  CHECK(!field_names.empty());
+  if (entry_csv_line.empty()) return;  // some lines are empty
+  BM_CHECK(!field_names.empty());
   auto vals = SplitCsv_(entry_csv_line);
-  CHECK_EQ(vals.size(), field_names.size());
-  results.emplace_back(vals[0]); // vals[0] is the benchmark name
-  auto &entry = results.back();
+  BM_CHECK_EQ(vals.size(), field_names.size());
+  results.emplace_back(vals[0]);  // vals[0] is the benchmark name
+  auto& entry = results.back();
   for (size_t i = 1, e = vals.size(); i < e; ++i) {
     entry.values[field_names[i]] = vals[i];
   }
 }
 
 // a quick'n'dirty csv splitter (eliminating quotes)
-std::vector< std::string > ResultsChecker::SplitCsv_(const std::string& line) {
-  std::vector< std::string > out;
-  if(line.empty()) return out;
-  if(!field_names.empty()) out.reserve(field_names.size());
+std::vector<std::string> ResultsChecker::SplitCsv_(const std::string& line) {
+  std::vector<std::string> out;
+  if (line.empty()) return out;
+  if (!field_names.empty()) out.reserve(field_names.size());
   size_t prev = 0, pos = line.find_first_of(','), curr = pos;
-  while(pos != line.npos) {
-    CHECK(curr > 0);
-    if(line[prev] == '"') ++prev;
-    if(line[curr-1] == '"') --curr;
-    out.push_back(line.substr(prev, curr-prev));
+  while (pos != line.npos) {
+    BM_CHECK(curr > 0);
+    if (line[prev] == '"') ++prev;
+    if (line[curr - 1] == '"') --curr;
+    out.push_back(line.substr(prev, curr - prev));
     prev = pos + 1;
     pos = line.find_first_of(',', pos + 1);
     curr = pos;
   }
   curr = line.size();
-  if(line[prev] == '"') ++prev;
-  if(line[curr-1] == '"') --curr;
-  out.push_back(line.substr(prev, curr-prev));
+  if (line[prev] == '"') ++prev;
+  if (line[curr - 1] == '"') --curr;
+  out.push_back(line.substr(prev, curr - prev));
   return out;
 }
 
 }  // end namespace internal
 
-size_t AddChecker(const char* bm_name, ResultsCheckFn fn)
-{
-  auto &rc = internal::GetResultsChecker();
+size_t AddChecker(const char* bm_name, ResultsCheckFn fn) {
+  auto& rc = internal::GetResultsChecker();
   rc.Add(bm_name, fn);
   return rc.results.size();
 }
 
 int Results::NumThreads() const {
   auto pos = name.find("/threads:");
-  if(pos == name.npos) return 1;
+  if (pos == name.npos) return 1;
   auto end = name.find('/', pos + 9);
   std::stringstream ss;
   ss << name.substr(pos + 9, end);
   int num = 1;
   ss >> num;
-  CHECK(!ss.fail());
+  BM_CHECK(!ss.fail());
   return num;
 }
 
+double Results::NumIterations() const {
+  return GetAs<double>("iterations");
+}
+
 double Results::GetTime(BenchmarkTime which) const {
-  CHECK(which == kCpuTime || which == kRealTime);
-  const char *which_str = which == kCpuTime ? "cpu_time" : "real_time";
-  double val = GetAs< double >(which_str);
+  BM_CHECK(which == kCpuTime || which == kRealTime);
+  const char* which_str = which == kCpuTime ? "cpu_time" : "real_time";
+  double val = GetAs<double>(which_str);
   auto unit = Get("time_unit");
-  CHECK(unit);
-  if(*unit == "ns") {
+  BM_CHECK(unit);
+  if (*unit == "ns") {
     return val * 1.e-9;
-  } else if(*unit == "us") {
+  } else if (*unit == "us") {
     return val * 1.e-6;
-  } else if(*unit == "ms") {
+  } else if (*unit == "ms") {
     return val * 1.e-3;
-  } else if(*unit == "s") {
+  } else if (*unit == "s") {
     return val;
   } else {
-    CHECK(1 == 0) << "unknown time unit: " << *unit;
+    BM_CHECK(1 == 0) << "unknown time unit: " << *unit;
     return 0;
   }
 }
@@ -333,11 +348,11 @@ TestCase::TestCase(std::string re, int rule)
       substituted_regex(internal::PerformSubstitutions(regex_str)),
       regex(std::make_shared<benchmark::Regex>()) {
   std::string err_str;
-  regex->Init(substituted_regex,& err_str);
-  CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex
-                         << "\""
-                         << "\n    originally \"" << regex_str << "\""
-                         << "\n    got error: " << err_str;
+  regex->Init(substituted_regex, &err_str);
+  BM_CHECK(err_str.empty())
+      << "Could not construct regex \"" << substituted_regex << "\""
+      << "\n    originally \"" << regex_str << "\""
+      << "\n    got error: " << err_str;
 }
 
 int AddCases(TestCaseID ID, std::initializer_list<TestCase> il) {
@@ -364,10 +379,16 @@ int SetSubstitutions(
   return 0;
 }
 
+// Disable deprecated warnings temporarily because we need to reference
+// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
 void RunOutputTests(int argc, char* argv[]) {
   using internal::GetTestCaseList;
   benchmark::Initialize(&argc, argv);
-  auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/true);
+  auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/ true);
   benchmark::ConsoleReporter CR(options);
   benchmark::JSONReporter JR;
   benchmark::CSVReporter CSVR;
@@ -416,8 +437,85 @@ void RunOutputTests(int argc, char* argv[]) {
 
   // now that we know the output is as expected, we can dispatch
   // the checks to subscribees.
-  auto &csv = TestCases[2];
+  auto& csv = TestCases[2];
   // would use == but gcc spits a warning
-  CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+  BM_CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
   internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+int SubstrCnt(const std::string& haystack, const std::string& pat) {
+  if (pat.length() == 0) return 0;
+  int count = 0;
+  for (size_t offset = haystack.find(pat); offset != std::string::npos;
+       offset = haystack.find(pat, offset + pat.length()))
+    ++count;
+  return count;
+}
+
+static char ToHex(int ch) {
+  return ch < 10 ? static_cast<char>('0' + ch)
+                 : static_cast<char>('a' + (ch - 10));
+}
+
+static char RandomHexChar() {
+  static std::mt19937 rd{std::random_device{}()};
+  static std::uniform_int_distribution<int> mrand{0, 15};
+  return ToHex(mrand(rd));
+}
+
+static std::string GetRandomFileName() {
+  std::string model = "test.%%%%%%";
+  for (auto & ch :  model) {
+    if (ch == '%')
+      ch = RandomHexChar();
+  }
+  return model;
+}
+
+static bool FileExists(std::string const& name) {
+  std::ifstream in(name.c_str());
+  return in.good();
+}
+
+static std::string GetTempFileName() {
+  // This function attempts to avoid race conditions where two tests
+  // create the same file at the same time. However, it still introduces races
+  // similar to tmpnam.
+  int retries = 3;
+  while (--retries) {
+    std::string name = GetRandomFileName();
+    if (!FileExists(name))
+      return name;
+  }
+  std::cerr << "Failed to create unique temporary file name" << std::endl;
+  std::abort();
+}
+
+std::string GetFileReporterOutput(int argc, char* argv[]) {
+  std::vector<char*> new_argv(argv, argv + argc);
+  assert(static_cast<decltype(new_argv)::size_type>(argc) == new_argv.size());
+
+  std::string tmp_file_name = GetTempFileName();
+  std::cout << "Will be using this as the tmp file: " << tmp_file_name << '\n';
+
+  std::string tmp = "--benchmark_out=";
+  tmp += tmp_file_name;
+  new_argv.emplace_back(const_cast<char*>(tmp.c_str()));
+
+  argc = int(new_argv.size());
+
+  benchmark::Initialize(&argc, new_argv.data());
+  benchmark::RunSpecifiedBenchmarks();
+
+  // Read the output back from the file, and delete the file.
+  std::ifstream tmp_stream(tmp_file_name);
+  std::string output = std::string((std::istreambuf_iterator<char>(tmp_stream)),
+                                   std::istreambuf_iterator<char>());
+  std::remove(tmp_file_name.c_str());
+
+  return output;
+}
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
new file mode 100644
index 0000000000..ddeddf4f4d
--- /dev/null
+++ b/test/perf_counters_gtest.cc
@@ -0,0 +1,145 @@
+#include <thread>
+
+#include "../src/perf_counters.h"
+#include "gtest/gtest.h"
+
+#ifndef GTEST_SKIP
+struct MsgHandler {
+  void operator=(std::ostream&){}
+};
+#define GTEST_SKIP() return MsgHandler() = std::cout
+#endif
+
+using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCounterValues;
+
+namespace {
+const char kGenericPerfEvent1[] = "CYCLES";
+const char kGenericPerfEvent2[] = "BRANCHES";
+const char kGenericPerfEvent3[] = "INSTRUCTIONS";
+
+TEST(PerfCountersTest, Init) {
+  EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
+}
+
+TEST(PerfCountersTest, OneCounter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Performance counters not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
+}
+
+TEST(PerfCountersTest, NegativeTest) {
+  if (!PerfCounters::kSupported) {
+    EXPECT_FALSE(PerfCounters::Initialize());
+    return;
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_FALSE(PerfCounters::Create({}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1})
+          .IsValid());
+  EXPECT_FALSE(PerfCounters::Create({kGenericPerfEvent3, "not a counter name",
+                                     kGenericPerfEvent1})
+                   .IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                            kGenericPerfEvent3, "MISPREDICTED_BRANCH_RETIRED"})
+          .IsValid());
+}
+
+TEST(PerfCountersTest, Read1Counter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters = PerfCounters::Create({kGenericPerfEvent1});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(1);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  PerfCounterValues values2(1);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[0], values1[0]);
+}
+
+TEST(PerfCountersTest, Read2Counters) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(2);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  EXPECT_GT(values1[1], 0);
+  PerfCounterValues values2(2);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[1], 0);
+}
+
+size_t do_work() {
+  size_t res = 0;
+  for (size_t i = 0; i < 100000000; ++i) res += i * i;
+  return res;
+}
+
+void measure(size_t threadcount, PerfCounterValues* values1,
+             PerfCounterValues* values2) {
+  BM_CHECK_NE(values1, nullptr);
+  BM_CHECK_NE(values2, nullptr);
+  std::vector<std::thread> threads(threadcount);
+  auto work = [&]() { BM_CHECK(do_work() > 1000); };
+
+  // We need to first set up the counters, then start the threads, so the
+  // threads would inherit the counters. But later, we need to first destroy the
+  // thread pool (so all the work finishes), then measure the counters. So the
+  // scopes overlap, and we need to explicitly control the scope of the
+  // threadpool.
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
+  for (auto& t : threads) t = std::thread(work);
+  counters.Snapshot(values1);
+  for (auto& t : threads) t.join();
+  counters.Snapshot(values2);
+}
+
+TEST(PerfCountersTest, MultiThreaded) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  PerfCounterValues values1(2);
+  PerfCounterValues values2(2);
+
+  measure(2, &values1, &values2);
+  std::vector<double> D1{static_cast<double>(values2[0] - values1[0]),
+                         static_cast<double>(values2[1] - values1[1])};
+
+  measure(4, &values1, &values2);
+  std::vector<double> D2{static_cast<double>(values2[0] - values1[0]),
+                         static_cast<double>(values2[1] - values1[1])};
+
+  // Some extra work will happen on the main thread - like joining the threads
+  // - so the ratio won't be quite 2.0, but very close.
+  EXPECT_GE(D2[0], 1.9 * D1[0]);
+  EXPECT_GE(D2[1], 1.9 * D1[1]);
+}
+}  // namespace
diff --git a/test/perf_counters_test.cc b/test/perf_counters_test.cc
new file mode 100644
index 0000000000..d6e0284d4d
--- /dev/null
+++ b/test/perf_counters_test.cc
@@ -0,0 +1,27 @@
+#undef NDEBUG
+
+#include "../src/perf_counters.h"
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+void BM_Simple(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_Simple);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
+
+void CheckSimple(Results const& e) {
+  CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
+  CHECK_COUNTER_VALUE(e, double, "BRANCHES", GT, 0.0);
+}
+CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
+
+int main(int argc, char* argv[]) {
+  if (!benchmark::internal::PerfCounters::kSupported) {
+    return 0;
+  }
+  RunOutputTests(argc, argv);
+}
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
index 8ab2c29939..8613acba4f 100644
--- a/test/register_benchmark_test.cc
+++ b/test/register_benchmark_test.cc
@@ -10,7 +10,7 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }
@@ -29,14 +29,16 @@ struct TestCase {
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
-    CHECK(name == run.benchmark_name) << "expected " << name << " got "
-                                      << run.benchmark_name;
+    // clang-format off
+    BM_CHECK(name == run.benchmark_name()) << "expected " << name << " got "
+                                      << run.benchmark_name();
     if (label) {
-      CHECK(run.report_label == label) << "expected " << label << " got "
+      BM_CHECK(run.report_label == label) << "expected " << label << " got "
                                        << run.report_label;
     } else {
-      CHECK(run.report_label == "");
+      BM_CHECK(run.report_label == "");
     }
+    // clang-format on
   }
 };
 
diff --git a/test/repetitions_test.cc b/test/repetitions_test.cc
new file mode 100644
index 0000000000..f93de502a3
--- /dev/null
+++ b/test/repetitions_test.cc
@@ -0,0 +1,208 @@
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------ Testing Basic Output --------------------------- //
+// ========================================================================= //
+
+void BM_ExplicitRepetitions(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_ExplicitRepetitions)->Repetitions(2);
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Basic Output --------------------------- //
+// ========================================================================= //
+
+void BM_ImplicitRepetitions(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_ImplicitRepetitions);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_mean\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_median\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_stddev\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/report_aggregates_only_test.cc b/test/report_aggregates_only_test.cc
new file mode 100644
index 0000000000..9646b9be53
--- /dev/null
+++ b/test/report_aggregates_only_test.cc
@@ -0,0 +1,39 @@
+
+#undef NDEBUG
+#include <cstdio>
+#include <string>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// Ok this test is super ugly. We want to check what happens with the file
+// reporter in the presence of ReportAggregatesOnly().
+// We do not care about console output, the normal tests check that already.
+
+void BM_SummaryRepeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
+
+int main(int argc, char* argv[]) {
+  const std::string output = GetFileReporterOutput(argc, argv);
+
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
+          1) {
+    std::cout << "Precondition mismatch. Expected to only find three "
+                 "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "output:\n";
+    std::cout << output;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index dcc15f0471..c10625d5bc 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -9,33 +9,37 @@
 // ---------------------- Testing Prologue Output -------------------------- //
 // ========================================================================= //
 
-ADD_CASES(TC_ConsoleOut,
-          {{"^[-]+$", MR_Next},
-           {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
-           {"^[-]+$", MR_Next}});
+ADD_CASES(TC_ConsoleOut, {{"^[-]+$", MR_Next},
+                          {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
+                          {"^[-]+$", MR_Next}});
 static int AddContextCases() {
   AddCases(TC_ConsoleErr,
            {
-               {"%int[-/]%int[-/]%int %int:%int:%int$", MR_Default},
-               {"Run on \\(%int X %float MHz CPU s\\)", MR_Next},
+               {"^%int-%int-%intT%int:%int:%int[-+]%int:%int$", MR_Default},
+               {"Running .*/reporter_output_test(\\.exe)?$", MR_Next},
+               {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next},
            });
-  AddCases(TC_JSONOut, {{"^\\{", MR_Default},
-                        {"\"context\":", MR_Next},
-                        {"\"date\": \"", MR_Next},
-                        {"\"num_cpus\": %int,$", MR_Next},
-                        {"\"mhz_per_cpu\": %float,$", MR_Next},
-                        {"\"cpu_scaling_enabled\": ", MR_Next},
-                        {"\"caches\": \\[$", MR_Next}});
-  auto const& Caches = benchmark::CPUInfo::Get().caches;
+  AddCases(TC_JSONOut,
+           {{"^\\{", MR_Default},
+            {"\"context\":", MR_Next},
+            {"\"date\": \"", MR_Next},
+            {"\"host_name\":", MR_Next},
+            {"\"executable\": \".*(/|\\\\)reporter_output_test(\\.exe)?\",",
+             MR_Next},
+            {"\"num_cpus\": %int,$", MR_Next},
+            {"\"mhz_per_cpu\": %float,$", MR_Next},
+            {"\"caches\": \\[$", MR_Default}});
+  auto const& Info = benchmark::CPUInfo::Get();
+  auto const& Caches = Info.caches;
   if (!Caches.empty()) {
     AddCases(TC_ConsoleErr, {{"CPU Caches:$", MR_Next}});
   }
   for (size_t I = 0; I < Caches.size(); ++I) {
     std::string num_caches_str =
         Caches[I].num_sharing != 0 ? " \\(x%int\\)$" : "$";
-    AddCases(
-        TC_ConsoleErr,
-        {{"L%int (Data|Instruction|Unified) %intK" + num_caches_str, MR_Next}});
+    AddCases(TC_ConsoleErr,
+             {{"L%int (Data|Instruction|Unified) %int KiB" + num_caches_str,
+               MR_Next}});
     AddCases(TC_JSONOut, {{"\\{$", MR_Next},
                           {"\"type\": \"", MR_Next},
                           {"\"level\": %int,$", MR_Next},
@@ -43,8 +47,13 @@ static int AddContextCases() {
                           {"\"num_sharing\": %int$", MR_Next},
                           {"}[,]{0,1}$", MR_Next}});
   }
-
   AddCases(TC_JSONOut, {{"],$"}});
+  auto const& LoadAvg = Info.load_avg;
+  if (!LoadAvg.empty()) {
+    AddCases(TC_ConsoleErr,
+             {{"Load Average: (%float, ){0,2}%float$", MR_Next}});
+  }
+  AddCases(TC_JSONOut, {{"\"load_avg\": \\[(%float,?){0,3}],$", MR_Next}});
   return 0;
 }
 int dummy_register = AddContextCases();
@@ -62,6 +71,13 @@ BENCHMARK(BM_basic);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_basic\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -75,14 +91,23 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
 
 void BM_bytes_per_second(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.SetBytesProcessed(1);
 }
 BENCHMARK(BM_bytes_per_second);
 
-ADD_CASES(TC_ConsoleOut,
-          {{"^BM_bytes_per_second %console_report +%float[kM]{0,1}B/s$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report "
+                           "bytes_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_bytes_per_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -97,14 +122,23 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
 
 void BM_items_per_second(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.SetItemsProcessed(1);
 }
 BENCHMARK(BM_items_per_second);
 
-ADD_CASES(TC_ConsoleOut,
-          {{"^BM_items_per_second %console_report +%float[kM]{0,1} items/s$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report "
+                           "items_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
+                       {"\"family_index\": 2,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_items_per_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -126,6 +160,13 @@ BENCHMARK(BM_label);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
+                       {"\"family_index\": 3,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_label\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -135,6 +176,101 @@ ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
 ADD_CASES(TC_CSVOut, {{"^\"BM_label\",%csv_label_report_begin\"some "
                        "label\"%csv_label_report_end$"}});
 
+// ========================================================================= //
+// ------------------------ Testing Time Label Output ---------------------- //
+// ========================================================================= //
+
+void BM_time_label_nanosecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_nanosecond)->Unit(benchmark::kNanosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_nanosecond %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_nanosecond\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_nanosecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_nanosecond\",%csv_report$"}});
+
+void BM_time_label_microsecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_microsecond)->Unit(benchmark::kMicrosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_microsecond %console_us_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_microsecond\",$"},
+           {"\"family_index\": 5,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_microsecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"us\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_microsecond\",%csv_us_report$"}});
+
+void BM_time_label_millisecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_millisecond)->Unit(benchmark::kMillisecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_millisecond %console_ms_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_millisecond\",$"},
+           {"\"family_index\": 6,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_millisecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ms\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_millisecond\",%csv_ms_report$"}});
+
+void BM_time_label_second(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_second)->Unit(benchmark::kSecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_second %console_s_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_time_label_second\",$"},
+                       {"\"family_index\": 7,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_time_label_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"s\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_second\",%csv_s_report$"}});
+
 // ========================================================================= //
 // ------------------------ Testing Error Output --------------------------- //
 // ========================================================================= //
@@ -147,6 +283,13 @@ void BM_error(benchmark::State& state) {
 BENCHMARK(BM_error);
 ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"},
+                       {"\"family_index\": 8,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_error\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"error_occurred\": true,$", MR_Next},
                        {"\"error_message\": \"message\",$", MR_Next}});
 
@@ -163,7 +306,14 @@ void BM_no_arg_name(benchmark::State& state) {
 }
 BENCHMARK(BM_no_arg_name)->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"},
+                       {"\"family_index\": 9,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
 
 // ========================================================================= //
@@ -176,7 +326,14 @@ void BM_arg_name(benchmark::State& state) {
 }
 BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"},
+                       {"\"family_index\": 10,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}});
 
 // ========================================================================= //
@@ -190,15 +347,63 @@ void BM_arg_names(benchmark::State& state) {
 BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"});
 ADD_CASES(TC_ConsoleOut,
           {{"^BM_arg_names/first:2/5/third:4 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"},
+           {"\"family_index\": 11,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}});
 
+// ========================================================================= //
+// ------------------------ Testing Name Output ---------------------------- //
+// ========================================================================= //
+
+void BM_name(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_name)->Name("BM_custom_name");
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_custom_name %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_custom_name\",$"},
+                       {"\"family_index\": 12,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_custom_name\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_custom_name\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Big Args Output ------------------------ //
+// ========================================================================= //
+
+void BM_BigArgs(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_BigArgs)->RangeMultiplier(2)->Range(1U << 30U, 1U << 31U);
+ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"},
+                          {"^BM_BigArgs/2147483648 %console_report$"}});
+
 // ========================================================================= //
 // ----------------------- Testing Complexity Output ----------------------- //
 // ========================================================================= //
 
 void BM_Complexity_O1(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.SetComplexityN(state.range(0));
 }
@@ -219,16 +424,55 @@ void BM_Repeat(benchmark::State& state) {
 }
 // need two repetitions min to be able to output any aggregate output
 BENCHMARK(BM_Repeat)->Repetitions(2);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:2 %console_report$"},
-                          {"^BM_Repeat/repeats:2 %console_report$"},
-                          {"^BM_Repeat/repeats:2_mean %console_report$"},
-                          {"^BM_Repeat/repeats:2_median %console_report$"},
-                          {"^BM_Repeat/repeats:2_stddev %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:2 %console_report$"},
+           {"^BM_Repeat/repeats:2 %console_report$"},
+           {"^BM_Repeat/repeats:2_mean %console_time_only_report [ ]*2$"},
+           {"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"},
+           {"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\"", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_mean\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_median\",$"},
-                       {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"}});
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2_mean\",%csv_report$"},
@@ -236,18 +480,64 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2_stddev\",%csv_report$"}});
 // but for two repetitions, mean and median is the same, so let's repeat..
 BENCHMARK(BM_Repeat)->Repetitions(3);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:3 %console_report$"},
-                          {"^BM_Repeat/repeats:3 %console_report$"},
-                          {"^BM_Repeat/repeats:3 %console_report$"},
-                          {"^BM_Repeat/repeats:3_mean %console_report$"},
-                          {"^BM_Repeat/repeats:3_median %console_report$"},
-                          {"^BM_Repeat/repeats:3_stddev %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3_mean %console_time_only_report [ ]*3$"},
+           {"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"},
+           {"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_median\",$"},
-                       {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}});
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
@@ -256,20 +546,73 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}});
 // median differs between even/odd number of repetitions, so just to be sure
 BENCHMARK(BM_Repeat)->Repetitions(4);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4_mean %console_report$"},
-                          {"^BM_Repeat/repeats:4_median %console_report$"},
-                          {"^BM_Repeat/repeats:4_stddev %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4_mean %console_time_only_report [ ]*4$"},
+           {"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"},
+           {"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_mean\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_median\",$"},
-                       {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"}});
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:4\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:4\",%csv_report$"},
@@ -286,7 +629,14 @@ void BM_RepeatOnce(benchmark::State& state) {
 }
 BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
 ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"},
+                       {"\"family_index\": 18,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}});
 
 // Test that non-aggregate data is not reported
@@ -295,20 +645,96 @@ void BM_SummaryRepeat(benchmark::State& state) {
   }
 }
 BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
-ADD_CASES(TC_ConsoleOut,
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+     {"^BM_SummaryRepeat/repeats:3_mean %console_time_only_report [ ]*3$"},
+     {"^BM_SummaryRepeat/repeats:3_median %console_time_only_report [ ]*3$"},
+     {"^BM_SummaryRepeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
+ADD_CASES(TC_JSONOut,
           {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
-           {"^BM_SummaryRepeat/repeats:3_mean %console_report$"},
-           {"^BM_SummaryRepeat/repeats:3_median %console_report$"},
-           {"^BM_SummaryRepeat/repeats:3_stddev %console_report$"}});
-ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
-                       {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
-                       {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
-                       {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}});
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
                       {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
                       {"^\"BM_SummaryRepeat/repeats:3_median\",%csv_report$"},
                       {"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}});
 
+// Test that non-aggregate data is not displayed.
+// NOTE: this test is kinda bad. we are only testing the display output.
+//       But we don't check that the file output still contains everything...
+void BM_SummaryDisplay(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryDisplay)->Repetitions(2)->DisplayAggregatesOnly();
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+     {"^BM_SummaryDisplay/repeats:2_mean %console_time_only_report [ ]*2$"},
+     {"^BM_SummaryDisplay/repeats:2_median %console_time_only_report [ ]*2$"},
+     {"^BM_SummaryDisplay/repeats:2_stddev %console_time_only_report [ ]*2$"}});
+ADD_CASES(TC_JSONOut,
+          {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+           {"^\"BM_SummaryDisplay/repeats:2_mean\",%csv_report$"},
+           {"^\"BM_SummaryDisplay/repeats:2_median\",%csv_report$"},
+           {"^\"BM_SummaryDisplay/repeats:2_stddev\",%csv_report$"}});
+
+// Test repeats with custom time unit.
 void BM_RepeatTimeUnit(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -317,18 +743,46 @@ BENCHMARK(BM_RepeatTimeUnit)
     ->Repetitions(3)
     ->ReportAggregatesOnly()
     ->Unit(benchmark::kMicrosecond);
-ADD_CASES(TC_ConsoleOut,
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+     {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_time_only_report [ ]*3$"},
+     {"^BM_RepeatTimeUnit/repeats:3_median %console_us_time_only_report [ "
+      "]*3$"},
+     {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_time_only_report [ "
+      "]*3$"}});
+ADD_CASES(TC_JSONOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
-           {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_report$"},
-           {"^BM_RepeatTimeUnit/repeats:3_median %console_us_report$"},
-           {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_report$"}});
-ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
-                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
-                       {"\"time_unit\": \"us\",?$"},
-                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
-                       {"\"time_unit\": \"us\",?$"},
-                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
-                       {"\"time_unit\": \"us\",?$"}});
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"},
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"},
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"}});
 ADD_CASES(TC_CSVOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
            {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"},
@@ -344,34 +798,156 @@ const auto UserStatistics = [](const std::vector<double>& v) {
 };
 void BM_UserStats(benchmark::State& state) {
   for (auto _ : state) {
+    state.SetIterationTime(150 / 10e8);
   }
 }
+// clang-format off
 BENCHMARK(BM_UserStats)
-    ->Repetitions(3)
-    ->ComputeStatistics("", UserStatistics);
+  ->Repetitions(3)
+  ->Iterations(5)
+  ->UseManualTime()
+  ->ComputeStatistics("", UserStatistics);
+// clang-format on
+
 // check that user-provided stats is calculated, and is after the default-ones
 // empty string as name is intentional, it would sort before anything else
-ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/repeats:3 %console_report$"},
-                          {"^BM_UserStats/repeats:3 %console_report$"},
-                          {"^BM_UserStats/repeats:3 %console_report$"},
-                          {"^BM_UserStats/repeats:3_mean %console_report$"},
-                          {"^BM_UserStats/repeats:3_median %console_report$"},
-                          {"^BM_UserStats/repeats:3_stddev %console_report$"},
-                          {"^BM_UserStats/repeats:3_ %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_UserStats/repeats:3\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_mean\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_median\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_stddev\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_\",$"}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_UserStats/repeats:3\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_mean\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_median\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_stddev\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_\",%csv_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_mean [ ]* 150 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_median [ ]* 150 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_stddev [ ]* 0.000 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time_ "
+                           "[ ]* 150 ns %time [ ]*3$"}});
+ADD_CASES(
+    TC_JSONOut,
+    {{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 0,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 1,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 2,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"mean\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"median\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"stddev\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": %float,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/"
+      "manual_time_median\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/"
+      "manual_time_stddev\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------- Testing StrEscape JSON ------------------------ //
+// ========================================================================= //
+#if 0  // enable when csv testing code correctly handles multi-line fields
+void BM_JSON_Format(benchmark::State& state) {
+  state.SkipWithError("val\b\f\n\r\t\\\"with\"es,capes");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_JSON_Format);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_JSON_Format\",$"},
+                                              {"\"family_index\": 23,$", MR_Next},
+{"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_JSON_Format\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"error_occurred\": true,$", MR_Next},
+                       {R"("error_message": "val\\b\\f\\n\\r\\t\\\\\\"with\\"es,capes",$)", MR_Next}});
+#endif
+// ========================================================================= //
+// -------------------------- Testing CsvEscape ---------------------------- //
+// ========================================================================= //
+
+void BM_CSV_Format(benchmark::State& state) {
+  state.SkipWithError("\"freedom\"");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_CSV_Format);
+ADD_CASES(TC_CSVOut, {{"^\"BM_CSV_Format\",,,,,,,,true,\"\"\"freedom\"\"\"$"}});
 
 void BM_JSONInputTest(benchmark::State& st) {
   for (auto _ : st) {
diff --git a/test/skip_with_error_test.cc b/test/skip_with_error_test.cc
index 8d2c342a9a..1156bc0ad6 100644
--- a/test/skip_with_error_test.cc
+++ b/test/skip_with_error_test.cc
@@ -10,11 +10,11 @@ namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) {
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }
@@ -33,14 +33,14 @@ struct TestCase {
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
-    CHECK(name == run.benchmark_name) << "expected " << name << " got "
-                                      << run.benchmark_name;
-    CHECK(error_occurred == run.error_occurred);
-    CHECK(error_message == run.error_message);
+    BM_CHECK(name == run.benchmark_name())
+        << "expected " << name << " got " << run.benchmark_name();
+    BM_CHECK(error_occurred == run.error_occurred);
+    BM_CHECK(error_message == run.error_message);
     if (error_occurred) {
-      // CHECK(run.iterations == 0);
+      // BM_CHECK(run.iterations == 0);
     } else {
-      CHECK(run.iterations != 0);
+      BM_CHECK(run.iterations != 0);
     }
   }
 };
@@ -61,6 +61,12 @@ int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
 
 }  // end namespace
 
+void BM_error_no_running(benchmark::State& state) {
+  state.SkipWithError("error message");
+}
+BENCHMARK(BM_error_no_running);
+ADD_CASES("BM_error_no_running", {{"", true, "error message"}});
+
 void BM_error_before_running(benchmark::State& state) {
   state.SkipWithError("error message");
   while (state.KeepRunning()) {
@@ -70,7 +76,6 @@ void BM_error_before_running(benchmark::State& state) {
 BENCHMARK(BM_error_before_running);
 ADD_CASES("BM_error_before_running", {{"", true, "error message"}});
 
-
 void BM_error_before_running_batch(benchmark::State& state) {
   state.SkipWithError("error message");
   while (state.KeepRunningBatch(17)) {
@@ -124,7 +129,7 @@ void BM_error_during_running_ranged_for(benchmark::State& state) {
       // Test the unfortunate but documented behavior that the ranged-for loop
       // doesn't automatically terminate when SkipWithError is set.
       assert(++It != End);
-      break; // Required behavior
+      break;  // Required behavior
     }
   }
 }
@@ -133,8 +138,6 @@ ADD_CASES("BM_error_during_running_ranged_for",
           {{"/1/iterations:5", true, "error message"},
            {"/2/iterations:5", false, ""}});
 
-
-
 void BM_error_after_running(benchmark::State& state) {
   for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
diff --git a/test/state_assembly_test.cc b/test/state_assembly_test.cc
new file mode 100644
index 0000000000..7ddbb3b2a9
--- /dev/null
+++ b/test/state_assembly_test.cc
@@ -0,0 +1,68 @@
+#include <benchmark/benchmark.h>
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wreturn-type"
+#endif
+
+// clang-format off
+extern "C" {
+  extern int ExternInt;
+  benchmark::State& GetState();
+  void Fn();
+}
+// clang-format on
+
+using benchmark::State;
+
+// CHECK-LABEL: test_for_auto_loop:
+extern "C" int test_for_auto_loop() {
+  State& S = GetState();
+  int x = 42;
+  // CHECK: 	[[CALL:call(q)*]]	_ZN9benchmark5State16StartKeepRunningEv
+  // CHECK-NEXT: testq %rbx, %rbx
+  // CHECK-NEXT: je [[LOOP_END:.*]]
+
+  for (auto _ : S) {
+    // CHECK: .L[[LOOP_HEAD:[a-zA-Z0-9_]+]]:
+    // CHECK-GNU-NEXT: subq $1, %rbx
+    // CHECK-CLANG-NEXT: {{(addq \$1, %rax|incq %rax|addq \$-1, %rbx)}}
+    // CHECK-NEXT: jne .L[[LOOP_HEAD]]
+    benchmark::DoNotOptimize(x);
+  }
+  // CHECK: [[LOOP_END]]:
+  // CHECK: [[CALL]]	_ZN9benchmark5State17FinishKeepRunningEv
+
+  // CHECK: movl $101, %eax
+  // CHECK: ret
+  return 101;
+}
+
+// CHECK-LABEL: test_while_loop:
+extern "C" int test_while_loop() {
+  State& S = GetState();
+  int x = 42;
+
+  // CHECK: j{{(e|mp)}} .L[[LOOP_HEADER:[a-zA-Z0-9_]+]]
+  // CHECK-NEXT: .L[[LOOP_BODY:[a-zA-Z0-9_]+]]:
+  while (S.KeepRunning()) {
+    // CHECK-GNU-NEXT: subq $1, %[[IREG:[a-z]+]]
+    // CHECK-CLANG-NEXT: {{(addq \$-1,|decq)}} %[[IREG:[a-z]+]]
+    // CHECK: movq %[[IREG]], [[DEST:.*]]
+    benchmark::DoNotOptimize(x);
+  }
+  // CHECK-DAG: movq [[DEST]], %[[IREG]]
+  // CHECK-DAG: testq %[[IREG]], %[[IREG]]
+  // CHECK-DAG: jne .L[[LOOP_BODY]]
+  // CHECK-DAG: .L[[LOOP_HEADER]]:
+
+  // CHECK: cmpb $0
+  // CHECK-NEXT: jne .L[[LOOP_END:[a-zA-Z0-9_]+]]
+  // CHECK: [[CALL:call(q)*]] _ZN9benchmark5State16StartKeepRunningEv
+
+  // CHECK: .L[[LOOP_END]]:
+  // CHECK: [[CALL]] _ZN9benchmark5State17FinishKeepRunningEv
+
+  // CHECK: movl $101, %eax
+  // CHECK: ret
+  return 101;
+}
diff --git a/test/statistics_gtest.cc b/test/statistics_gtest.cc
new file mode 100644
index 0000000000..3ddc72dd7a
--- /dev/null
+++ b/test/statistics_gtest.cc
@@ -0,0 +1,28 @@
+//===---------------------------------------------------------------------===//
+// statistics_test - Unit tests for src/statistics.cc
+//===---------------------------------------------------------------------===//
+
+#include "../src/statistics.h"
+#include "gtest/gtest.h"
+
+namespace {
+TEST(StatisticsTest, Mean) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({42, 42, 42, 42}), 42.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 3, 4}), 2.5);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 5, 10, 10, 14}), 7.0);
+}
+
+TEST(StatisticsTest, Median) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({42, 42, 42, 42}), 42.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 3, 4}), 2.5);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 5, 10, 10}), 5.0);
+}
+
+TEST(StatisticsTest, StdDev) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({101, 101, 101, 101}), 0.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({1, 2, 3}), 1.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({2.5, 2.4, 3.3, 4.2, 5.1}),
+                   1.151086443322134);
+}
+
+}  // end namespace
diff --git a/test/statistics_test.cc b/test/statistics_test.cc
deleted file mode 100644
index b4d6abbb57..0000000000
--- a/test/statistics_test.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-//===---------------------------------------------------------------------===//
-// statistics_test - Unit tests for src/statistics.cc
-//===---------------------------------------------------------------------===//
-
-#include "../src/statistics.h"
-#include "gtest/gtest.h"
-
-namespace {
-TEST(StatisticsTest, Mean) {
-  std::vector<double> Inputs;
-  {
-    Inputs = {42, 42, 42, 42};
-    double Res = benchmark::StatisticsMean(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 42.0);
-  }
-  {
-    Inputs = {1, 2, 3, 4};
-    double Res = benchmark::StatisticsMean(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 2.5);
-  }
-  {
-    Inputs = {1, 2, 5, 10, 10, 14};
-    double Res = benchmark::StatisticsMean(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 7.0);
-  }
-}
-
-TEST(StatisticsTest, Median) {
-  std::vector<double> Inputs;
-  {
-    Inputs = {42, 42, 42, 42};
-    double Res = benchmark::StatisticsMedian(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 42.0);
-  }
-  {
-    Inputs = {1, 2, 3, 4};
-    double Res = benchmark::StatisticsMedian(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 2.5);
-  }
-  {
-    Inputs = {1, 2, 5, 10, 10};
-    double Res = benchmark::StatisticsMedian(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 5.0);
-  }
-}
-
-TEST(StatisticsTest, StdDev) {
-  std::vector<double> Inputs;
-  {
-    Inputs = {101, 101, 101, 101};
-    double Res = benchmark::StatisticsStdDev(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 0.0);
-  }
-  {
-    Inputs = {1, 2, 3};
-    double Res = benchmark::StatisticsStdDev(Inputs);
-    EXPECT_DOUBLE_EQ(Res, 1.0);
-  }
-}
-
-}  // end namespace
diff --git a/test/string_util_gtest.cc b/test/string_util_gtest.cc
new file mode 100644
index 0000000000..c7061b409e
--- /dev/null
+++ b/test/string_util_gtest.cc
@@ -0,0 +1,161 @@
+//===---------------------------------------------------------------------===//
+// statistics_test - Unit tests for src/statistics.cc
+//===---------------------------------------------------------------------===//
+
+#include "../src/string_util.h"
+#include "../src/internal_macros.h"
+#include "gtest/gtest.h"
+
+namespace {
+TEST(StringUtilTest, stoul) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0ul, benchmark::stoul("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(7ul, benchmark::stoul("7", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(135ul, benchmark::stoul("135", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+#if ULONG_MAX == 0xFFFFFFFFul
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xFFFFFFFFul, benchmark::stoul("4294967295", &pos));
+    EXPECT_EQ(10ul, pos);
+  }
+#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul, benchmark::stoul("18446744073709551615", &pos));
+    EXPECT_EQ(20ul, pos);
+  }
+#endif
+  {
+    size_t pos = 0;
+    EXPECT_EQ(10ul, benchmark::stoul("1010", &pos, 2));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(520ul, benchmark::stoul("1010", &pos, 8));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1010ul, benchmark::stoul("1010", &pos, 10));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(4112ul, benchmark::stoul("1010", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  {
+    ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument);
+  }
+#endif
+}
+
+TEST(StringUtilTest, stoi) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0, benchmark::stoi("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  {
+    ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument);
+  }
+#endif
+}
+
+TEST(StringUtilTest, stod) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0.0, benchmark::stod("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    /* Note: exactly representable as double */
+    EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
+    EXPECT_EQ(8ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  {
+    ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument);
+  }
+#endif
+}
+
+TEST(StringUtilTest, StrSplit) {
+  EXPECT_EQ(benchmark::StrSplit("", ','), std::vector<std::string>{});
+  EXPECT_EQ(benchmark::StrSplit("hello", ','),
+            std::vector<std::string>({"hello"}));
+  EXPECT_EQ(benchmark::StrSplit("hello,there,is,more", ','),
+            std::vector<std::string>({"hello", "there", "is", "more"}));
+}
+
+}  // end namespace
diff --git a/test/templated_fixture_test.cc b/test/templated_fixture_test.cc
index ec5b4c0cc0..fe9865cc77 100644
--- a/test/templated_fixture_test.cc
+++ b/test/templated_fixture_test.cc
@@ -4,15 +4,15 @@
 #include <cassert>
 #include <memory>
 
-template<typename T>
+template <typename T>
 class MyFixture : public ::benchmark::Fixture {
-public:
+ public:
   MyFixture() : data(0) {}
 
   T data;
 };
 
-BENCHMARK_TEMPLATE_F(MyFixture, Foo, int)(benchmark::State &st) {
+BENCHMARK_TEMPLATE_F(MyFixture, Foo, int)(benchmark::State& st) {
   for (auto _ : st) {
     data += 1;
   }
diff --git a/test/user_counters_tabular_test.cc b/test/user_counters_tabular_test.cc
index 9b8a6132e6..421f27b5cb 100644
--- a/test/user_counters_tabular_test.cc
+++ b/test/user_counters_tabular_test.cc
@@ -7,17 +7,23 @@
 // @todo: <jpmag> this checks the full output at once; the rule for
 // CounterSet1 was failing because it was not matching "^[-]+$".
 // @todo: <jpmag> check that the counters are vertically aligned.
-ADD_CASES(TC_ConsoleOut, {
-// keeping these lines long improves readability, so:
-// clang-format off
+ADD_CASES(TC_ConsoleOut,
+          {
+              // keeping these lines long improves readability, so:
+              // clang-format off
     {"^[-]+$", MR_Next},
     {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next},
     {"^[-]+$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
     {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
@@ -44,8 +50,8 @@ ADD_CASES(TC_ConsoleOut, {
     {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
     {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
     {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"},
-// clang-format on
-});
+              // clang-format on
+          });
 ADD_CASES(TC_CSVOut, {{"%csv_header,"
                        "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}});
 
@@ -58,29 +64,235 @@ void BM_Counters_Tabular(benchmark::State& state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
-    {"Foo",  { 1, bm::Counter::kAvgThreads}},
-    {"Bar",  { 2, bm::Counter::kAvgThreads}},
-    {"Baz",  { 4, bm::Counter::kAvgThreads}},
-    {"Bat",  { 8, bm::Counter::kAvgThreads}},
-    {"Frob", {16, bm::Counter::kAvgThreads}},
-    {"Lob",  {32, bm::Counter::kAvgThreads}},
+      {"Foo", {1, bm::Counter::kAvgThreads}},
+      {"Bar", {2, bm::Counter::kAvgThreads}},
+      {"Baz", {4, bm::Counter::kAvgThreads}},
+      {"Bat", {8, bm::Counter::kAvgThreads}},
+      {"Frob", {16, bm::Counter::kAvgThreads}},
+      {"Lob", {32, bm::Counter::kAvgThreads}},
   });
 }
-BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Bat\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float,$", MR_Next},
-                       {"\"Frob\": %float,$", MR_Next},
-                       {"\"Lob\": %float$", MR_Next},
-                       {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report,"
-                       "%float,%float,%float,%float,%float,%float$"}});
+BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 2)->Repetitions(2);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_mean\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_median\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_stddev\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_mean\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_median\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_stddev\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckTabular(Results const& e) {
@@ -91,7 +303,10 @@ void CheckTabular(Results const& e) {
   CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16);
   CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32);
 }
-CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:1$",
+                        &CheckTabular);
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:2$",
+                        &CheckTabular);
 
 // ========================================================================= //
 // -------------------- Tabular+Rate Counters Output ----------------------- //
@@ -99,42 +314,53 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
 
 void BM_CounterRates_Tabular(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters.insert({
-    {"Foo",  { 1, bm::Counter::kAvgThreadsRate}},
-    {"Bar",  { 2, bm::Counter::kAvgThreadsRate}},
-    {"Baz",  { 4, bm::Counter::kAvgThreadsRate}},
-    {"Bat",  { 8, bm::Counter::kAvgThreadsRate}},
-    {"Frob", {16, bm::Counter::kAvgThreadsRate}},
-    {"Lob",  {32, bm::Counter::kAvgThreadsRate}},
+      {"Foo", {1, bm::Counter::kAvgThreadsRate}},
+      {"Bar", {2, bm::Counter::kAvgThreadsRate}},
+      {"Baz", {4, bm::Counter::kAvgThreadsRate}},
+      {"Bat", {8, bm::Counter::kAvgThreadsRate}},
+      {"Frob", {16, bm::Counter::kAvgThreadsRate}},
+      {"Lob", {32, bm::Counter::kAvgThreadsRate}},
   });
 }
 BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Bat\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float,$", MR_Next},
-                       {"\"Frob\": %float,$", MR_Next},
-                       {"\"Lob\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
+           {"\"family_index\": 1,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report,"
                        "%float,%float,%float,%float,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckTabularRate(Results const& e) {
   double t = e.DurationCPUTime();
-  CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1./t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2./t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4./t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8./t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16./t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32. / t, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int",
                         &CheckTabularRate);
@@ -149,21 +375,29 @@ void BM_CounterSet0_Tabular(benchmark::State& state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
-    {"Foo", {10, bm::Counter::kAvgThreads}},
-    {"Bar", {20, bm::Counter::kAvgThreads}},
-    {"Baz", {40, bm::Counter::kAvgThreads}},
+      {"Foo", {10, bm::Counter::kAvgThreads}},
+      {"Bar", {20, bm::Counter::kAvgThreads}},
+      {"Baz", {40, bm::Counter::kAvgThreads}},
   });
 }
 BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
+           {"\"family_index\": 2,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report,"
                        "%float,,%float,%float,,"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -181,21 +415,29 @@ void BM_CounterSet1_Tabular(benchmark::State& state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
-    {"Foo", {15, bm::Counter::kAvgThreads}},
-    {"Bar", {25, bm::Counter::kAvgThreads}},
-    {"Baz", {45, bm::Counter::kAvgThreads}},
+      {"Foo", {15, bm::Counter::kAvgThreads}},
+      {"Bar", {25, bm::Counter::kAvgThreads}},
+      {"Baz", {45, bm::Counter::kAvgThreads}},
   });
 }
 BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
+           {"\"family_index\": 3,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report,"
                        "%float,,%float,%float,,"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -217,21 +459,29 @@ void BM_CounterSet2_Tabular(benchmark::State& state) {
   }
   namespace bm = benchmark;
   state.counters.insert({
-    {"Foo", {10, bm::Counter::kAvgThreads}},
-    {"Bat", {30, bm::Counter::kAvgThreads}},
-    {"Baz", {40, bm::Counter::kAvgThreads}},
+      {"Foo", {10, bm::Counter::kAvgThreads}},
+      {"Bat", {30, bm::Counter::kAvgThreads}},
+      {"Baz", {40, bm::Counter::kAvgThreads}},
   });
 }
 BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bat\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report,"
                        ",%float,%float,%float,,"}});
 // VS2013 does not allow this function to be passed as a lambda argument
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
index 06aafb1fa1..377bb32ca9 100644
--- a/test/user_counters_test.cc
+++ b/test/user_counters_test.cc
@@ -8,12 +8,16 @@
 // ---------------------- Testing Prologue Output -------------------------- //
 // ========================================================================= //
 
+// clang-format off
+
 ADD_CASES(TC_ConsoleOut,
           {{"^[-]+$", MR_Next},
            {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next},
            {"^[-]+$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}});
 
+// clang-format on
+
 // ========================================================================= //
 // ------------------------- Simple Counters Output ------------------------ //
 // ========================================================================= //
@@ -25,8 +29,16 @@ void BM_Counters_Simple(benchmark::State& state) {
   state.counters["bar"] = 2 * (double)state.iterations();
 }
 BENCHMARK(BM_Counters_Simple);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Counters_Simple\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -38,10 +50,10 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckSimple(Results const& e) {
-  double its = e.GetAs< double >("iterations");
+  double its = e.NumIterations();
   CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
   // check that the value of bar is within 0.1% of the expected value
-  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2.*its, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * its, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple);
 
@@ -49,9 +61,13 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple);
 // --------------------- Counters+Items+Bytes/s Output --------------------- //
 // ========================================================================= //
 
-namespace { int num_calls1 = 0; }
+namespace {
+int num_calls1 = 0;
+}
 void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   state.counters["foo"] = 1;
   state.counters["bar"] = ++num_calls1;
@@ -59,30 +75,38 @@ void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
   state.SetItemsProcessed(150);
 }
 BENCHMARK(BM_Counters_WithBytesAndItemsPSec);
-ADD_CASES(TC_ConsoleOut,
-          {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
-            "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bytes_per_second\": %float,$", MR_Next},
-                       {"\"items_per_second\": %float,$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
+                           "bar=%hrfloat bytes_per_second=%hrfloat/s "
+                           "foo=%hrfloat items_per_second=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+           {"\"family_index\": 1,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"bytes_per_second\": %float,$", MR_Next},
+           {"\"foo\": %float,$", MR_Next},
+           {"\"items_per_second\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\","
                        "%csv_bytes_items_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckBytesAndItemsPSec(Results const& e) {
-  double t = e.DurationCPUTime(); // this (and not real time) is the time used
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
   CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
   CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1);
   // check that the values are within 0.1% of the expected values
-  CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364./t, 0.001);
-  CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150./t, 0.001);
+  CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364. / t, 0.001);
+  CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150. / t, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
                         &CheckBytesAndItemsPSec);
@@ -93,14 +117,25 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
 
 void BM_Counters_Rate(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
   state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate};
 }
 BENCHMARK(BM_Counters_Rate);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(
+    TC_ConsoleOut,
+    {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+                       {"\"family_index\": 2,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Counters_Rate\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -112,26 +147,37 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckRate(Results const& e) {
-  double t = e.DurationCPUTime(); // this (and not real time) is the time used
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
   // check that the values are within 0.1% of the expected values
-  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
 
 // ========================================================================= //
-// ------------------------- Thread Counters Output ------------------------ //
+// ----------------------- Inverted Counters Output ------------------------ //
 // ========================================================================= //
 
-void BM_Counters_Threads(benchmark::State& state) {
+void BM_Invert(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
-  state.counters["foo"] = 1;
-  state.counters["bar"] = 2;
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{0.0001, bm::Counter::kInvert};
+  state.counters["bar"] = bm::Counter{10000, bm::Counter::kInvert};
 }
-BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+BENCHMARK(BM_Invert);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Invert %console_report bar=%hrfloatu foo=%hrfloatk$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Invert\",$"},
+                       {"\"family_index\": 3,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_Invert\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -139,7 +185,94 @@ ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
                        {"\"bar\": %float,$", MR_Next},
                        {"\"foo\": %float$", MR_Next},
                        {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Invert\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckInvert(Results const& e) {
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 10000, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 0.0001, 0.0001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Invert", &CheckInvert);
+
+// ========================================================================= //
+// ------------------------- InvertedRate Counters Output
+// -------------------------- //
+// ========================================================================= //
+
+void BM_Counters_InvertedRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] =
+      bm::Counter{1, bm::Counter::kIsRate | bm::Counter::kInvert};
+  state.counters["bar"] =
+      bm::Counter{8192, bm::Counter::kIsRate | bm::Counter::kInvert};
+}
+BENCHMARK(BM_Counters_InvertedRate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_InvertedRate %console_report "
+                           "bar=%hrfloats foo=%hrfloats$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_InvertedRate\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_InvertedRate\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_InvertedRate\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckInvertedRate(Results const& e) {
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, t / 8192.0, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_InvertedRate", &CheckInvertedRate);
+
+// ========================================================================= //
+// ------------------------- Thread Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Threads(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2;
+}
+BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+           {"\"family_index\": 5,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckThreads(Results const& e) {
@@ -160,16 +293,27 @@ void BM_Counters_AvgThreads(benchmark::State& state) {
   state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads};
 }
 BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int "
+                           "%console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+           {"\"family_index\": 6,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckAvgThreads(Results const& e) {
@@ -185,31 +329,225 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
 
 void BM_Counters_AvgThreadsRate(benchmark::State& state) {
   for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
   state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate};
 }
 BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/threads:%int\",%csv_report,%float,%float$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+           {"\"family_index\": 7,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/"
+                       "threads:%int\",%csv_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckAvgThreadsRate(Results const& e) {
-  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./e.DurationCPUTime(), 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./e.DurationCPUTime(), 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / e.DurationCPUTime(), 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / e.DurationCPUTime(), 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
                         &CheckAvgThreadsRate);
 
+// ========================================================================= //
+// ------------------- IterationInvariant Counters Output ------------------ //
+// ========================================================================= //
+
+void BM_Counters_IterationInvariant(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kIsIterationInvariant};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kIsIterationInvariant};
+}
+BENCHMARK(BM_Counters_IterationInvariant);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_IterationInvariant %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_IterationInvariant\",$"},
+           {"\"family_index\": 8,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_IterationInvariant\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckIterationInvariant(Results const& e) {
+  double its = e.NumIterations();
+  // check that the values are within 0.1% of the expected value
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * its, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant",
+                        &CheckIterationInvariant);
+
+// ========================================================================= //
+// ----------------- IterationInvariantRate Counters Output ---------------- //
+// ========================================================================= //
+
+void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] =
+      bm::Counter{1, bm::Counter::kIsIterationInvariantRate};
+  state.counters["bar"] =
+      bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kIsIterationInvariant};
+}
+BENCHMARK(BM_Counters_kIsIterationInvariantRate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kIsIterationInvariantRate "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"},
+           {"\"family_index\": 9,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kIsIterationInvariantRate\",%csv_report,"
+                       "%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckIsIterationInvariantRate(Results const& e) {
+  double its = e.NumIterations();
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its * 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, its * 2. / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate",
+                        &CheckIsIterationInvariantRate);
+
+// ========================================================================= //
+// ------------------- AvgIterations Counters Output ------------------ //
+// ========================================================================= //
+
+void BM_Counters_AvgIterations(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterations};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgIterations};
+}
+BENCHMARK(BM_Counters_AvgIterations);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgIterations %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgIterations\",$"},
+           {"\"family_index\": 10,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_AvgIterations\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckAvgIterations(Results const& e) {
+  double its = e.NumIterations();
+  // check that the values are within 0.1% of the expected value
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / its, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / its, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations);
+
+// ========================================================================= //
+// ----------------- AvgIterationsRate Counters Output ---------------- //
+// ========================================================================= //
+
+void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate};
+  state.counters["bar"] =
+      bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kAvgIterations};
+}
+BENCHMARK(BM_Counters_kAvgIterationsRate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"},
+           {"\"family_index\": 11,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kAvgIterationsRate\",%csv_report,"
+                       "%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckAvgIterationsRate(Results const& e) {
+  double its = e.NumIterations();
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / its / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / its / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_kAvgIterationsRate",
+                        &CheckAvgIterationsRate);
+
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
 // ========================================================================= //
diff --git a/test/user_counters_thousands_test.cc b/test/user_counters_thousands_test.cc
new file mode 100644
index 0000000000..bbe194264e
--- /dev/null
+++ b/test/user_counters_thousands_test.cc
@@ -0,0 +1,183 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------ Thousands Customisation ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Thousands(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"t0_1000000DefaultBase",
+       bm::Counter(1000 * 1000, bm::Counter::kDefaults)},
+      {"t1_1000000Base1000", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1000)},
+      {"t2_1000000Base1024", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1024)},
+      {"t3_1048576Base1000", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1000)},
+      {"t4_1048576Base1024", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1024)},
+  });
+}
+BENCHMARK(BM_Counters_Thousands)->Repetitions(2);
+ADD_CASES(
+    TC_ConsoleOut,
+    {
+        {"^BM_Counters_Thousands/repeats:2 %console_report "
+         "t0_1000000DefaultBase=1000k "
+         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
+         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2 %console_report "
+         "t0_1000000DefaultBase=1000k "
+         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
+         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_mean %console_report "
+         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
+         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
+         "t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_median %console_report "
+         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
+         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
+         "t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_stddev %console_time_only_report [ "
+         "]*2 t0_1000000DefaultBase=0 t1_1000000Base1000=0 "
+         "t2_1000000Base1024=0 t3_1048576Base1000=0 t4_1048576Base1024=0$"},
+    });
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t1_1000000Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t2_1000000Base1024\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t3_1048576Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t4_1048576Base1024\": 0\\.(0)*e\\+(0)*$", MR_Next},
+           {"}", MR_Next}});
+
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_Thousands/"
+      "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+("
+      "0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+("
+      "0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2_mean\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\."
+      "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2_median\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\."
+      "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/repeats:2_stddev\",%csv_report,0,0,0,0,0$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckThousands(Results const& e) {
+  if (e.name != "BM_Counters_Thousands/repeats:2")
+    return;  // Do not check the aggregates!
+
+  // check that the values are within 0.01% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "t0_1000000DefaultBase", EQ, 1000 * 1000,
+                            0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t1_1000000Base1000", EQ, 1000 * 1000, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t2_1000000Base1024", EQ, 1000 * 1000, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t3_1048576Base1000", EQ, 1024 * 1024, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t4_1048576Base1024", EQ, 1024 * 1024, 0.0001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Thousands", &CheckThousands);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/tools/BUILD.bazel b/tools/BUILD.bazel
new file mode 100644
index 0000000000..5895883a2e
--- /dev/null
+++ b/tools/BUILD.bazel
@@ -0,0 +1,19 @@
+load("@py_deps//:requirements.bzl", "requirement")
+
+py_library(
+    name = "gbench",
+    srcs = glob(["gbench/*.py"]),
+    deps = [
+      requirement("numpy"),
+      requirement("scipy"),
+    ],
+)
+
+py_binary(
+    name = "compare",
+    srcs = ["compare.py"],
+    python_version = "PY2",
+    deps = [
+        ":gbench",
+    ],
+)
diff --git a/tools/compare.py b/tools/compare.py
index c4a47e8d50..01d2c89f50 100755
--- a/tools/compare.py
+++ b/tools/compare.py
@@ -1,11 +1,13 @@
 #!/usr/bin/env python
 
+import unittest
 """
 compare.py - versatile benchmark output compare tool
 """
 
 import argparse
 from argparse import ArgumentParser
+import json
 import sys
 import gbench
 from gbench import util, report
@@ -35,6 +37,48 @@ def check_inputs(in1, in2, flags):
 def create_parser():
     parser = ArgumentParser(
         description='versatile benchmark output compare tool')
+
+    parser.add_argument(
+        '-a',
+        '--display_aggregates_only',
+        dest='display_aggregates_only',
+        action="store_true",
+        help="If there are repetitions, by default, we display everything - the"
+             " actual runs, and the aggregates computed. Sometimes, it is "
+             "desirable to only view the aggregates. E.g. when there are a lot "
+             "of repetitions. Do note that only the display is affected. "
+             "Internally, all the actual runs are still used, e.g. for U test.")
+
+    parser.add_argument(
+        '--no-color',
+        dest='color',
+        default=True,
+        action="store_false",
+        help="Do not use colors in the terminal output"
+    )
+
+    parser.add_argument(
+        '-d',
+        '--dump_to_json',
+        dest='dump_to_json',
+        help="Additionally, dump benchmark comparison output to this file in JSON format.")
+
+    utest = parser.add_argument_group()
+    utest.add_argument(
+        '--no-utest',
+        dest='utest',
+        default=True,
+        action="store_false",
+        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
+    alpha_default = 0.05
+    utest.add_argument(
+        "--alpha",
+        dest='utest_alpha',
+        default=alpha_default,
+        type=float,
+        help=("significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") %
+        alpha_default)
+
     subparsers = parser.add_subparsers(
         help='This tool has multiple modes of operation:',
         dest='mode')
@@ -138,6 +182,9 @@ def main():
     # Parse the command line flags
     parser = create_parser()
     args, unknown_args = parser.parse_known_args()
+    if args.mode is None:
+        parser.print_help()
+        exit(1)
     assert not unknown_args
     benchmark_options = args.benchmark_options
 
@@ -175,10 +222,14 @@ def main():
     else:
         # should never happen
         print("Unrecognized mode of operation: '%s'" % args.mode)
+        parser.print_help()
         exit(1)
 
     check_inputs(test_baseline, test_contender, benchmark_options)
 
+    if args.display_aggregates_only:
+        benchmark_options += ['--benchmark_display_aggregates_only=true']
+
     options_baseline = []
     options_contender = []
 
@@ -187,10 +238,10 @@ def main():
         options_contender = ['--benchmark_filter=%s' % filter_contender]
 
     # Run the benchmarks and report the results
-    json1 = json1_orig = gbench.util.run_or_load_benchmark(
-        test_baseline, benchmark_options + options_baseline)
-    json2 = json2_orig = gbench.util.run_or_load_benchmark(
-        test_contender, benchmark_options + options_contender)
+    json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_baseline, benchmark_options + options_baseline))
+    json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_contender, benchmark_options + options_contender))
 
     # Now, filter the benchmarks so that the difference report can work
     if filter_baseline and filter_contender:
@@ -200,15 +251,20 @@ def main():
         json2 = gbench.report.filter_benchmark(
             json2_orig, filter_contender, replacement)
 
-    # Diff and output
-    output_lines = gbench.report.generate_difference_report(json1, json2)
+    diff_report = gbench.report.get_difference_report(
+        json1, json2, args.utest)
+    output_lines = gbench.report.print_difference_report(
+        diff_report,
+        args.display_aggregates_only,
+        args.utest, args.utest_alpha, args.color)
     print(description)
     for ln in output_lines:
         print(ln)
 
-
-import unittest
-
+    # Optionally, diff and output to JSON
+    if args.dump_to_json is not None:
+        with open(args.dump_to_json, 'w') as f_json:
+            json.dump(diff_report, f_json)
 
 class TestParser(unittest.TestCase):
     def setUp(self):
@@ -218,12 +274,57 @@ def setUp(self):
                 os.path.realpath(__file__)),
             'gbench',
             'Inputs')
-        self.testInput0 = os.path.join(testInputs, 'test_baseline_run1.json')
-        self.testInput1 = os.path.join(testInputs, 'test_baseline_run2.json')
+        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
+        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')
 
     def test_benchmarks_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_without_utest(self):
+        parsed = self.parser.parse_args(
+            ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertFalse(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.05)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_display_aggregates_only(self):
+        parsed = self.parser.parse_args(
+            ['-a', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertTrue(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_with_utest_alpha(self):
+        parsed = self.parser.parse_args(
+            ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.314)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
+        parsed = self.parser.parse_args(
+            ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertFalse(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.314)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -232,6 +333,8 @@ def test_benchmarks_basic(self):
     def test_benchmarks_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, 'd'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -240,6 +343,8 @@ def test_benchmarks_with_remainder(self):
     def test_benchmarks_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -248,6 +353,8 @@ def test_benchmarks_with_remainder_after_doubleminus(self):
     def test_filters_basic(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -257,6 +364,8 @@ def test_filters_basic(self):
     def test_filters_with_remainder(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -266,6 +375,8 @@ def test_filters_with_remainder(self):
     def test_filters_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', '--', 'f'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -275,6 +386,8 @@ def test_filters_with_remainder_after_doubleminus(self):
     def test_benchmarksfiltered_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -285,6 +398,8 @@ def test_benchmarksfiltered_basic(self):
     def test_benchmarksfiltered_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -295,6 +410,8 @@ def test_benchmarksfiltered_with_remainder(self):
     def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
diff --git a/tools/compare_bench.py b/tools/compare_bench.py
deleted file mode 100755
index 7bbf0d0157..0000000000
--- a/tools/compare_bench.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env python
-"""
-compare_bench.py - Compare two benchmarks or their results and report the
-                   difference.
-"""
-import argparse
-from argparse import ArgumentParser
-import sys
-import gbench
-from gbench import util, report
-from gbench.util import *
-
-def check_inputs(in1, in2, flags):
-    """
-    Perform checking on the user provided inputs and diagnose any abnormalities
-    """
-    in1_kind, in1_err = classify_input_file(in1)
-    in2_kind, in2_err = classify_input_file(in2)
-    output_file = find_benchmark_flag('--benchmark_out=', flags)
-    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
-    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
-        print(("WARNING: '--benchmark_out=%s' will be passed to both "
-              "benchmarks causing it to be overwritten") % output_file)
-    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
-        print("WARNING: passing --benchmark flags has no effect since both "
-              "inputs are JSON")
-    if output_type is not None and output_type != 'json':
-        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare_bench.py`"
-              " is not supported.") % output_type)
-        sys.exit(1)
-
-
-def main():
-    parser = ArgumentParser(
-        description='compare the results of two benchmarks')
-    parser.add_argument(
-        'test1', metavar='test1', type=str, nargs=1,
-        help='A benchmark executable or JSON output file')
-    parser.add_argument(
-        'test2', metavar='test2', type=str, nargs=1,
-        help='A benchmark executable or JSON output file')
-    parser.add_argument(
-        'benchmark_options', metavar='benchmark_options', nargs=argparse.REMAINDER,
-        help='Arguments to pass when running benchmark executables'
-    )
-    args, unknown_args = parser.parse_known_args()
-    # Parse the command line flags
-    test1 = args.test1[0]
-    test2 = args.test2[0]
-    if unknown_args:
-        # should never happen
-        print("Unrecognized positional argument arguments: '%s'"
-              % unknown_args)
-        exit(1)
-    benchmark_options = args.benchmark_options
-    check_inputs(test1, test2, benchmark_options)
-    # Run the benchmarks and report the results
-    json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options)
-    json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options)
-    output_lines = gbench.report.generate_difference_report(json1, json2)
-    print('Comparing %s to %s' % (test1, test2))
-    for ln in output_lines:
-        print(ln)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tools/gbench/Inputs/test1_run1.json b/tools/gbench/Inputs/test1_run1.json
index d7ec6a9c8f..601e327aef 100644
--- a/tools/gbench/Inputs/test1_run1.json
+++ b/tools/gbench/Inputs/test1_run1.json
@@ -85,7 +85,24 @@
       "time_unit": "ns"
     },
     {
-      "name": "BM_BadTimeUnit",
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 4.2749856294592886e+00,
+      "real_coefficient": 6.4789275289789780e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 4.5097802512472874e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
       "iterations": 1000,
       "real_time": 0.4,
       "cpu_time": 0.5,
diff --git a/tools/gbench/Inputs/test1_run2.json b/tools/gbench/Inputs/test1_run2.json
index 59a5ffaca4..3cbcf39b0c 100644
--- a/tools/gbench/Inputs/test1_run2.json
+++ b/tools/gbench/Inputs/test1_run2.json
@@ -85,7 +85,24 @@
       "time_unit": "ns"
     },
     {
-      "name": "BM_BadTimeUnit",
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 5.6215779594361486e+00,
+      "real_coefficient": 5.6288314793554610e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 3.3128901852342174e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
       "iterations": 1000,
       "real_time": 0.04,
       "cpu_time": 0.6,
diff --git a/tools/gbench/Inputs/test3_run0.json b/tools/gbench/Inputs/test3_run0.json
new file mode 100644
index 0000000000..49f8b06143
--- /dev/null
+++ b/tools/gbench/Inputs/test3_run0.json
@@ -0,0 +1,65 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 86,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 77,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 82,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test3_run1.json b/tools/gbench/Inputs/test3_run1.json
new file mode 100644
index 0000000000..acc5ba17ae
--- /dev/null
+++ b/tools/gbench/Inputs/test3_run1.json
@@ -0,0 +1,65 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 110,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 89,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 7,
+      "cpu_time": 72,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 7,
+      "cpu_time": 75,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 762,
+      "real_time": 4.54,
+      "cpu_time": 66.6,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 800,
+      "cpu_time": 1,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1200,
+      "real_time": 5,
+      "cpu_time": 53,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test4_run.json b/tools/gbench/Inputs/test4_run.json
new file mode 100644
index 0000000000..eaa005f3a9
--- /dev/null
+++ b/tools/gbench/Inputs/test4_run.json
@@ -0,0 +1,96 @@
+{
+  "benchmarks": [
+    {
+      "name": "99 family 0 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "98 family 0 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "97 family 0 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "96 family 0 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "95 family 0 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "94 family 0 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+
+
+    {
+      "name": "93 family 1 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "92 family 1 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "91 family 1 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "90 family 1 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "89 family 1 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "88 family 1 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    }
+  ]
+}
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index 8d68fe96ee..6bea82f6bf 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -1,8 +1,14 @@
 """report.py - Utilities for reporting statistics about benchmark results
 """
+
+import unittest
 import os
 import re
 import copy
+import random
+
+from scipy.stats import mannwhitneyu
+
 
 class BenchmarkColor(object):
     def __init__(self, name, code):
@@ -16,11 +22,13 @@ def __repr__(self):
     def __format__(self, format):
         return self.code
 
+
 # Benchmark Colors Enumeration
 BC_NONE = BenchmarkColor('NONE', '')
 BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
 BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
 BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
+BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
 BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
 BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
 BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
@@ -29,6 +37,11 @@ def __format__(self, format):
 BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
 BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
 
+UTEST_MIN_REPETITIONS = 2
+UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number, More is better.
+UTEST_COL_NAME = "_pvalue"
+
+
 def color_format(use_color, fmt_str, *args, **kwargs):
     """
     Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
@@ -78,73 +91,302 @@ def filter_benchmark(json_orig, family, replacement=""):
     for be in json_orig['benchmarks']:
         if not regex.search(be['name']):
             continue
-        filteredbench = copy.deepcopy(be) # Do NOT modify the old name!
+        filteredbench = copy.deepcopy(be)  # Do NOT modify the old name!
         filteredbench['name'] = regex.sub(replacement, filteredbench['name'])
         filtered['benchmarks'].append(filteredbench)
     return filtered
 
 
-def generate_difference_report(json1, json2, use_color=True):
+def get_unique_benchmark_names(json):
+    """
+    While *keeping* the order, give all the unique 'names' used for benchmarks.
+    """
+    seen = set()
+    uniqued = [x['name'] for x in json['benchmarks']
+               if x['name'] not in seen and
+               (seen.add(x['name']) or True)]
+    return uniqued
+
+
+def intersect(list1, list2):
+    """
+    Given two lists, get a new list consisting of the elements only contained
+    in *both of the input lists*, while preserving the ordering.
+    """
+    return [x for x in list1 if x in list2]
+
+
+def is_potentially_comparable_benchmark(x):
+    return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x)
+
+
+def partition_benchmarks(json1, json2):
+    """
+    While preserving the ordering, find benchmarks with the same names in
+    both of the inputs, and group them.
+    (i.e. partition/filter into groups with common name)
+    """
+    json1_unique_names = get_unique_benchmark_names(json1)
+    json2_unique_names = get_unique_benchmark_names(json2)
+    names = intersect(json1_unique_names, json2_unique_names)
+    partitions = []
+    for name in names:
+        time_unit = None
+        # Pick the time unit from the first entry of the lhs benchmark.
+        # We should be careful not to crash with unexpected input.
+        for x in json1['benchmarks']:
+            if (x['name'] == name and is_potentially_comparable_benchmark(x)):
+                time_unit = x['time_unit']
+                break
+        if time_unit is None:
+            continue
+        # Filter by name and time unit.
+        # All the repetitions are assumed to be comparable.
+        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
+               x['time_unit'] == time_unit]
+        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
+               x['time_unit'] == time_unit]
+        partitions.append([lhs, rhs])
+    return partitions
+
+
+def extract_field(partition, field_name):
+    # The count of elements may be different. We want *all* of them.
+    lhs = [x[field_name] for x in partition[0]]
+    rhs = [x[field_name] for x in partition[1]]
+    return [lhs, rhs]
+
+
+def calc_utest(timings_cpu, timings_time):
+    min_rep_cnt = min(len(timings_time[0]),
+                      len(timings_time[1]),
+                      len(timings_cpu[0]),
+                      len(timings_cpu[1]))
+
+    # Does *everything* has at least UTEST_MIN_REPETITIONS repetitions?
+    if min_rep_cnt < UTEST_MIN_REPETITIONS:
+        return False, None, None
+
+    time_pvalue = mannwhitneyu(
+        timings_time[0], timings_time[1], alternative='two-sided').pvalue
+    cpu_pvalue = mannwhitneyu(
+        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
+
+    return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
+
+def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
+    def get_utest_color(pval):
+        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
+
+    # Check if we failed miserably with minimum required repetitions for utest
+    if not utest['have_optimal_repetitions'] and utest['cpu_pvalue'] is None and utest['time_pvalue'] is None:
+        return []
+
+    dsc = "U Test, Repetitions: {} vs {}".format(
+        utest['nr_of_repetitions'], utest['nr_of_repetitions_other'])
+    dsc_color = BC_OKGREEN
+
+    # We still got some results to show but issue a warning about it.
+    if not utest['have_optimal_repetitions']:
+        dsc_color = BC_WARNING
+        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
+            UTEST_OPTIMAL_REPETITIONS)
+
+    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
+
+    return [color_format(use_color,
+                         special_str,
+                         BC_HEADER,
+                         "{}{}".format(bc_name, UTEST_COL_NAME),
+                         first_col_width,
+                         get_utest_color(
+                             utest['time_pvalue']), utest['time_pvalue'],
+                         get_utest_color(
+                             utest['cpu_pvalue']), utest['cpu_pvalue'],
+                         dsc_color, dsc,
+                         endc=BC_ENDC)]
+
+
+def get_difference_report(
+        json1,
+        json2,
+        utest=False):
+    """
+    Calculate and report the difference between each test of two benchmarks
+    runs specified as 'json1' and 'json2'. Output is another json containing
+    relevant details for each test run.
+    """
+    assert utest is True or utest is False
+
+    diff_report = []
+    partitions = partition_benchmarks(json1, json2)
+    for partition in partitions:
+        benchmark_name = partition[0][0]['name']
+        time_unit = partition[0][0]['time_unit']
+        measurements = []
+        utest_results = {}
+        # Careful, we may have different repetition count.
+        for i in range(min(len(partition[0]), len(partition[1]))):
+            bn = partition[0][i]
+            other_bench = partition[1][i]
+            measurements.append({
+                'real_time': bn['real_time'],
+                'cpu_time': bn['cpu_time'],
+                'real_time_other': other_bench['real_time'],
+                'cpu_time_other': other_bench['cpu_time'],
+                'time': calculate_change(bn['real_time'], other_bench['real_time']),
+                'cpu': calculate_change(bn['cpu_time'], other_bench['cpu_time'])
+            })
+
+        # After processing the whole partition, if requested, do the U test.
+        if utest:
+            timings_cpu = extract_field(partition, 'cpu_time')
+            timings_time = extract_field(partition, 'real_time')
+            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
+            if cpu_pvalue and time_pvalue:
+                utest_results = {
+                    'have_optimal_repetitions': have_optimal_repetitions,
+                    'cpu_pvalue': cpu_pvalue,
+                    'time_pvalue': time_pvalue,
+                    'nr_of_repetitions': len(timings_cpu[0]),
+                    'nr_of_repetitions_other': len(timings_cpu[1])
+                }
+
+        # Store only if we had any measurements for given benchmark.
+        # E.g. partition_benchmarks will filter out the benchmarks having
+        # time units which are not compatible with other time units in the
+        # benchmark suite.
+        if measurements:
+            run_type = partition[0][0]['run_type'] if 'run_type' in partition[0][0] else ''
+            aggregate_name = partition[0][0]['aggregate_name'] if run_type == 'aggregate' and 'aggregate_name' in partition[0][0] else ''
+            diff_report.append({
+                'name': benchmark_name,
+                'measurements': measurements,
+                'time_unit': time_unit,
+                'run_type': run_type,
+                'aggregate_name': aggregate_name,
+                'utest': utest_results
+            })
+
+    return diff_report
+
+
+def print_difference_report(
+        json_diff_report,
+        include_aggregates_only=False,
+        utest=False,
+        utest_alpha=0.05,
+        use_color=True):
     """
     Calculate and report the difference between each test of two benchmarks
     runs specified as 'json1' and 'json2'.
     """
-    first_col_width = find_longest_name(json1['benchmarks'])
-    def find_test(name):
-        for b in json2['benchmarks']:
-            if b['name'] == name:
-                return b
-        return None
-    first_col_width = max(first_col_width, len('Benchmark'))
+    assert utest is True or utest is False
+
+    def get_color(res):
+        if res > 0.05:
+            return BC_FAIL
+        elif res > -0.07:
+            return BC_WHITE
+        else:
+            return BC_CYAN
+
+    first_col_width = find_longest_name(json_diff_report)
+    first_col_width = max(
+        first_col_width,
+        len('Benchmark'))
+    first_col_width += len(UTEST_COL_NAME)
     first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
         'Benchmark', 12 + first_col_width)
     output_strs = [first_line, '-' * len(first_line)]
 
-    gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn)
-    for bn in gen:
-        other_bench = find_test(bn['name'])
-        if not other_bench:
-            continue
+    fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
+    for benchmark in json_diff_report:
+        # *If* we were asked to only include aggregates,
+        # and if it is non-aggregate, then don't print it.
+        if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
+            for measurement in benchmark['measurements']:
+                output_strs += [color_format(use_color,
+                                            fmt_str,
+                                            BC_HEADER,
+                                            benchmark['name'],
+                                            first_col_width,
+                                            get_color(measurement['time']),
+                                            measurement['time'],
+                                            get_color(measurement['cpu']),
+                                            measurement['cpu'],
+                                            measurement['real_time'],
+                                            measurement['real_time_other'],
+                                            measurement['cpu_time'],
+                                            measurement['cpu_time_other'],
+                                            endc=BC_ENDC)]
 
-        if bn['time_unit'] != other_bench['time_unit']:
-            continue
+        # After processing the measurements, if requested and
+        # if applicable (e.g. u-test exists for given benchmark),
+        # print the U test.
+        if utest and benchmark['utest']:
+            output_strs += print_utest(benchmark['name'],
+                                       benchmark['utest'],
+                                       utest_alpha=utest_alpha,
+                                       first_col_width=first_col_width,
+                                       use_color=use_color)
 
-        def get_color(res):
-            if res > 0.05:
-                return BC_FAIL
-            elif res > -0.07:
-                return BC_WHITE
-            else:
-                return BC_CYAN
-        fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
-        tres = calculate_change(bn['real_time'], other_bench['real_time'])
-        cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
-        output_strs += [color_format(use_color, fmt_str,
-            BC_HEADER, bn['name'], first_col_width,
-            get_color(tres), tres, get_color(cpures), cpures,
-            bn['real_time'], other_bench['real_time'],
-            bn['cpu_time'], other_bench['cpu_time'],
-            endc=BC_ENDC)]
     return output_strs
 
+
 ###############################################################################
 # Unit tests
 
-import unittest
 
-class TestReportDifference(unittest.TestCase):
+class TestGetUniqueBenchmarkNames(unittest.TestCase):
     def load_results(self):
         import json
-        testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs')
-        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
-        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
-        with open(testOutput1, 'r') as f:
-            json1 = json.load(f)
-        with open(testOutput2, 'r') as f:
-            json2 = json.load(f)
-        return json1, json2
+        testInputs = os.path.join(
+            os.path.dirname(
+                os.path.realpath(__file__)),
+            'Inputs')
+        testOutput = os.path.join(testInputs, 'test3_run0.json')
+        with open(testOutput, 'r') as f:
+            json = json.load(f)
+        return json
 
     def test_basic(self):
+        expect_lines = [
+            'BM_One',
+            'BM_Two',
+            'short',  # These two are not sorted
+            'medium',  # These two are not sorted
+        ]
+        json = self.load_results()
+        output_lines = get_unique_benchmark_names(json)
+        print("\n")
+        print("\n".join(output_lines))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            self.assertEqual(expect_lines[i], output_lines[i])
+
+
+class TestReportDifference(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test1_run1.json')
+            testOutput2 = os.path.join(testInputs, 'test1_run2.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(json1, json2)
+
+    def test_json_diff_report_pretty_printing(self):
         expect_lines = [
             ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
             ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
@@ -153,51 +395,592 @@ def test_basic(self):
             ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
             ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
             ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
-            ['BM_100xSlower', '+99.0000', '+99.0000', '100', '10000', '100', '10000'],
-            ['BM_100xFaster', '-0.9900', '-0.9900', '10000', '100', '10000', '100'],
-            ['BM_10PercentCPUToTime', '+0.1000', '-0.1000', '100', '110', '100', '90'],
+            ['BM_100xSlower', '+99.0000', '+99.0000',
+                '100', '10000', '100', '10000'],
+            ['BM_100xFaster', '-0.9900', '-0.9900',
+                '10000', '100', '10000', '100'],
+            ['BM_10PercentCPUToTime', '+0.1000',
+                '-0.1000', '100', '110', '100', '90'],
             ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
-            ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
+            ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
         ]
-        json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
         output_lines = output_lines_with_header[2:]
+        print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
         for i in range(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
             self.assertEqual(len(parts), 7)
-            self.assertEqual(parts, expect_lines[i])
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report_output(self):
+        expected_output = [
+            {
+                'name': 'BM_SameTimes',
+                'measurements': [{'time': 0.0000, 'cpu': 0.0000, 'real_time': 10, 'real_time_other': 10, 'cpu_time': 10, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xFaster',
+                'measurements': [{'time': -0.5000, 'cpu': -0.5000, 'real_time': 50, 'real_time_other': 25, 'cpu_time': 50, 'cpu_time_other': 25}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xSlower',
+                'measurements': [{'time': 1.0000, 'cpu': 1.0000, 'real_time': 50, 'real_time_other': 100, 'cpu_time': 50, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentFaster',
+                'measurements': [{'time': -0.0100, 'cpu': -0.0100, 'real_time': 100, 'real_time_other': 98.9999999, 'cpu_time': 100, 'cpu_time_other': 98.9999999}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentSlower',
+                'measurements': [{'time': 0.0100, 'cpu': 0.0100, 'real_time': 100, 'real_time_other': 101, 'cpu_time': 100, 'cpu_time_other': 101}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentFaster',
+                'measurements': [{'time': -0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 90, 'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentSlower',
+                'measurements': [{'time': 0.1000, 'cpu': 0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 110}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xSlower',
+                'measurements': [{'time': 99.0000, 'cpu': 99.0000, 'real_time': 100, 'real_time_other': 10000, 'cpu_time': 100, 'cpu_time_other': 10000}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xFaster',
+                'measurements': [{'time': -0.9900, 'cpu': -0.9900, 'real_time': 10000, 'real_time_other': 100, 'cpu_time': 10000, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentCPUToTime',
+                'measurements': [{'time': 0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_ThirdFaster',
+                'measurements': [{'time': -0.3333, 'cpu': -0.3334, 'real_time': 100, 'real_time_other': 67, 'cpu_time': 100, 'cpu_time_other': 67}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_NotBadTimeUnit',
+                'measurements': [{'time': -0.9000, 'cpu': 0.2000, 'real_time': 0.4, 'real_time_other': 0.04, 'cpu_time': 0.5, 'cpu_time_other': 0.6}],
+                'time_unit': 's',
+                'utest': {}
+            },
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
 
 
 class TestReportDifferenceBetweenFamilies(unittest.TestCase):
-    def load_result(self):
-        import json
-        testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs')
-        testOutput = os.path.join(testInputs, 'test2_run.json')
-        with open(testOutput, 'r') as f:
-            json = json.load(f)
-        return json
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test2_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
 
-    def test_basic(self):
+        json = load_result()
+        json1 = filter_benchmark(json, "BM_Z.ro", ".")
+        json2 = filter_benchmark(json, "BM_O.e", ".")
+        cls.json_diff_report = get_difference_report(json1, json2)
+
+    def test_json_diff_report_pretty_printing(self):
         expect_lines = [
             ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
             ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
             ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
             ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
         ]
-        json = self.load_result()
-        json1 = filter_benchmark(json, "BM_Z.ro", ".")
-        json2 = filter_benchmark(json, "BM_O.e", ".")
-        output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
         output_lines = output_lines_with_header[2:]
-        print "\n"
+        print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
         for i in range(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
             self.assertEqual(len(parts), 7)
-            self.assertEqual(parts, expect_lines[i])
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 10, 'real_time_other': 5, 'cpu_time': 10, 'cpu_time_other': 5}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'./4',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 40, 'real_time_other': 20, 'cpu_time': 40, 'cpu_time_other': 20}],
+                'time_unit': 'ns',
+                'utest': {},
+            },
+            {
+                'name': u'Prefix/.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 20, 'real_time_other': 10, 'cpu_time': 20, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'Prefix/./3',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceWithUTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '0.6985',
+             '0.6985',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report_pretty_printing_aggregates_only(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two_pvalue',
+             '0.6985',
+             '0.6985',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'time': -0.375,
+                     'cpu': -0.3375,
+                     'real_time': 8,
+                     'real_time_other': 5,
+                     'cpu_time': 80,
+                     'cpu_time_other': 53}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
+        unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '0.6985',
+             '0.6985',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+             ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report,
+            utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'real_time_other': 5,
+                     'cpu_time': 80,
+                     'time': -0.375,
+                     'real_time': 8,
+                     'cpu_time_other': 53,
+                     'cpu': -0.3375
+                    }
+                ],
+                'utest': {},
+                'time_unit': u'ns',
+                'aggregate_name': ''
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportSorting(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test4_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
+
+        cls.json = load_result()
+
+    def test_json_diff_report_pretty_printing(self):
+        import util
+
+        expected_names = [
+            "99 family 0 instance 0 repetition 0",
+            "98 family 0 instance 0 repetition 1",
+            "97 family 0 instance 0 aggregate",
+            "96 family 0 instance 1 repetition 0",
+            "95 family 0 instance 1 repetition 1",
+            "94 family 0 instance 1 aggregate",
+            "93 family 1 instance 0 repetition 0",
+            "92 family 1 instance 0 repetition 1",
+            "91 family 1 instance 0 aggregate",
+            "90 family 1 instance 1 repetition 0",
+            "89 family 1 instance 1 repetition 1",
+            "88 family 1 instance 1 aggregate"
+        ]
+
+        for n in range(len(self.json['benchmarks']) ** 2):
+            random.shuffle(self.json['benchmarks'])
+            sorted_benchmarks = util.sort_benchmark_results(self.json)[
+                'benchmarks']
+            self.assertEqual(len(expected_names), len(sorted_benchmarks))
+            for out, expected in zip(sorted_benchmarks, expected_names):
+                self.assertEqual(out['name'], expected)
+
+
+def assert_utest(unittest_instance, lhs, rhs):
+    if lhs['utest']:
+        unittest_instance.assertAlmostEqual(
+            lhs['utest']['cpu_pvalue'],
+            rhs['utest']['cpu_pvalue'])
+        unittest_instance.assertAlmostEqual(
+            lhs['utest']['time_pvalue'],
+            rhs['utest']['time_pvalue'])
+        unittest_instance.assertEqual(
+            lhs['utest']['have_optimal_repetitions'],
+            rhs['utest']['have_optimal_repetitions'])
+    else:
+        # lhs is empty. assert if rhs is not.
+        unittest_instance.assertEqual(lhs['utest'], rhs['utest'])
+
+
+def assert_measurements(unittest_instance, lhs, rhs):
+    for m1, m2 in zip(lhs['measurements'], rhs['measurements']):
+        unittest_instance.assertEqual(m1['real_time'], m2['real_time'])
+        unittest_instance.assertEqual(m1['cpu_time'], m2['cpu_time'])
+        # m1['time'] and m1['cpu'] hold values which are being calculated,
+        # and therefore we must use almost-equal pattern.
+        unittest_instance.assertAlmostEqual(m1['time'], m2['time'], places=4)
+        unittest_instance.assertAlmostEqual(m1['cpu'], m2['cpu'], places=4)
 
 
 if __name__ == '__main__':
diff --git a/tools/gbench/util.py b/tools/gbench/util.py
index 07c2377275..5d0012c0cb 100644
--- a/tools/gbench/util.py
+++ b/tools/gbench/util.py
@@ -5,13 +5,16 @@
 import tempfile
 import subprocess
 import sys
+import functools
 
 # Input file type enumeration
-IT_Invalid    = 0
-IT_JSON       = 1
+IT_Invalid = 0
+IT_JSON = 1
 IT_Executable = 2
 
 _num_magic_bytes = 2 if sys.platform.startswith('win') else 4
+
+
 def is_executable_file(filename):
     """
     Return 'True' if 'filename' names a valid file which is likely
@@ -46,7 +49,7 @@ def is_json_file(filename):
         with open(filename, 'r') as f:
             json.load(f)
         return True
-    except:
+    except BaseException:
         pass
     return False
 
@@ -84,6 +87,7 @@ def check_input_file(filename):
         sys.exit(1)
     return ftype
 
+
 def find_benchmark_flag(prefix, benchmark_flags):
     """
     Search the specified list of flags for a flag matching `<prefix><arg>` and
@@ -97,6 +101,7 @@ def find_benchmark_flag(prefix, benchmark_flags):
             result = f[len(prefix):]
     return result
 
+
 def remove_benchmark_flags(prefix, benchmark_flags):
     """
     Return a new list containing the specified benchmark_flags except those
@@ -105,6 +110,7 @@ def remove_benchmark_flags(prefix, benchmark_flags):
     assert prefix.startswith('--') and prefix.endswith('=')
     return [f for f in benchmark_flags if not f.startswith(prefix)]
 
+
 def load_benchmark_results(fname):
     """
     Read benchmark output from a file and return the JSON object.
@@ -114,6 +120,23 @@ def load_benchmark_results(fname):
         return json.load(f)
 
 
+def sort_benchmark_results(result):
+    benchmarks = result['benchmarks']
+
+    # From inner key to the outer key!
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['repetition_index'] if 'repetition_index' in benchmark else -1)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: 1 if 'run_type' in benchmark and benchmark['run_type'] == "aggregate" else 0)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['per_family_instance_index'] if 'per_family_instance_index' in benchmark else -1)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['family_index'] if 'family_index' in benchmark else -1)
+
+    result['benchmarks'] = benchmarks
+    return result
+
+
 def run_benchmark(exe_name, benchmark_flags):
     """
     Run a benchmark specified by 'exe_name' with the specified
@@ -129,7 +152,7 @@ def run_benchmark(exe_name, benchmark_flags):
         thandle, output_name = tempfile.mkstemp()
         os.close(thandle)
         benchmark_flags = list(benchmark_flags) + \
-                          ['--benchmark_out=%s' % output_name]
+            ['--benchmark_out=%s' % output_name]
 
     cmd = [exe_name] + benchmark_flags
     print("RUNNING: %s" % ' '.join(cmd))
@@ -153,7 +176,6 @@ def run_or_load_benchmark(filename, benchmark_flags):
     ftype = check_input_file(filename)
     if ftype == IT_JSON:
         return load_benchmark_results(filename)
-    elif ftype == IT_Executable:
+    if ftype == IT_Executable:
         return run_benchmark(filename, benchmark_flags)
-    else:
-        assert False # This branch is unreachable
\ No newline at end of file
+    raise ValueError('Unknown file type %s' % ftype)
diff --git a/tools/requirements.txt b/tools/requirements.txt
new file mode 100644
index 0000000000..3b3331b5af
--- /dev/null
+++ b/tools/requirements.txt
@@ -0,0 +1 @@
+scipy>=1.5.0
\ No newline at end of file
diff --git a/tools/strip_asm.py b/tools/strip_asm.py
new file mode 100755
index 0000000000..9030550b43
--- /dev/null
+++ b/tools/strip_asm.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+
+"""
+strip_asm.py - Cleanup ASM output for the specified file
+"""
+
+from argparse import ArgumentParser
+import sys
+import os
+import re
+
+def find_used_labels(asm):
+    found = set()
+    label_re = re.compile("\s*j[a-z]+\s+\.L([a-zA-Z0-9][a-zA-Z0-9_]*)")
+    for l in asm.splitlines():
+        m = label_re.match(l)
+        if m:
+            found.add('.L%s' % m.group(1))
+    return found
+
+
+def normalize_labels(asm):
+    decls = set()
+    label_decl = re.compile("^[.]{0,1}L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
+    for l in asm.splitlines():
+        m = label_decl.match(l)
+        if m:
+            decls.add(m.group(0))
+    if len(decls) == 0:
+        return asm
+    needs_dot = next(iter(decls))[0] != '.'
+    if not needs_dot:
+        return asm
+    for ld in decls:
+        asm = re.sub("(^|\s+)" + ld + "(?=:|\s)", '\\1.' + ld, asm)
+    return asm
+
+
+def transform_labels(asm):
+    asm = normalize_labels(asm)
+    used_decls = find_used_labels(asm)
+    new_asm = ''
+    label_decl = re.compile("^\.L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
+    for l in asm.splitlines():
+        m = label_decl.match(l)
+        if not m or m.group(0) in used_decls:
+            new_asm += l
+            new_asm += '\n'
+    return new_asm
+
+
+def is_identifier(tk):
+    if len(tk) == 0:
+        return False
+    first = tk[0]
+    if not first.isalpha() and first != '_':
+        return False
+    for i in range(1, len(tk)):
+        c = tk[i]
+        if not c.isalnum() and c != '_':
+            return False
+    return True
+
+def process_identifiers(l):
+    """
+    process_identifiers - process all identifiers and modify them to have
+    consistent names across all platforms; specifically across ELF and MachO.
+    For example, MachO inserts an additional understore at the beginning of
+    names. This function removes that.
+    """
+    parts = re.split(r'([a-zA-Z0-9_]+)', l)
+    new_line = ''
+    for tk in parts:
+        if is_identifier(tk):
+            if tk.startswith('__Z'):
+                tk = tk[1:]
+            elif tk.startswith('_') and len(tk) > 1 and \
+                    tk[1].isalpha() and tk[1] != 'Z':
+                tk = tk[1:]
+        new_line += tk
+    return new_line
+
+
+def process_asm(asm):
+    """
+    Strip the ASM of unwanted directives and lines
+    """
+    new_contents = ''
+    asm = transform_labels(asm)
+
+    # TODO: Add more things we want to remove
+    discard_regexes = [
+        re.compile("\s+\..*$"), # directive
+        re.compile("\s*#(NO_APP|APP)$"), #inline ASM
+        re.compile("\s*#.*$"), # comment line
+        re.compile("\s*\.globa?l\s*([.a-zA-Z_][a-zA-Z0-9$_.]*)"), #global directive
+        re.compile("\s*\.(string|asciz|ascii|[1248]?byte|short|word|long|quad|value|zero)"),
+    ]
+    keep_regexes = [
+
+    ]
+    fn_label_def = re.compile("^[a-zA-Z_][a-zA-Z0-9_.]*:")
+    for l in asm.splitlines():
+        # Remove Mach-O attribute
+        l = l.replace('@GOTPCREL', '')
+        add_line = True
+        for reg in discard_regexes:
+            if reg.match(l) is not None:
+                add_line = False
+                break
+        for reg in keep_regexes:
+            if reg.match(l) is not None:
+                add_line = True
+                break
+        if add_line:
+            if fn_label_def.match(l) and len(new_contents) != 0:
+                new_contents += '\n'
+            l = process_identifiers(l)
+            new_contents += l
+            new_contents += '\n'
+    return new_contents
+
+def main():
+    parser = ArgumentParser(
+        description='generate a stripped assembly file')
+    parser.add_argument(
+        'input', metavar='input', type=str, nargs=1,
+        help='An input assembly file')
+    parser.add_argument(
+        'out', metavar='output', type=str, nargs=1,
+        help='The output file')
+    args, unknown_args = parser.parse_known_args()
+    input = args.input[0]
+    output = args.out[0]
+    if not os.path.isfile(input):
+        print(("ERROR: input file '%s' does not exist") % input)
+        sys.exit(1)
+    contents = None
+    with open(input, 'r') as f:
+        contents = f.read()
+    new_contents = process_asm(contents)
+    with open(output, 'w') as f:
+        f.write(new_contents)
+
+
+if __name__ == '__main__':
+    main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;