From 3f2cd66c1cc9dc07cb8b64320956d701eecf20ab Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 25 Nov 2024 11:59:03 -0500 Subject: [PATCH 1/2] adding actual benchmarks to the project --- .github/workflows/ubuntu24.yml | 2 +- CMakeLists.txt | 11 + benchmarks/CMakeLists.txt | 26 + benchmarks/apple_arm_events.h | 1117 ++++++++++++++++++++++++++++++++ benchmarks/benchmark.cpp | 247 +++++++ benchmarks/event_counter.h | 152 +++++ benchmarks/linux-perf-events.h | 103 +++ 7 files changed, 1657 insertions(+), 1 deletion(-) create mode 100644 benchmarks/CMakeLists.txt create mode 100644 benchmarks/apple_arm_events.h create mode 100644 benchmarks/benchmark.cpp create mode 100644 benchmarks/event_counter.h create mode 100644 benchmarks/linux-perf-events.h diff --git a/.github/workflows/ubuntu24.yml b/.github/workflows/ubuntu24.yml index 0a327b34..8da4c5fc 100644 --- a/.github/workflows/ubuntu24.yml +++ b/.github/workflows/ubuntu24.yml @@ -11,7 +11,7 @@ jobs: run: | mkdir build && cd build && - CXXFLAGS=-Werror cmake -DFASTFLOAT_TEST=ON .. && + CXXFLAGS=-Werror cmake -DFASTFLOAT_TEST=ON -D FASTFLOAT_BENCHMARKS=ON .. && cmake --build . && ctest --output-on-failure - name: Use cmake CXX23 diff --git a/CMakeLists.txt b/CMakeLists.txt index e331b15e..94fc0b7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ project(fast_float VERSION 7.0.0 LANGUAGES CXX) set(FASTFLOAT_CXX_STANDARD 11 CACHE STRING "the C++ standard to use for fastfloat") set(CMAKE_CXX_STANDARD ${FASTFLOAT_CXX_STANDARD}) option(FASTFLOAT_TEST "Enable tests" OFF) + if(FASTFLOAT_TEST) enable_testing() add_subdirectory(tests) @@ -29,6 +30,16 @@ if(FASTFLOAT_INSTALL) endif() add_library(fast_float INTERFACE) + + +option(FASTFLOAT_BENCHMARKS "Enable benchmarks" OFF) +if(FASTFLOAT_BENCHMARKS) + add_subdirectory(benchmarks) +else(FASTFLOAT_BENCHMARKS) + message(STATUS "Benchmarks are disabled. 
Set FASTFLOAT_BENCHMARKS to ON to build benchmarks (assumes C++17).") +endif(FASTFLOAT_BENCHMARKS) + + add_library(FastFloat::fast_float ALIAS fast_float) target_include_directories( fast_float diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 00000000..b4e03954 --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,26 @@ +add_executable(realbenchmark benchmark.cpp) +set_property( + TARGET realbenchmark + PROPERTY CXX_STANDARD 17) + +target_link_libraries(realbenchmark PUBLIC fast_float) +include(ExternalProject) + +# Define the external project +ExternalProject_Add(simple_fastfloat_benchmark + GIT_REPOSITORY https://github.com/lemire/simple_fastfloat_benchmark.git + GIT_TAG master # or specify a particular commit/tag/branch + SOURCE_DIR ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark + BINARY_DIR ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark-build + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" +) +set(DATA_DIR ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark/data) + +add_custom_target(CopyData ALL + COMMAND ${CMAKE_COMMAND} -E copy_directory ${DATA_DIR} ${CMAKE_CURRENT_BINARY_DIR}/data + DEPENDS simple_fastfloat_benchmark +) +add_dependencies(realbenchmark CopyData) +target_compile_definitions(realbenchmark PUBLIC BENCHMARK_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data") diff --git a/benchmarks/apple_arm_events.h b/benchmarks/apple_arm_events.h new file mode 100644 index 00000000..3a940811 --- /dev/null +++ b/benchmarks/apple_arm_events.h @@ -0,0 +1,1117 @@ +// Original design from: +// ============================================================================= +// XNU kperf/kpc +// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges +// +// References: +// +// XNU source (since xnu 2422.1.72): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h +// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c +// +// Lightweight PET (Profile Every Thread, since xnu 
3789.1.32): +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c +// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c +// +// System Private frameworks (since macOS 10.11, iOS 8.0): +// /System/Library/PrivateFrameworks/kperf.framework +// /System/Library/PrivateFrameworks/kperfdata.framework +// +// Xcode framework (since Xcode 7.0): +// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework +// +// CPU database (plist files) +// macOS (since macOS 10.11): +// /usr/share/kpep/.plist +// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0): +// /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform +// /DeviceSupport//DeveloperDiskImage.dmg/usr/share/kpep/.plist +// +// +// Created by YaoYuan on 2021. +// Released into the public domain (unlicense.org). +// ============================================================================= + +#ifndef M1CYCLES_H +#define M1CYCLES_H + +#include +#include +#include +#include +#include + +#include // for dlopen() and dlsym() +#include // for mach_absolute_time() +#include // for kdebug trace decode +#include // for sysctl() +#include // for usleep() + +struct performance_counters { + double cycles; + double branches; + double missed_branches; + double instructions; + performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i) + : cycles(c), branches(b), missed_branches(m), instructions(i) {} + performance_counters(double c, double b, double m, double i) + : cycles(c), branches(b), missed_branches(m), instructions(i) {} + performance_counters(double init) + : cycles(init), branches(init), missed_branches(init), + instructions(init) {} + + inline performance_counters &operator-=(const performance_counters &other) { + cycles -= other.cycles; + branches -= other.branches; + missed_branches -= other.missed_branches; + instructions -= other.instructions; + return *this; + } + inline performance_counters &min(const performance_counters &other) 
{ + cycles = other.cycles < cycles ? other.cycles : cycles; + branches = other.branches < branches ? other.branches : branches; + missed_branches = other.missed_branches < missed_branches + ? other.missed_branches + : missed_branches; + instructions = + other.instructions < instructions ? other.instructions : instructions; + return *this; + } + inline performance_counters &operator+=(const performance_counters &other) { + cycles += other.cycles; + branches += other.branches; + missed_branches += other.missed_branches; + instructions += other.instructions; + return *this; + } + + inline performance_counters &operator/=(double numerator) { + cycles /= numerator; + branches /= numerator; + missed_branches /= numerator; + instructions /= numerator; + return *this; + } +}; + +inline performance_counters operator-(const performance_counters &a, + const performance_counters &b) { + return performance_counters(a.cycles - b.cycles, a.branches - b.branches, + a.missed_branches - b.missed_branches, + a.instructions - b.instructions); +} + + + +typedef float f32; +typedef double f64; +typedef int8_t i8; +typedef uint8_t u8; +typedef int16_t i16; +typedef uint16_t u16; +typedef int32_t i32; +typedef uint32_t u32; +typedef int64_t i64; +typedef uint64_t u64; +typedef size_t usize; + +// ----------------------------------------------------------------------------- +// header (reverse engineered) +// This framework wraps some sysctl calls to communicate with the kpc in kernel. +// Most functions requires root privileges, or process is "blessed". +// ----------------------------------------------------------------------------- + +// Cross-platform class constants. +#define KPC_CLASS_FIXED (0) +#define KPC_CLASS_CONFIGURABLE (1) +#define KPC_CLASS_POWER (2) +#define KPC_CLASS_RAWPMU (3) + +// Cross-platform class mask constants. 
#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED)               // 1
#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2
#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER)               // 4
#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU)             // 8

// PMU version constants.
#define KPC_PMU_ERROR (0)     // Error
#define KPC_PMU_INTEL_V3 (1)  // Intel
#define KPC_PMU_ARM_APPLE (2) // ARM64
#define KPC_PMU_INTEL_V2 (3)  // Old Intel
#define KPC_PMU_ARM_V2 (4)    // Old ARM

// The maximum number of counters we could read from every class in one go.
// ARMV7: FIXED: 1, CONFIGURABLE: 4
// ARM32: FIXED: 2, CONFIGURABLE: 6
// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8)
// x86: 32
#define KPC_MAX_COUNTERS 32

// Bits for defining what to do on an action.
// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h
#define KPERF_SAMPLER_TH_INFO (1U << 0)
#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1)
#define KPERF_SAMPLER_KSTACK (1U << 2)
#define KPERF_SAMPLER_USTACK (1U << 3)
#define KPERF_SAMPLER_PMC_THREAD (1U << 4)
#define KPERF_SAMPLER_PMC_CPU (1U << 5)
#define KPERF_SAMPLER_PMC_CONFIG (1U << 6)
#define KPERF_SAMPLER_MEMINFO (1U << 7)
#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8)
#define KPERF_SAMPLER_TH_DISPATCH (1U << 9)
#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10)
#define KPERF_SAMPLER_SYS_MEM (1U << 11)
#define KPERF_SAMPLER_TH_INSCYC (1U << 12)
#define KPERF_SAMPLER_TK_INFO (1U << 13)

// Maximum number of kperf action ids.
#define KPERF_ACTION_MAX (32)

// Maximum number of kperf timer ids.
#define KPERF_TIMER_MAX (8)

// x86/arm config registers are 64-bit
typedef u64 kpc_config_t;

// NOTE: every `static ... (*name)(...)` declaration below is a function
// pointer, not a function. Each one is listed in `lib_symbols_kperf` and is
// assigned by lib_init() through dlsym(); lib_deinit() resets it to NULL.
// Do not call any of them unless lib_init() has returned true.

/// Print current CPU identification string to the buffer (same as snprintf),
/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC
/// database in /usr/share/kpep.
/// @return string's length, or negative value if error occurs.
/// @note This method does not require root privileges.
/// @details sysctl get(hw.cputype), get(hw.cpusubtype),
///          get(hw.cpufamily), get(machdep.cpu.model)
static int (*kpc_cpu_string)(char *buf, usize buf_size);

/// Get the version of KPC that's being run.
/// @return See `PMU version constants` above.
/// @details sysctl get(kpc.pmu_version)
static u32 (*kpc_pmu_version)(void);

/// Get running PMC classes.
/// @return See `class mask constants` above,
///         0 if error occurs or no class is set.
/// @details sysctl get(kpc.counting)
static u32 (*kpc_get_counting)(void);

/// Set PMC classes to enable counting.
/// @param classes See `class mask constants` above, set 0 to shutdown counting.
/// @return 0 for success.
/// @details sysctl set(kpc.counting)
static int (*kpc_set_counting)(u32 classes);

/// Get running PMC classes for current thread.
/// @return See `class mask constants` above,
///         0 if error occurs or no class is set.
/// @details sysctl get(kpc.thread_counting)
static u32 (*kpc_get_thread_counting)(void);

/// Set PMC classes to enable counting for current thread.
/// @param classes See `class mask constants` above, set 0 to shutdown counting.
/// @return 0 for success.
/// @details sysctl set(kpc.thread_counting)
static int (*kpc_set_thread_counting)(u32 classes);

/// Get how many config registers there are for a given mask.
/// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`,
/// and 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
/// @param classes See `class mask constants` above.
/// @return 0 if error occurs or no class is set.
/// @note This method does not require root privileges.
/// @details sysctl get(kpc.config_count)
static u32 (*kpc_get_config_count)(u32 classes);

/// Get config registers.
/// @param classes see `class mask constants` above.
/// @param config Config buffer to receive values, should not be smaller than
///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
/// @return 0 for success.
/// @details sysctl get(kpc.config_count), get(kpc.config)
static int (*kpc_get_config)(u32 classes, kpc_config_t *config);

/// Set config registers.
/// @param classes see `class mask constants` above.
/// @param config Config buffer, should not be smaller than
///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
/// @return 0 for success.
/// @details sysctl get(kpc.config_count), set(kpc.config)
static int (*kpc_set_config)(u32 classes, kpc_config_t *config);

/// Get how many counters there are for a given mask.
/// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`,
/// and 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
/// @param classes See `class mask constants` above.
/// @note This method does not require root privileges.
/// @details sysctl get(kpc.counter_count)
static u32 (*kpc_get_counter_count)(u32 classes);

/// Get counter accumulations.
/// If `all_cpus` is true, the buffer count should not be smaller than
/// (cpu_count * counter_count). Otherwise, the buffer count should not be
/// smaller than (counter_count).
/// @see kpc_get_counter_count(), kpc_cpu_count().
/// @param all_cpus true for all CPUs, false for current cpu.
/// @param classes See `class mask constants` above.
/// @param curcpu A pointer to receive current cpu id, can be NULL.
/// @param buf Buffer to receive counter's value.
/// @return 0 for success.
/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters)
static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu,
                                   u64 *buf);

/// Get counter accumulations for current thread.
/// @param tid Thread id, should be 0.
/// @param buf_count The number of buf's elements (not bytes),
///                  should not be smaller than kpc_get_counter_count().
/// @param buf Buffer to receive counter's value.
/// @return 0 for success.
/// @details sysctl get(kpc.thread_counters)
static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf);

/// Acquire/release the counters used by the Power Manager.
/// @param val 1:acquire, 0:release
/// @return 0 for success.
/// @details sysctl set(kpc.force_all_ctrs)
static int (*kpc_force_all_ctrs_set)(int val);

/// Get the state of all_ctrs.
/// @return 0 for success.
/// @details sysctl get(kpc.force_all_ctrs)
static int (*kpc_force_all_ctrs_get)(int *val_out);

/// Set number of actions, should be `KPERF_ACTION_MAX`.
/// @details sysctl set(kperf.action.count)
static int (*kperf_action_count_set)(u32 count);

/// Get number of actions.
/// @details sysctl get(kperf.action.count)
static int (*kperf_action_count_get)(u32 *count);

/// Set what to sample when a trigger fires an action, e.g.
/// `KPERF_SAMPLER_PMC_CPU`.
/// @details sysctl set(kperf.action.samplers)
static int (*kperf_action_samplers_set)(u32 actionid, u32 sample);

/// Get what to sample when a trigger fires an action.
/// @details sysctl get(kperf.action.samplers)
static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample);

/// Apply a task filter to the action, -1 to disable filter.
/// @details sysctl set(kperf.action.filter_by_task)
static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port);

/// Apply a pid filter to the action, -1 to disable filter.
/// @details sysctl set(kperf.action.filter_by_pid)
static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid);

/// Set number of time triggers, should be `KPERF_TIMER_MAX`.
/// @details sysctl set(kperf.timer.count)
static int (*kperf_timer_count_set)(u32 count);

/// Get number of time triggers.
/// @details sysctl get(kperf.timer.count)
static int (*kperf_timer_count_get)(u32 *count);

/// Set timer number and period.
/// @details sysctl set(kperf.timer.period)
static int (*kperf_timer_period_set)(u32 actionid, u64 tick);

/// Get timer number and period.
/// @details sysctl get(kperf.timer.period)
static int (*kperf_timer_period_get)(u32 actionid, u64 *tick);

/// Set timer number and actionid.
/// @details sysctl set(kperf.timer.action)
static int (*kperf_timer_action_set)(u32 actionid, u32 timerid);

/// Get timer number and actionid.
/// @details sysctl get(kperf.timer.action)
static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid);

/// Set which timer ID does PET (Profile Every Thread).
/// @details sysctl set(kperf.timer.pet_timer)
static int (*kperf_timer_pet_set)(u32 timerid);

/// Get which timer ID does PET (Profile Every Thread).
/// @details sysctl get(kperf.timer.pet_timer)
static int (*kperf_timer_pet_get)(u32 *timerid);

/// Enable or disable sampling.
/// @details sysctl set(kperf.sampling)
static int (*kperf_sample_set)(u32 enabled);

/// Get whether sampling is currently enabled.
/// @details sysctl get(kperf.sampling)
static int (*kperf_sample_get)(u32 *enabled);

/// Reset kperf: stop sampling, kdebug, timers and actions.
/// @return 0 for success.
static int (*kperf_reset)(void);

/// Nanoseconds to CPU ticks.
static u64 (*kperf_ns_to_ticks)(u64 ns);

/// CPU ticks to nanoseconds.
static u64 (*kperf_ticks_to_ns)(u64 ticks);

/// CPU ticks frequency (mach_absolute_time).
static u64 (*kperf_tick_frequency)(void);

/// Get lightweight PET mode (not in kperf.framework).
/// Unlike the pointers above this is a real function: it queries the
/// "kperf.lightweight_pet" sysctl directly.
/// @param enabled Receives the current value; must not be NULL.
/// @return -1 if `enabled` is NULL, otherwise the result of sysctlbyname().
static int kperf_lightweight_pet_get(u32 *enabled) {
  if (!enabled)
    return -1;
  // 4 == sizeof(u32): byte size of the buffer that receives the value.
  usize size = 4;
  return sysctlbyname("kperf.lightweight_pet", enabled, &size, NULL, 0);
}

/// Set lightweight PET mode (not in kperf.framework).
static int kperf_lightweight_pet_set(u32 enabled) {
  // Write-only sysctl: no old-value buffer, new value is the 4-byte `enabled`.
  return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, 4);
}

// -----------------------------------------------------------------------------
// kperfdata.framework header (reverse engineered)
// This framework provides some functions to access the local CPU database.
// These functions do not require root privileges.
// -----------------------------------------------------------------------------

// KPEP CPU architecture constants.
#define KPEP_ARCH_I386 0
#define KPEP_ARCH_X86_64 1
#define KPEP_ARCH_ARM 2
#define KPEP_ARCH_ARM64 3

// NOTE: the three structs below mirror data owned by the private framework.
// Field order, types and sizes must stay exactly as declared here.

/// KPEP event (size: 48/28 bytes on 64/32 bit OS)
typedef struct kpep_event {
  const char *name;        ///< Unique name of an event, such as "INST_RETIRED.ANY".
  const char *description; ///< Description for this event.
  const char *errata;      ///< Errata, currently NULL.
  const char *alias;       ///< Alias name, such as "Instructions", "Cycles".
  const char *fallback;    ///< Fallback event name for fixed counter.
  u32 mask;
  u8 number;
  u8 umask;
  u8 reserved;
  u8 is_fixed;
} kpep_event;

/// KPEP database (size: 144/80 bytes on 64/32 bit OS)
typedef struct kpep_db {
  const char *name;           ///< Database name, such as "haswell".
  const char *cpu_id;         ///< Plist name, such as "cpu_7_8_10b282dc".
  const char *marketing_name; ///< Marketing name, such as "Intel Haswell".
  void *plist_data;           ///< Plist data (CFDataRef), currently NULL.
  void *event_map;            ///< All events (CFDict).
  kpep_event
      *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count).
  kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *)
                                ///< * fixed_counter_count)
  void *alias_map;              ///< All aliases (CFDict).
  usize reserved_1;
  usize reserved_2;
  usize reserved_3;
  usize event_count; ///< All events count.
  usize alias_count;
  usize fixed_counter_count;
  usize config_counter_count;
  usize power_counter_count;
  u32 archtecture; ///< see `KPEP CPU architecture constants` above.
                   ///< (field name is misspelled in this header; kept as is)
  u32 fixed_counter_bits;
  u32 config_counter_bits;
  u32 power_counter_bits;
} kpep_db;

/// KPEP config (size: 80/44 bytes on 64/32 bit OS)
typedef struct kpep_config {
  kpep_db *db;
  kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL
  usize *ev_map;       ///< (sizeof(usize *) * counter_count), init 0
  usize *ev_idx;       ///< (sizeof(usize *) * counter_count), init -1
  u32 *flags;          ///< (sizeof(u32 *) * counter_count), init 0
  u64 *kpc_periods;    ///< (sizeof(u64 *) * counter_count), init 0
  usize event_count;   ///< kpep_config_events_count()
  usize counter_count;
  u32 classes; ///< See `class mask constants` above.
  u32 config_counter;
  u32 power_counter;
  u32 reserved;
} kpep_config;

/// Error code for kpep_config_xxx() and kpep_db_xxx() functions.
/// The numeric values double as indices into `kpep_config_error_names`, and
/// KPEP_CONFIG_ERROR_MAX is that table's length, so keep them contiguous.
typedef enum {
  KPEP_CONFIG_ERROR_NONE = 0,
  KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1,
  KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2,
  KPEP_CONFIG_ERROR_IO = 3,
  KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4,
  KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5,
  KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6,
  KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7,
  KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8,
  KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9,
  KPEP_CONFIG_ERROR_DB_CORRUPT = 10,
  KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11,
  KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12,
  KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13,
  KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14,
  KPEP_CONFIG_ERROR_ERRNO = 15,
  KPEP_CONFIG_ERROR_MAX
} kpep_config_error_code;

/// Error description for kpep_config_error_code.
static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = {
    "none",
    "invalid argument",
    "out of memory",
    "I/O",
    "buffer too small",
    "current system unknown",
    "database path invalid",
    "database not found",
    "database architecture unsupported",
    "database version unsupported",
    "database corrupt",
    "event not found",
    "conflicting events",
    "all counters must be forced",
    "event unavailable",
    "check errno"};

/// Error description.
/// @param code A kpep_config_error_code value.
/// @return The matching entry of `kpep_config_error_names`, or
///         "unknown error" when `code` is outside [0, KPEP_CONFIG_ERROR_MAX).
static const char *kpep_config_error_desc(int code) {
  if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) {
    return kpep_config_error_names[code];
  }
  return "unknown error";
}

// NOTE: the `static ... (*name)(...)` declarations below are function
// pointers listed in `lib_symbols_kperfdata`; lib_init() assigns them through
// dlsym() and lib_deinit() resets them to NULL. Do not call any of them unless
// lib_init() has returned true.

/// Create a config.
/// @param db A kpep db, see kpep_db_create()
/// @param cfg_ptr A pointer to receive the new config.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr);

/// Free the config.
static void (*kpep_config_free)(kpep_config *cfg);

/// Add an event to config.
/// @param cfg The config.
/// @param ev_ptr An event pointer.
/// @param flag 0: all, 1: user space only
/// @param err Error bitmap pointer, can be NULL.
///            If return value is `CONFLICTING_EVENTS`, this bitmap contains
///            the conflicted event indices, e.g. "1 << 2" means index 2.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr,
                                    u32 flag, u32 *err);

/// Remove event at index.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx);

/// Force all counters.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_force_counters)(kpep_config *cfg);

/// Get events count.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr);

/// Get all event pointers.
/// @param buf A buffer to receive event pointers.
/// @param buf_size The buffer's size in bytes, should not be smaller than
///                 kpep_config_events_count() * sizeof(void *).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf,
                                 usize buf_size);

/// Get kpc register configs.
/// @param buf A buffer to receive kpc register configs.
/// @param buf_size The buffer's size in bytes, should not be smaller than
///                 kpep_config_kpc_count() * sizeof(kpc_config_t).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf,
                              usize buf_size);

/// Get kpc register config count.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr);

/// Get kpc classes.
/// @param classes_ptr Receives the classes, see `class mask constants` above.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr);

/// Get the index mapping from event to counter.
/// @param buf A buffer to receive indexes.
/// @param buf_size The buffer's size in bytes, should not be smaller than
///                 kpep_config_events_count() * sizeof(kpc_config_t).
///                 NOTE(review): `buf` is `usize *`, so the element size is
///                 presumably sizeof(usize) - confirm.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size);

/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/".
/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8".
///             Pass NULL for current CPU.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_create)(const char *name, kpep_db **db_ptr);

/// Free the kpep database.
static void (*kpep_db_free)(kpep_db *db);

/// Get the database's name.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_name)(kpep_db *db, const char **name);

/// Get the event alias count.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_aliases_count)(kpep_db *db, usize *count);

// NOTE: like the declaration above, everything below is a function pointer
// listed in `lib_symbols_kperfdata` and assigned by lib_init() via dlsym().

/// Get all aliases.
/// @param buf A buffer to receive all alias strings.
/// @param buf_size The buffer's size in bytes,
///        should not be smaller than kpep_db_aliases_count() * sizeof(void *).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size);

/// Get counters count for given classes.
/// @param classes 1: Fixed, 2: Configurable.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count);

/// Get all event count.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_events_count)(kpep_db *db, usize *count);

/// Get all events.
/// @param buf A buffer to receive all event pointers.
/// @param buf_size The buffer's size in bytes,
///        should not be smaller than kpep_db_events_count() * sizeof(void *).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size);

/// Get one event by name.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr);

/// Get event's name.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr);

/// Get event's alias.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr);

/// Get event's description.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr);

// -----------------------------------------------------------------------------
// load kperf/kperfdata dynamic library
// -----------------------------------------------------------------------------

/// One entry of a symbol table: the exported name to look up and the address
/// of the function-pointer variable that receives the resolved symbol.
typedef struct {
  const char *name;
  void **impl;
} lib_symbol;

/// Number of elements in a true array (not a pointer).
#define lib_nelems(x) (sizeof(x) / sizeof((x)[0]))
/// Builds a lib_symbol from an identifier: `#name` is the string handed to
/// dlsym(), `&name` is the same-named function-pointer variable. The variable
/// name must therefore match the exported symbol name exactly.
#define lib_symbol_def(name)                                                   \
  {                                                                            \
    #name, (void **)&name                                                      \
  }

/// Symbols resolved from kperf.framework.
static const lib_symbol lib_symbols_kperf[] = {
    lib_symbol_def(kpc_pmu_version),
    lib_symbol_def(kpc_cpu_string),
    lib_symbol_def(kpc_set_counting),
    lib_symbol_def(kpc_get_counting),
    lib_symbol_def(kpc_set_thread_counting),
    lib_symbol_def(kpc_get_thread_counting),
    lib_symbol_def(kpc_get_config_count),
    lib_symbol_def(kpc_get_counter_count),
    lib_symbol_def(kpc_set_config),
    lib_symbol_def(kpc_get_config),
    lib_symbol_def(kpc_get_cpu_counters),
    lib_symbol_def(kpc_get_thread_counters),
    lib_symbol_def(kpc_force_all_ctrs_set),
    lib_symbol_def(kpc_force_all_ctrs_get),
    lib_symbol_def(kperf_action_count_set),
    lib_symbol_def(kperf_action_count_get),
    lib_symbol_def(kperf_action_samplers_set),
    lib_symbol_def(kperf_action_samplers_get),
    lib_symbol_def(kperf_action_filter_set_by_task),
    lib_symbol_def(kperf_action_filter_set_by_pid),
    lib_symbol_def(kperf_timer_count_set),
    lib_symbol_def(kperf_timer_count_get),
    lib_symbol_def(kperf_timer_period_set),
    lib_symbol_def(kperf_timer_period_get),
    lib_symbol_def(kperf_timer_action_set),
    lib_symbol_def(kperf_timer_action_get),
    lib_symbol_def(kperf_sample_set),
    lib_symbol_def(kperf_sample_get),
    lib_symbol_def(kperf_reset),
    lib_symbol_def(kperf_timer_pet_set),
    lib_symbol_def(kperf_timer_pet_get),
    lib_symbol_def(kperf_ns_to_ticks),
    lib_symbol_def(kperf_ticks_to_ns),
    lib_symbol_def(kperf_tick_frequency),
};

/// Symbols resolved from kperfdata.framework.
static const lib_symbol lib_symbols_kperfdata[] = {
    lib_symbol_def(kpep_config_create),
    lib_symbol_def(kpep_config_free),
    lib_symbol_def(kpep_config_add_event),
    lib_symbol_def(kpep_config_remove_event),
    lib_symbol_def(kpep_config_force_counters),
    lib_symbol_def(kpep_config_events_count),
    lib_symbol_def(kpep_config_events),
    lib_symbol_def(kpep_config_kpc),
    lib_symbol_def(kpep_config_kpc_count),
    lib_symbol_def(kpep_config_kpc_classes),
    lib_symbol_def(kpep_config_kpc_map),
    lib_symbol_def(kpep_db_create),
    lib_symbol_def(kpep_db_free),
    lib_symbol_def(kpep_db_name),
    lib_symbol_def(kpep_db_aliases_count),
    lib_symbol_def(kpep_db_aliases),
    lib_symbol_def(kpep_db_counters_count),
    lib_symbol_def(kpep_db_events_count),
    lib_symbol_def(kpep_db_events),
    lib_symbol_def(kpep_db_event),
    lib_symbol_def(kpep_event_name),
    lib_symbol_def(kpep_event_alias),
    lib_symbol_def(kpep_event_description),
};

#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf"
#define lib_path_kperfdata                                                     \
  "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata"

// Loader state. `lib_inited` records that lib_init() has already run (whether
// it succeeded or not); `lib_has_err` records that the run failed, in which
// case `lib_err_msg` holds the reason.
static bool lib_inited = false;
static bool lib_has_err = false;
static char lib_err_msg[256];

static void *lib_handle_kperf = NULL;
static void *lib_handle_kperfdata = NULL;

/// Undo lib_init(): clear the state flags, dlclose() whichever handles are
/// open, and reset every function pointer in both symbol tables to NULL.
/// `lib_err_msg` is left untouched.
static void lib_deinit(void) {
  lib_inited = false;
  lib_has_err = false;
  if (lib_handle_kperf)
    dlclose(lib_handle_kperf);
  if (lib_handle_kperfdata)
    dlclose(lib_handle_kperfdata);
  lib_handle_kperf = NULL;
  lib_handle_kperfdata = NULL;
  for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
    const lib_symbol *symbol = &lib_symbols_kperf[i];
    *symbol->impl = NULL;
  }
  for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
    const lib_symbol *symbol = &lib_symbols_kperfdata[i];
    *symbol->impl = NULL;
  }
}

/// Load both private frameworks and resolve every symbol in the two tables.
/// The outcome is cached: later calls return the first call's result without
/// retrying, until lib_deinit() is called.
/// @return true when both libraries and all symbols were loaded; false
///         otherwise, with the reason written to `lib_err_msg`.
static bool lib_init(void) {
// Failure path: release anything loaded so far, then mark the loader as
// "initialized, with error". The flags are set AFTER lib_deinit() because
// lib_deinit() clears both of them.
#define return_err()                                                           \
  do {                                                                         \
    lib_deinit();                                                              \
    lib_inited = true;                                                         \
    lib_has_err = true;                                                        \
    return false;                                                              \
  } while (false)

  if (lib_inited)
    return !lib_has_err;

  // load dynamic library
  lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY);
  if (!lib_handle_kperf) {
    snprintf(lib_err_msg, sizeof(lib_err_msg),
             "Failed to load kperf.framework, message: %s.", dlerror());
    return_err();
  }
  lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY);
  if (!lib_handle_kperfdata) {
    snprintf(lib_err_msg, sizeof(lib_err_msg),
             "Failed to load kperfdata.framework, message: %s.", dlerror());
    return_err();
  }

  // load symbol address from dynamic library
  // A single missing symbol fails the whole initialization.
  for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
    const lib_symbol *symbol = &lib_symbols_kperf[i];
    *symbol->impl = dlsym(lib_handle_kperf, symbol->name);
    if (!*symbol->impl) {
      snprintf(lib_err_msg, sizeof(lib_err_msg),
               "Failed to load kperf function: %s.", symbol->name);
      return_err();
    }
  }
  for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
    const lib_symbol *symbol = &lib_symbols_kperfdata[i];
    *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name);
    if (!*symbol->impl) {
      snprintf(lib_err_msg, sizeof(lib_err_msg),
               "Failed to load kperfdata function: %s.", symbol->name);
      return_err();
    }
  }

  lib_inited = true;
  lib_has_err = false;
  return true;

#undef return_err
}

// -----------------------------------------------------------------------------
// kdebug private structs
// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h
// -----------------------------------------------------------------------------

/*
 * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
 * structure.
 */
#if defined(__arm64__)
typedef uint64_t kd_buf_argtype;
#else
typedef uintptr_t kd_buf_argtype;
#endif

/* One kdebug trace record. */
typedef struct {
  uint64_t timestamp;
  kd_buf_argtype arg1;
  kd_buf_argtype arg2;
  kd_buf_argtype arg3;
  kd_buf_argtype arg4;
  kd_buf_argtype arg5; /* the thread ID */
  uint32_t debugid;    /* see <sys/kdebug.h> */

/*
 * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
 * structure.
 */
#if defined(__LP64__) || defined(__arm64__)
  uint32_t cpuid; /* cpu index, from 0 */
  kd_buf_argtype unused;
#endif
} kd_buf;

/* bits for the type field of kd_regtype */
#define KDBG_CLASSTYPE 0x10000
#define KDBG_SUBCLSTYPE 0x20000
#define KDBG_RANGETYPE 0x40000
#define KDBG_TYPENONE 0x80000
#define KDBG_CKTYPES 0xF0000

/* only trace at most 4 types of events, at the code granularity */
#define KDBG_VALCHECK 0x00200000U

/* Trace filter passed to kdebug_setreg(); `type` holds the KDBG_* bits above. */
typedef struct {
  unsigned int type;
  unsigned int value1;
  unsigned int value2;
  unsigned int value3;
  unsigned int value4;
} kd_regtype;

/* Trace buffer information filled in by kdebug_get_bufinfo(). */
typedef struct {
  /* number of events that can fit in the buffers */
  int nkdbufs;
  /* set if trace is disabled */
  int nolog;
  /* kd_ctrl_page.flags */
  unsigned int flags;
  /* number of threads in thread map */
  int nkdthreads;
  /* the owning pid */
  int bufid;
} kbufinfo_t;

// -----------------------------------------------------------------------------
// kdebug utils
// -----------------------------------------------------------------------------

/// Clean up trace buffers and reset ktrace/kdebug/kperf.
/// Issues the KERN_KDREMOVE kdebug sysctl.
/// @return 0 on success.
static int kdebug_reset(void) {
  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE};
  return sysctl(mib, 3, NULL, NULL, NULL, 0);
}

/// Disable and reinitialize the trace buffers.
/// Issues the KERN_KDSETUP kdebug sysctl.
/// @return 0 on success.
static int kdebug_reinit(void) {
  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP};
  return sysctl(mib, 3, NULL, NULL, NULL, 0);
}

/// Set debug filter.
+/// @param kdr Filter description; its address and size are passed to sysctl.
+/// @return 0 on success.
+static int kdebug_setreg(kd_regtype *kdr) {
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG};
+  usize size = sizeof(kd_regtype);
+  return sysctl(mib, 3, kdr, &size, NULL, 0);
+}
+
+/// Set maximum number of trace entries (kd_buf).
+/// Only allow allocation up to half the available memory (sane_size).
+/// @param nbufs Requested number of entries, passed as the 4th MIB element.
+/// @return 0 on success.
+static int kdebug_trace_setbuf(int nbufs) {
+  int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs};
+  return sysctl(mib, 4, NULL, NULL, NULL, 0);
+}
+
+/// Enable or disable kdebug trace.
+/// Trace buffer must already be initialized.
+/// @param enable true to enable, false to disable (passed as the 4th MIB
+///               element).
+/// @return 0 on success.
+static int kdebug_trace_enable(bool enable) {
+  int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable};
+  // NULL (not 0) for the unused length pointer, like every sibling wrapper.
+  return sysctl(mib, 4, NULL, NULL, NULL, 0);
+}
+
+/// Retrieve trace buffer information from kernel.
+/// @param info Receives the buffer information; must not be NULL.
+/// @return 0 on success, -1 if `info` is NULL.
+static int kdebug_get_bufinfo(kbufinfo_t *info) {
+  if (!info)
+    return -1;
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF};
+  size_t needed = sizeof(kbufinfo_t);
+  return sysctl(mib, 3, info, &needed, NULL, 0);
+}
+
+/// Retrieve trace buffers from kernel.
+/// @param buf Memory to receive buffer data, array of `kd_buf`.
+/// @param len Length of `buf` in bytes.
+/// @param count Number of trace entries (kd_buf) obtained; may be NULL when
+///              the caller does not need it. Reset to 0 on entry.
+/// @return 0 on success, -1 if `buf` is NULL or `len` is 0, otherwise the
+///         sysctl error.
+static int kdebug_trace_read(void *buf, usize len, usize *count) {
+  if (count)
+    *count = 0;
+  if (!buf || !len)
+    return -1;
+
+  // Note: the input and output units are not the same.
+  // input: bytes
+  // output: number of kd_buf
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR};
+  int ret = sysctl(mib, 3, buf, &len, NULL, 0);
+  if (ret != 0)
+    return ret;
+  // `count` is optional (see the guard at the top), so it must be checked
+  // again here instead of being dereferenced unconditionally.
+  if (count)
+    *count = len;
+  return 0;
+}
+
+/// Block until there are new buffers filled or `timeout_ms` have passed.
+/// @param timeout_ms timeout in milliseconds; must be non-zero. 0 would mean
+///                   "wait forever", so this wrapper rejects it with -1.
+/// @param suc set true if new buffers filled (optional, may be NULL).
+/// @return 0 on success.
+static int kdebug_wait(usize timeout_ms, bool *suc) {
+  // A zero timeout is refused up front rather than forwarded to the kernel.
+  if (timeout_ms == 0)
+    return -1;
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT};
+  usize val = timeout_ms; // in: timeout; out: non-zero if buffers were filled
+  int ret = sysctl(mib, 3, NULL, &val, NULL, 0);
+  if (suc)
+    *suc = !!val;
+  return ret;
+}
+
+// -----------------------------------------------------------------------------
+// Demo
+// -----------------------------------------------------------------------------
+
+#define EVENT_NAME_MAX 8
+typedef struct {
+  const char *alias;                 /// name for print
+  const char *names[EVENT_NAME_MAX]; /// name from pmc db
+} event_alias;
+
+/// Event names from /usr/share/kpep/<name>.plist
+/// Each alias lists candidate names; get_event() uses the first one the
+/// database knows. The order of the aliases matters: get_counters() indexes
+/// counter_map by position (0 cycles, 1 instructions, 2 branches,
+/// 3 branch-misses).
+static const event_alias profile_events[] = {
+    {"cycles",
+     {
+         "FIXED_CYCLES",            // Apple A7-A15//CORE_ACTIVE_CYCLE
+         "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th
+         "CPU_CLK_UNHALTED.CORE",   // Intel Yonah, Merom
+     }},
+    {"instructions",
+     {
+         "FIXED_INSTRUCTIONS", // Apple A7-A15
+         "INST_RETIRED.ANY"    // Intel Yonah, Merom, Core 1th-10th
+     }},
+    {"branches",
+     {
+         "INST_BRANCH",                  // Apple A7-A15
+         "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th
+         // NOTE(review): same name as the "instructions" fallback above;
+         // looks like it would count all instructions on these CPUs - confirm.
+         "INST_RETIRED.ANY", // Intel Yonah, Merom
+     }},
+    {"branch-misses",
+     {
+         "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12
+         "BRANCH_MISPREDICT",      // Apple A7-A14
+         "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th
+         "BR_INST_RETIRED.MISPRED",      // Intel Yonah, Merom
+     }},
+};
+
+/// Look up the first name of `alias` that the pmc database `db` knows.
+/// @return the matching event, or NULL if none of the names is found.
+static kpep_event *get_event(kpep_db *db, const event_alias *alias) {
+  for (usize j = 0; j < EVENT_NAME_MAX; j++) {
+    const char *name = alias->names[j];
+    if (!name)
+      break; // unused slots are zero-initialized: end of the candidate list
+    kpep_event *ev = NULL;
+    if (kpep_db_event(db, name, &ev) == 0) {
+      return ev;
+    }
+  }
+  return NULL;
+}
+
+// State shared between setup_performance_counters() and get_counters().
+kpc_config_t regs[KPC_MAX_COUNTERS] = {0};
+usize counter_map[KPC_MAX_COUNTERS] = {0};
+u64 counters_0[KPC_MAX_COUNTERS] = {0};
+u64 counters_1[KPC_MAX_COUNTERS] = {0};
+const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]);
+
+
+/// Load the kperf libraries, configure the events listed in profile_events and
+/// start counting. The work is done once; later calls return the cached
+/// outcome of the first call.
+/// @return true if the counters are configured and counting.
+bool setup_performance_counters() {
+  static bool init = false;
+  static bool worked = false;
+
+  if (init) {
+    return worked;
+  }
+  init = true;
+
+  // load dylib
+  if (!lib_init()) {
+    printf("Error: %s\n", lib_err_msg);
+    return (worked = false);
+  }
+
+  // check permission
+  int force_ctrs = 0;
+  if (kpc_force_all_ctrs_get(&force_ctrs)) {
+    //printf("Permission denied, xnu/kpc requires root privileges.\n");
+    return (worked = false);
+  }
+  int ret;
+  // load pmc db
+  // NOTE(review): `db` and `cfg` below are never released on any path in this
+  // function - confirm whether that is intentional for a process-lifetime
+  // setup.
+  kpep_db *db = NULL;
+  if ((ret = kpep_db_create(NULL, &db))) {
+    printf("Error: cannot load pmc database: %d.\n", ret);
+    return (worked = false);
+  }
+  printf("loaded db: %s (%s)\n", db->name, db->marketing_name);
+
+  // create a config
+  kpep_config *cfg = NULL;
+  if ((ret = kpep_config_create(db, &cfg))) {
+    printf("Failed to create kpep config: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_force_counters(cfg))) {
+    printf("Failed to force counters: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+
+  // get events
+  kpep_event *ev_arr[ev_count] = {0};
+  for (usize i = 0; i < ev_count; i++) {
+    const event_alias *alias = profile_events + i;
+    ev_arr[i] = get_event(db, alias);
+    if (!ev_arr[i]) {
+      printf("Cannot find event: %s.\n", alias->alias);
+      return (worked = false);
+    }
+  }
+
+  // add event to config
+  for (usize i = 0; i < ev_count; i++) {
+    kpep_event *ev = ev_arr[i];
+    if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) {
+      printf("Failed to add event: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+  }
+
+  // prepare buffer and config
+  u32 classes = 0;
+  usize reg_count = 0;
+  if ((ret = kpep_config_kpc_classes(cfg, &classes))) {
+    printf("Failed get kpc classes: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_kpc_count(cfg, &reg_count))) {
+    printf("Failed get kpc count: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) {
+    printf("Failed get kpc map: %d (%s).\n", ret, kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) {
+    printf("Failed get kpc registers: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+
+  // set config to kernel
+  if ((ret = kpc_force_all_ctrs_set(1))) {
+    printf("Failed force all ctrs: %d.\n", ret);
+    return (worked = false);
+  }
+  if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) {
+    if ((ret = kpc_set_config(classes, regs))) {
+      printf("Failed set kpc config: %d.\n", ret);
+      return (worked = false);
+    }
+  }
+
+  // start counting
+  if ((ret = kpc_set_counting(classes))) {
+    printf("Failed set counting: %d.\n", ret);
+    return (worked = false);
+  }
+  if ((ret = kpc_set_thread_counting(classes))) {
+    printf("Failed set thread counting: %d.\n", ret);
+    return (worked = false);
+  }
+
+  return (worked = true);
+}
+
+/// Read the current thread counters into counters_0 and return a snapshot.
+/// On failure a warning is printed once and a performance_counters built from
+/// the value 1 is returned.
+inline performance_counters get_counters() {
+  static bool warned = false;
+  int ret;
+  // get counters before
+  if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) {
+    if (!warned) {
+
+      printf("Failed get thread counters before: %d.\n", ret);
+      warned = true;
+    }
+    return 1;
+  }
+  /*printf("counters value:\n");
+  for (usize i = 0; i < ev_count; i++) {
+      const event_alias *alias = profile_events + i;
+      usize idx = counter_map[i];
+      u64 val = counters_1[idx] - counters_0[idx];
+      printf("%14s: %llu\n", alias->alias, val);
+  }*/
+  // Arguments are passed in profile_events positions 0, 2, 3, 1
+  // (cycles, branches, branch-misses, instructions).
+  // NOTE(review): presumably this matches the performance_counters
+  // constructor order - confirm against its definition.
+  return performance_counters{
+      counters_0[counter_map[0]], counters_0[counter_map[2]],
+      counters_0[counter_map[3]],
+      counters_0[counter_map[1]]};
+}
+
+#endif
diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp
new file mode 100644
index 00000000..c6b091f1
--- /dev/null
+++ b/benchmarks/benchmark.cpp
@@ -0,0 +1,247 @@
+#if defined(__linux__) || (__APPLE__ && __aarch64__)
+#define USING_COUNTERS
+#include "event_counter.h"
+#endif
+// event_collector is used on every platform (see `collector` and main()), so
+// the header must also be visible when USING_COUNTERS is not defined. Its
+// include guard makes this a no-op when it was already included above.
+#include "event_counter.h"
+#include <algorithm>
+#include "fast_float/fast_float.h"
+#include <cfloat>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+
+/// Parse every string of `s` as a double and return the largest value seen
+/// (never less than 0, the starting value).
+/// @throws std::runtime_error if a string cannot be parsed at all.
+template <typename CharT>
+double findmax_fastfloat64(std::vector<std::basic_string<CharT>> &s) {
+  double answer = 0;
+  double x = 0;
+  for (auto &st : s) {
+    auto [p, ec] = fast_float::from_chars(st.data(), st.data() + st.size(), x);
+    if (p == st.data()) {
+      throw std::runtime_error("bug in findmax_fastfloat");
+    }
+    answer = answer > x ? answer : x;
+  }
+  return answer;
+}
+
+/// Same as findmax_fastfloat64, but parses into float.
+/// @throws std::runtime_error if a string cannot be parsed at all.
+template <typename CharT>
+double findmax_fastfloat32(std::vector<std::basic_string<CharT>> &s) {
+  float answer = 0;
+  float x = 0;
+  for (auto &st : s) {
+    auto [p, ec] = fast_float::from_chars(st.data(), st.data() + st.size(), x);
+    if (p == st.data()) {
+      throw std::runtime_error("bug in findmax_fastfloat");
+    }
+    answer = answer > x ? answer : x;
+  }
+  return answer;
+}
+
+event_collector collector{};
+
+#ifdef USING_COUNTERS
+/// Run `function(lines)` `repeat` times and return one event_count per run.
+template <class T, class CharT>
+std::vector<event_count>
+time_it_ns(std::vector<std::basic_string<CharT>> &lines, T const &function,
+           size_t repeat) {
+  std::vector<event_count> aggregate;
+  aggregate.reserve(repeat);
+  bool printed_bug = false;
+  for (size_t i = 0; i < repeat; i++) {
+    collector.start();
+    double ts = function(lines);
+    if (ts == 0 && !printed_bug) {
+      printf("bug\n");
+      printed_bug = true;
+    }
+    aggregate.push_back(collector.end());
+  }
+  return aggregate;
+}
+
+/// Print one result line: throughput from the fastest run, spread between the
+/// fastest and the average run and, when instruction counts are available,
+/// per-byte and per-float hardware counter figures.
+/// @param volume           input size in bytes
+/// @param number_of_floats number of parsed values per run
+/// @param name             label printed at the start of the line
+/// @param events           one measurement per run
+void pretty_print(double volume, size_t number_of_floats, std::string name,
+                  std::vector<event_count> events) {
+  double volumeMB = volume / (1024. * 1024.);
+  double average_ns{0};
+  double min_ns{DBL_MAX};
+  double cycles_min{DBL_MAX};
+  double instructions_min{DBL_MAX};
+  double cycles_avg{0};
+  double instructions_avg{0};
+  // Minima must start at the largest value; starting at 0 pins them to 0.
+  double branches_min{DBL_MAX};
+  double branches_avg{0};
+  double branch_misses_min{DBL_MAX};
+  double branch_misses_avg{0};
+  for (const event_count &e : events) {
+    double ns = e.elapsed_ns();
+    average_ns += ns;
+    min_ns = min_ns < ns ? min_ns : ns;
+
+    double cycles = e.cycles();
+    cycles_avg += cycles;
+    cycles_min = cycles_min < cycles ? cycles_min : cycles;
+
+    double instructions = e.instructions();
+    instructions_avg += instructions;
+    instructions_min =
+        instructions_min < instructions ? instructions_min : instructions;
+
+    double branches = e.branches();
+    branches_avg += branches;
+    branches_min = branches_min < branches ? branches_min : branches;
+
+    double branch_misses = e.missed_branches();
+    branch_misses_avg += branch_misses;
+    branch_misses_min =
+        branch_misses_min < branch_misses ? branch_misses_min : branch_misses;
+  }
+  cycles_avg /= events.size();
+  instructions_avg /= events.size();
+  average_ns /= events.size();
+  branches_avg /= events.size();
+  // Without this division the bm/f column reports the sum over all runs.
+  branch_misses_avg /= events.size();
+  printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
+         volumeMB * 1000000000 / min_ns,
+         (average_ns - min_ns) * 100.0 / average_ns);
+  printf("%8.2f Mfloat/s ", number_of_floats * 1000 / min_ns);
+  if (instructions_min > 0) {
+    printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", instructions_min / volume,
+           instructions_min / number_of_floats,
+           (instructions_avg - instructions_min) * 100.0 / instructions_avg);
+
+    printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", cycles_min / volume,
+           cycles_min / number_of_floats,
+           (cycles_avg - cycles_min) * 100.0 / cycles_avg);
+    printf(" %8.2f i/c ", instructions_min / cycles_min);
+    printf(" %8.2f b/f ", branches_avg / number_of_floats);
+    printf(" %8.2f bm/f ", branch_misses_avg / number_of_floats);
+    printf(" %8.2f GHz ", cycles_min / min_ns);
+  }
+  printf("\n");
+}
+#else
+/// Run `function(lines)` `repeat` times.
+/// @return {fastest run, average run}, both in nanoseconds.
+template <class T, class CharT>
+std::pair<double, double>
+time_it_ns(std::vector<std::basic_string<CharT>> &lines, T const &function,
+           size_t repeat) {
+  std::chrono::high_resolution_clock::time_point t1, t2;
+  double average = 0;
+  double min_value = DBL_MAX;
+  bool printed_bug = false;
+  for (size_t i = 0; i < repeat; i++) {
+    t1 = std::chrono::high_resolution_clock::now();
+    double ts = function(lines);
+    if (ts == 0 && !printed_bug) {
+      printf("bug\n");
+      printed_bug = true;
+    }
+    t2 = std::chrono::high_resolution_clock::now();
+    double dif =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
+    average += dif;
+    min_value = min_value < dif ? min_value : dif;
+  }
+  average /= repeat;
+  return std::make_pair(min_value, average);
+}
+
+/// Print one result line from a {fastest, average} nanosecond pair.
+void pretty_print(double volume, size_t number_of_floats, std::string name,
+                  std::pair<double, double> result) {
+  double volumeMB = volume / (1024. * 1024.);
+  printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
+         volumeMB * 1000000000 / result.first,
+         (result.second - result.first) * 100.0 / result.second);
+  printf("%8.2f Mfloat/s ", number_of_floats * 1000 / result.first);
+  printf(" %8.2f ns/f \n", double(result.first) / number_of_floats);
+}
+#endif
+
+
+// this is okay, all chars are ASCII
+inline std::u16string widen(const std::string &line) {
+  std::u16string u16line;
+  u16line.resize(line.size());
+  for (size_t i = 0; i < line.size(); ++i) {
+    u16line[i] = char16_t(line[i]);
+  }
+  return u16line;
+}
+
+/// Widen every line to UTF-16 (one code unit per input char).
+std::vector<std::u16string> widen(const std::vector<std::string> &lines) {
+  std::vector<std::u16string> u16lines;
+  u16lines.reserve(lines.size());
+  for (auto const &line : lines) {
+    u16lines.push_back(widen(line));
+  }
+  return u16lines;
+}
+
+
+/// Benchmark the 64-bit and 32-bit parsers on `lines`, first as ASCII and
+/// then widened to UTF-16 (which doubles the reported volume).
+void process(std::vector<std::string> &lines, size_t volume) {
+  size_t repeat = 100;
+  double volumeMB = volume / (1024. * 1024.);
+  std::cout << "ASCII volume = " << volumeMB << " MB " << std::endl;
+  pretty_print(volume, lines.size(), "fastfloat (64)",
+               time_it_ns(lines, findmax_fastfloat64<char>, repeat));
+  pretty_print(volume, lines.size(), "fastfloat (32)",
+               time_it_ns(lines, findmax_fastfloat32<char>, repeat));
+
+  std::vector<std::u16string> lines16 = widen(lines);
+  volume = 2 * volume;
+  volumeMB = volume / (1024. * 1024.);
+  std::cout << "UTF-16 volume = " << volumeMB << " MB " << std::endl;
+  pretty_print(volume, lines.size(), "fastfloat (64)",
+               time_it_ns(lines16, findmax_fastfloat64<char16_t>, repeat));
+  pretty_print(volume, lines.size(), "fastfloat (32)",
+               time_it_ns(lines16, findmax_fastfloat32<char16_t>, repeat));
+
+}
+
+/// Read `filename` line by line and benchmark the parsers on its content.
+/// Prints a message and returns if the file cannot be opened.
+void fileload(std::string filename) {
+  std::ifstream inputfile(filename);
+  if (!inputfile) {
+    std::cerr << "can't open " << filename << std::endl;
+    return;
+  }
+  std::cout << "#### " << std::endl;
+  std::cout << "# reading " << filename << std::endl;
+  std::cout << "#### " << std::endl;
+  std::string line;
+  std::vector<std::string> lines;
+  lines.reserve(10000); // let us reserve plenty of memory.
+  size_t volume = 0;
+  while (getline(inputfile, line)) {
+    volume += line.size();
+    lines.push_back(line);
+  }
+  std::cout << "# read " << lines.size() << " lines " << std::endl;
+  process(lines, volume);
+}
+
+
+int main() {
+  if (collector.has_events()) {
+    std::cout << "# Using hardware counters" << std::endl;
+  } else {
+#if defined(__linux__) || (__APPLE__ && __aarch64__)
+    std::cout << "# Hardware counters not available, try to run in privileged mode (e.g., sudo)." << std::endl;
+#endif
+  }
+  fileload(std::string(BENCHMARK_DATA_DIR) + "/canada.txt");
+  fileload(std::string(BENCHMARK_DATA_DIR) + "/mesh.txt");
+}
diff --git a/benchmarks/event_counter.h b/benchmarks/event_counter.h
new file mode 100644
index 00000000..fb6db3a6
--- /dev/null
+++ b/benchmarks/event_counter.h
@@ -0,0 +1,152 @@
+#ifndef __EVENT_COUNTER_H
+#define __EVENT_COUNTER_H
+
+#include <cctype>
+#ifndef _MSC_VER
+#include <dirent.h>
+#endif
+#include <cinttypes>
+
+#include <cstring>
+
+#include <chrono>
+#include <vector>
+
+#include "linux-perf-events.h"
+#ifdef __linux__
+#include <libgen.h>
+#endif
+
+#if __APPLE__ && __aarch64__
+#include "apple_arm_events.h"
+#endif
+
+/// One measurement: elapsed wall-clock time plus raw counter values.
+struct event_count {
+  std::chrono::duration<double> elapsed;
+  // Five slots; the getters below only read slots 0-3.
+  std::vector<unsigned long long> event_counts;
+  event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {}
+  event_count(const std::chrono::duration<double> _elapsed,
+              const std::vector<unsigned long long> _event_counts)
+      : elapsed(_elapsed), event_counts(_event_counts) {}
+  event_count(const event_count &other) = default;
+
+  // The types of counters (so we can read the getter more easily)
+  enum event_counter_types {
+    CPU_CYCLES = 0,
+    INSTRUCTIONS = 1,
+    BRANCHES = 2,
+    MISSED_BRANCHES = 3
+  };
+
+  double elapsed_sec() const {
+    return std::chrono::duration<double>(elapsed).count();
+  }
+  double elapsed_ns() const {
+    return std::chrono::duration<double, std::nano>(elapsed).count();
+  }
+  double cycles() const {
+    return static_cast<double>(event_counts[CPU_CYCLES]);
+  }
+  double instructions() const {
+    return static_cast<double>(event_counts[INSTRUCTIONS]);
+  }
+  double branches() const {
+    return static_cast<double>(event_counts[BRANCHES]);
+  }
+  double missed_branches() const {
+    return static_cast<double>(event_counts[MISSED_BRANCHES]);
+  }
+
+  event_count &operator=(const event_count &other) = default;
+  /// Element-wise sum of the elapsed time and of all five counter slots.
+  event_count operator+(const event_count &other) const {
+    return event_count(elapsed + other.elapsed,
+                       {
+                           event_counts[0] + other.event_counts[0],
+                           event_counts[1] + other.event_counts[1],
+                           event_counts[2] + other.event_counts[2],
+                           event_counts[3] + other.event_counts[3],
+                           event_counts[4] + other.event_counts[4],
+                       });
+  }
+
+  void operator+=(const event_count &other) { *this = *this + other; }
+};
+
+/// Running total, best (shortest) and worst (longest) of a series of
+/// measurements; the getters return per-iteration averages.
+struct event_aggregate {
+  bool has_events = false;
+  int iterations = 0;
+  event_count total{};
+  event_count best{};
+  event_count worst{};
+
+  event_aggregate() = default;
+
+  /// Add one measurement to the aggregate.
+  void operator<<(const event_count &other) {
+    if (iterations == 0 || other.elapsed < best.elapsed) {
+      best = other;
+    }
+    if (iterations == 0 || other.elapsed > worst.elapsed) {
+      worst = other;
+    }
+    iterations++;
+    total += other;
+  }
+
+  double elapsed_sec() const { return total.elapsed_sec() / iterations; }
+  double elapsed_ns() const { return total.elapsed_ns() / iterations; }
+  double cycles() const { return total.cycles() / iterations; }
+  double instructions() const { return total.instructions() / iterations; }
+  double branches() const { return total.branches() / iterations; }
+  double missed_branches() const {
+    return total.missed_branches() / iterations;
+  }
+};
+
+/// Measures the interval between start() and end(): wall-clock time always,
+/// hardware counters on Linux and on Apple aarch64.
+struct event_collector {
+  event_count count{};
+  std::chrono::time_point<std::chrono::steady_clock> start_clock{};
+
+#if defined(__linux__)
+  LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
+  event_collector()
+      : linux_events(std::vector<int>{
+            PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS,
+            PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions
+            PERF_COUNT_HW_BRANCH_MISSES}) {}
+  bool has_events() { return linux_events.is_working(); }
+#elif __APPLE__ && __aarch64__
+  performance_counters diff;
+  event_collector() : diff(0) { setup_performance_counters(); }
+  bool has_events() { return setup_performance_counters(); }
+#else
+  event_collector() {}
+  bool has_events() { return false; }
+#endif
+
+  inline void start() {
+    // Same macro spelling as the member declarations above (__linux__).
+#if defined(__linux__)
+    linux_events.start();
+#elif __APPLE__ && __aarch64__
+    if (has_events()) {
+      diff = get_counters();
+    }
+#endif
+    start_clock = std::chrono::steady_clock::now();
+  }
+  inline event_count &end() {
+    const auto end_clock = std::chrono::steady_clock::now();
+#if defined(__linux__)
+    linux_events.end(count.event_counts);
+#elif __APPLE__ && __aarch64__
+    if (has_events()) {
+      performance_counters end = get_counters();
+      diff = end - diff;
+    }
+    count.event_counts[0] = diff.cycles;
+    count.event_counts[1] = diff.instructions;
+    count.event_counts[2] = diff.branches;
+    count.event_counts[3] = diff.missed_branches;
+    count.event_counts[4] = 0;
+#endif
+    count.elapsed = end_clock - start_clock;
+    return count;
+  }
+};
+
+#endif
diff --git a/benchmarks/linux-perf-events.h b/benchmarks/linux-perf-events.h
new file mode 100644
index 00000000..73cfbaf8
--- /dev/null
+++ b/benchmarks/linux-perf-events.h
@@ -0,0 +1,103 @@
+#pragma once
+#ifdef __linux__
+
+#include <asm/unistd.h>       // for __NR_perf_event_open
+#include <linux/perf_event.h> // for perf event constants
+#include <sys/ioctl.h>        // for ioctl
+#include <unistd.h>           // for syscall
+
+#include <cerrno>  // for errno
+#include <cstring> // for memset
+#include <stdexcept>
+
+#include <iostream>
+#include <vector>
+
+/// A group of perf events of one type, opened for the current process.
+/// Errors never throw; they clear the flag reported by is_working().
+template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
+  int fd; // descriptor of the first event that opened; -1 when none did
+  bool working;
+  perf_event_attr attribs{};
+  size_t num_events{};
+  std::vector<uint64_t> temp_result_vec{};
+  std::vector<uint64_t> ids{};
+  std::vector<int> fds{}; // every descriptor opened, so each can be closed
+
+public:
+  /// Open one event per entry of `config_vec`.
+  explicit LinuxEvents(std::vector<int> config_vec) : fd(-1), working(true) {
+    memset(&attribs, 0, sizeof(attribs));
+    attribs.type = TYPE;
+    attribs.size = sizeof(attribs);
+    attribs.disabled = 1;
+    attribs.exclude_kernel = 1;
+    attribs.exclude_hv = 1;
+
+    attribs.sample_period = 0;
+    attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+    const int pid = 0;  // the current process
+    const int cpu = -1; // all CPUs
+    const unsigned long flags = 0;
+
+    int group = -1; // no group
+    num_events = config_vec.size();
+    ids.resize(config_vec.size());
+    uint32_t i = 0;
+    for (auto config : config_vec) {
+      attribs.config = config;
+      int _fd = static_cast<int>(
+          syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
+      if (_fd == -1) {
+        report_error("perf_event_open");
+        i++; // keep ids aligned with config_vec; this slot stays 0
+        continue;
+      }
+      fds.push_back(_fd);
+      ioctl(_fd, PERF_EVENT_IOC_ID, &ids[i++]);
+      if (group == -1) {
+        group = _fd;
+        fd = _fd;
+      }
+    }
+
+    temp_result_vec.resize(num_events * 2 + 1);
+  }
+
+  // The object owns file descriptors; a copy would close them twice.
+  LinuxEvents(const LinuxEvents &) = delete;
+  LinuxEvents &operator=(const LinuxEvents &) = delete;
+
+  ~LinuxEvents() {
+    for (int opened : fds) {
+      close(opened);
+    }
+  }
+
+  /// Reset and enable the whole group.
+  inline void start() {
+    if (fd != -1) {
+      if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_RESET)");
+      }
+
+      if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
+      }
+    }
+  }
+
+  /// Disable the group and copy one value per event into `results`, which
+  /// must have at least as many elements as there are events.
+  inline void end(std::vector<unsigned long long> &results) {
+    if (fd != -1) {
+      if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
+      }
+
+      if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) {
+        report_error("read");
+      }
+    }
+    // our actual results are in slots 1,3,5, ... of this structure
+    for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
+      results[i / 2] = temp_result_vec[i];
+    }
+    // slots 2,4,6, ... are compared with the ids recorded when opening
+    for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) {
+      if (ids[i / 2 - 1] != temp_result_vec[i]) {
+        report_error("event mismatch");
+      }
+    }
+  }
+
+  bool is_working() {
+    return working;
+  }
+
+private:
+  void report_error(const std::string &) {
+    working = false;
+  }
+};
+#endif
\ No newline at end of file
From 8832c532b8aa11aad6caa91be011dd1a419bc552 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Mon, 25 Nov 2024 11:59:48 -0500
Subject: [PATCH 2/2] lint

---
 benchmarks/apple_arm_events.h  |  28 ++++-----
 benchmarks/benchmark.cpp       |  98 ++++++++++++++----------------
 benchmarks/event_counter.h     | 107 ++++++++++++++++++---------------
 benchmarks/linux-perf-events.h |  19 +++---
 4 files changed, 125 insertions(+), 127 deletions(-)

diff --git a/benchmarks/apple_arm_events.h b/benchmarks/apple_arm_events.h
index 3a940811..cd631766 100644
--- a/benchmarks/apple_arm_events.h
+++ b/benchmarks/apple_arm_events.h
@@ -101,8 +101,6 @@ inline performance_counters operator-(const performance_counters &a,
                               a.instructions - b.instructions);
 }
 
-
-
 typedef float f32;
 typedef double f64;
 typedef
int8_t i8; @@ -616,9 +614,7 @@ typedef struct { #define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) #define lib_symbol_def(name) \ - { \ -#name, (void **)&name \ - } + { #name, (void **)&name } static const lib_symbol lib_symbols_kperf[] = { lib_symbol_def(kpc_pmu_version), @@ -933,7 +929,7 @@ typedef struct { static const event_alias profile_events[] = { {"cycles", { - "FIXED_CYCLES", // Apple A7-A15//CORE_ACTIVE_CYCLE + "FIXED_CYCLES", // Apple A7-A15//CORE_ACTIVE_CYCLE "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom }}, @@ -976,7 +972,6 @@ u64 counters_0[KPC_MAX_COUNTERS] = {0}; u64 counters_1[KPC_MAX_COUNTERS] = {0}; const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]); - bool setup_performance_counters() { static bool init = false; static bool worked = false; @@ -995,7 +990,7 @@ bool setup_performance_counters() { // check permission int force_ctrs = 0; if (kpc_force_all_ctrs_get(&force_ctrs)) { - //printf("Permission denied, xnu/kpc requires root privileges.\n"); + // printf("Permission denied, xnu/kpc requires root privileges.\n"); return (worked = false); } int ret; @@ -1101,17 +1096,16 @@ inline performance_counters get_counters() { } return 1; } - /*printf("counters value:\n"); - for (usize i = 0; i < ev_count; i++) { - const event_alias *alias = profile_events + i; - usize idx = counter_map[i]; - u64 val = counters_1[idx] - counters_0[idx]; - printf("%14s: %llu\n", alias->alias, val); - }*/ + /*printf("counters value:\n"); + for (usize i = 0; i < ev_count; i++) { + const event_alias *alias = profile_events + i; + usize idx = counter_map[i]; + u64 val = counters_1[idx] - counters_0[idx]; + printf("%14s: %llu\n", alias->alias, val); + }*/ return performance_counters{ counters_0[counter_map[0]], counters_0[counter_map[2]], - counters_0[counter_map[3]], - counters_0[counter_map[1]]}; + counters_0[counter_map[3]], counters_0[counter_map[1]]}; } #endif diff --git a/benchmarks/benchmark.cpp 
b/benchmarks/benchmark.cpp index c6b091f1..993411eb 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -1,4 +1,4 @@ -#if defined(__linux__) || (__APPLE__ && __aarch64__) +#if defined(__linux__) || (__APPLE__ && __aarch64__) #define USING_COUNTERS #include "event_counter.h" #endif @@ -22,7 +22,6 @@ #include #include - template double findmax_fastfloat64(std::vector> &s) { double answer = 0; @@ -55,8 +54,9 @@ event_collector collector{}; #ifdef USING_COUNTERS template -std::vector time_it_ns(std::vector> &lines, - T const &function, size_t repeat) { +std::vector +time_it_ns(std::vector> &lines, T const &function, + size_t repeat) { std::vector aggregate; bool printed_bug = false; for (size_t i = 0; i < repeat; i++) { @@ -71,7 +71,8 @@ std::vector time_it_ns(std::vector> &lines return aggregate; } -void pretty_print(double volume, size_t number_of_floats, std::string name, std::vector events) { +void pretty_print(double volume, size_t number_of_floats, std::string name, + std::vector events) { double volumeMB = volume / (1024. * 1024.); double average_ns{0}; double min_ns{DBL_MAX}; @@ -83,7 +84,7 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, std: double branches_avg{0}; double branch_misses_min{0}; double branch_misses_avg{0}; - for(event_count e : events) { + for (event_count e : events) { double ns = e.elapsed_ns(); average_ns += ns; min_ns = min_ns < ns ? min_ns : ns; @@ -94,7 +95,8 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, std: double instructions = e.instructions(); instructions_avg += instructions; - instructions_min = instructions_min < instructions ? instructions_min : instructions; + instructions_min = + instructions_min < instructions ? 
instructions_min : instructions; double branches = e.branches(); branches_avg += branches; @@ -102,43 +104,37 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, std: double branch_misses = e.missed_branches(); branch_misses_avg += branch_misses; - branch_misses_min = branch_misses_min < branch_misses ? branch_misses_min : branch_misses; + branch_misses_min = + branch_misses_min < branch_misses ? branch_misses_min : branch_misses; } cycles_avg /= events.size(); instructions_avg /= events.size(); average_ns /= events.size(); branches_avg /= events.size(); printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(), - volumeMB * 1000000000 / min_ns, - (average_ns - min_ns) * 100.0 / average_ns); - printf("%8.2f Mfloat/s ", - number_of_floats * 1000 / min_ns); - if(instructions_min > 0) { - printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", - instructions_min / volume, - instructions_min / number_of_floats, + volumeMB * 1000000000 / min_ns, + (average_ns - min_ns) * 100.0 / average_ns); + printf("%8.2f Mfloat/s ", number_of_floats * 1000 / min_ns); + if (instructions_min > 0) { + printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", instructions_min / volume, + instructions_min / number_of_floats, (instructions_avg - instructions_min) * 100.0 / instructions_avg); - printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", - cycles_min / volume, - cycles_min / number_of_floats, + printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", cycles_min / volume, + cycles_min / number_of_floats, (cycles_avg - cycles_min) * 100.0 / cycles_avg); - printf(" %8.2f i/c ", - instructions_min /cycles_min); - printf(" %8.2f b/f ", - branches_avg /number_of_floats); - printf(" %8.2f bm/f ", - branch_misses_avg /number_of_floats); - printf(" %8.2f GHz ", - cycles_min / min_ns); + printf(" %8.2f i/c ", instructions_min / cycles_min); + printf(" %8.2f b/f ", branches_avg / number_of_floats); + printf(" %8.2f bm/f ", branch_misses_avg / number_of_floats); + printf(" %8.2f GHz ", cycles_min / min_ns); } 
printf("\n"); - } #else template -std::pair time_it_ns(std::vector> &lines, - T const &function, size_t repeat) { +std::pair +time_it_ns(std::vector> &lines, T const &function, + size_t repeat) { std::chrono::high_resolution_clock::time_point t1, t2; double average = 0; double min_value = DBL_MAX; @@ -160,21 +156,16 @@ std::pair time_it_ns(std::vector> &line return std::make_pair(min_value, average); } - - - -void pretty_print(double volume, size_t number_of_floats, std::string name, std::pair result) { +void pretty_print(double volume, size_t number_of_floats, std::string name, + std::pair result) { double volumeMB = volume / (1024. * 1024.); printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(), - volumeMB * 1000000000 / result.first, - (result.second - result.first) * 100.0 / result.second); - printf("%8.2f Mfloat/s ", - number_of_floats * 1000 / result.first); - printf(" %8.2f ns/f \n", - double(result.first) /number_of_floats ); + volumeMB * 1000000000 / result.first, + (result.second - result.first) * 100.0 / result.second); + printf("%8.2f Mfloat/s ", number_of_floats * 1000 / result.first); + printf(" %8.2f ns/f \n", double(result.first) / number_of_floats); } -#endif - +#endif // this is okay, all chars are ASCII inline std::u16string widen(std::string line) { @@ -195,21 +186,23 @@ std::vector widen(const std::vector &lines) { return u16lines; } - void process(std::vector &lines, size_t volume) { size_t repeat = 100; double volumeMB = volume / (1024. 
* 1024.); std::cout << "ASCII volume = " << volumeMB << " MB " << std::endl; - pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines, findmax_fastfloat64, repeat)); - pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines, findmax_fastfloat32, repeat)); + pretty_print(volume, lines.size(), "fastfloat (64)", + time_it_ns(lines, findmax_fastfloat64, repeat)); + pretty_print(volume, lines.size(), "fastfloat (32)", + time_it_ns(lines, findmax_fastfloat32, repeat)); std::vector lines16 = widen(lines); volume = 2 * volume; volumeMB = volume / (1024. * 1024.); std::cout << "UTF-16 volume = " << volumeMB << " MB " << std::endl; - pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines16, findmax_fastfloat64, repeat)); - pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines16, findmax_fastfloat32, repeat)); - + pretty_print(volume, lines.size(), "fastfloat (64)", + time_it_ns(lines16, findmax_fastfloat64, repeat)); + pretty_print(volume, lines.size(), "fastfloat (32)", + time_it_ns(lines16, findmax_fastfloat32, repeat)); } void fileload(std::string filename) { @@ -233,13 +226,14 @@ void fileload(std::string filename) { process(lines, volume); } - int main(int argc, char **argv) { - if(collector.has_events()) { + if (collector.has_events()) { std::cout << "# Using hardware counters" << std::endl; } else { -#if defined(__linux__) || (__APPLE__ && __aarch64__) - std::cout << "# Hardware counters not available, try to run in privileged mode (e.g., sudo)." << std::endl; +#if defined(__linux__) || (__APPLE__ && __aarch64__) + std::cout << "# Hardware counters not available, try to run in privileged " + "mode (e.g., sudo)." 
+ << std::endl; #endif } fileload(std::string(BENCHMARK_DATA_DIR) + "/canada.txt"); diff --git a/benchmarks/event_counter.h b/benchmarks/event_counter.h index fb6db3a6..3b7bb691 100644 --- a/benchmarks/event_counter.h +++ b/benchmarks/event_counter.h @@ -17,16 +17,19 @@ #include #endif -#if __APPLE__ && __aarch64__ +#if __APPLE__ && __aarch64__ #include "apple_arm_events.h" #endif struct event_count { std::chrono::duration elapsed; std::vector event_counts; - event_count() : elapsed(0), event_counts{0,0,0,0,0} {} - event_count(const std::chrono::duration _elapsed, const std::vector _event_counts) : elapsed(_elapsed), event_counts(_event_counts) {} - event_count(const event_count& other): elapsed(other.elapsed), event_counts(other.event_counts) { } + event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {} + event_count(const std::chrono::duration _elapsed, + const std::vector _event_counts) + : elapsed(_elapsed), event_counts(_event_counts) {} + event_count(const event_count &other) + : elapsed(other.elapsed), event_counts(other.event_counts) {} // The types of counters (so we can read the getter more easily) enum event_counter_types { @@ -36,31 +39,42 @@ struct event_count { MISSED_BRANCHES = 3 }; - double elapsed_sec() const { return std::chrono::duration(elapsed).count(); } - double elapsed_ns() const { return std::chrono::duration(elapsed).count(); } - double cycles() const { return static_cast(event_counts[CPU_CYCLES]); } - double instructions() const { return static_cast(event_counts[INSTRUCTIONS]); } - double branches() const { return static_cast(event_counts[BRANCHES]); } - double missed_branches() const { return static_cast(event_counts[MISSED_BRANCHES]); } + double elapsed_sec() const { + return std::chrono::duration(elapsed).count(); + } + double elapsed_ns() const { + return std::chrono::duration(elapsed).count(); + } + double cycles() const { + return static_cast(event_counts[CPU_CYCLES]); + } + double instructions() const { + return 
static_cast(event_counts[INSTRUCTIONS]); + } + double branches() const { + return static_cast(event_counts[BRANCHES]); + } + double missed_branches() const { + return static_cast(event_counts[MISSED_BRANCHES]); + } - event_count& operator=(const event_count& other) { + event_count &operator=(const event_count &other) { this->elapsed = other.elapsed; this->event_counts = other.event_counts; return *this; } - event_count operator+(const event_count& other) const { - return event_count(elapsed+other.elapsed, { - event_counts[0]+other.event_counts[0], - event_counts[1]+other.event_counts[1], - event_counts[2]+other.event_counts[2], - event_counts[3]+other.event_counts[3], - event_counts[4]+other.event_counts[4], - }); + event_count operator+(const event_count &other) const { + return event_count(elapsed + other.elapsed, + { + event_counts[0] + other.event_counts[0], + event_counts[1] + other.event_counts[1], + event_counts[2] + other.event_counts[2], + event_counts[3] + other.event_counts[3], + event_counts[4] + other.event_counts[4], + }); } - void operator+=(const event_count& other) { - *this = *this + other; - } + void operator+=(const event_count &other) { *this = *this + other; } }; struct event_aggregate { @@ -72,7 +86,7 @@ struct event_aggregate { event_aggregate() = default; - void operator<<(const event_count& other) { + void operator<<(const event_count &other) { if (iterations == 0 || other.elapsed < best.elapsed) { best = other; } @@ -88,53 +102,48 @@ struct event_aggregate { double cycles() const { return total.cycles() / iterations; } double instructions() const { return total.instructions() / iterations; } double branches() const { return total.branches() / iterations; } - double missed_branches() const { return total.missed_branches() / iterations; } + double missed_branches() const { + return total.missed_branches() / iterations; + } }; struct event_collector { event_count count{}; std::chrono::time_point start_clock{}; -#if defined(__linux__) +#if 
defined(__linux__) LinuxEvents linux_events; - event_collector() : linux_events(std::vector{ - PERF_COUNT_HW_CPU_CYCLES, - PERF_COUNT_HW_INSTRUCTIONS, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions - PERF_COUNT_HW_BRANCH_MISSES - }) {} - bool has_events() { - return linux_events.is_working(); - } -#elif __APPLE__ && __aarch64__ + event_collector() + : linux_events(std::vector{ + PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions + PERF_COUNT_HW_BRANCH_MISSES}) {} + bool has_events() { return linux_events.is_working(); } +#elif __APPLE__ && __aarch64__ performance_counters diff; - event_collector() : diff(0) { - setup_performance_counters(); - } - bool has_events() { - return setup_performance_counters(); - } + event_collector() : diff(0) { setup_performance_counters(); } + bool has_events() { return setup_performance_counters(); } #else event_collector() {} - bool has_events() { - return false; - } + bool has_events() { return false; } #endif inline void start() { #if defined(__linux) linux_events.start(); -#elif __APPLE__ && __aarch64__ - if(has_events()) { diff = get_counters(); } +#elif __APPLE__ && __aarch64__ + if (has_events()) { + diff = get_counters(); + } #endif start_clock = std::chrono::steady_clock::now(); } - inline event_count& end() { + inline event_count &end() { const auto end_clock = std::chrono::steady_clock::now(); #if defined(__linux) linux_events.end(count.event_counts); -#elif __APPLE__ && __aarch64__ - if(has_events()) { +#elif __APPLE__ && __aarch64__ + if (has_events()) { performance_counters end = get_counters(); diff = end - diff; } diff --git a/benchmarks/linux-perf-events.h b/benchmarks/linux-perf-events.h index 73cfbaf8..0a9e5538 100644 --- a/benchmarks/linux-perf-events.h +++ b/benchmarks/linux-perf-events.h @@ -42,7 +42,8 @@ template class LinuxEvents { uint32_t i = 0; for (auto config : config_vec) { attribs.config = config; - int _fd = 
static_cast(syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); + int _fd = static_cast( + syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); if (_fd == -1) { report_error("perf_event_open"); } @@ -56,7 +57,11 @@ template class LinuxEvents { temp_result_vec.resize(num_events * 2 + 1); } - ~LinuxEvents() { if (fd != -1) { close(fd); } } + ~LinuxEvents() { + if (fd != -1) { + close(fd); + } + } inline void start() { if (fd != -1) { @@ -85,19 +90,15 @@ template class LinuxEvents { results[i / 2] = temp_result_vec[i]; } for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) { - if(ids[i/2-1] != temp_result_vec[i]) { + if (ids[i / 2 - 1] != temp_result_vec[i]) { report_error("event mismatch"); } } } - bool is_working() { - return working; - } + bool is_working() { return working; } private: - void report_error(const std::string &) { - working = false; - } + void report_error(const std::string &) { working = false; } }; #endif \ No newline at end of file