#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <time.h>

#define TEST_LIMIT (100000000ull)
#define INITIAL_STATE (0xDEADC0DECAFEBEEFULL)

// A buffer of 1024 entries fits easily in L1 Data Cache (typically 32KB).
// This avoids the RNG dependency chain bottleneck while still providing
// random bits to the branch predictor.
#define BUF_SIZE (1024)
static uint64_t l1_buffer[BUF_SIZE];

// While not cryptographically random, this is very fast and random enough
static inline uint64_t xorshift64(uint64_t *state) {
    uint64_t x = *state;
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    return *state = x;
}

void fill_buffer() {
    uint64_t state = INITIAL_STATE;
    for (int i = 0; i < BUF_SIZE; i++) {
        l1_buffer[i] = xorshift64(&state);
    }
}

static inline int popcount_slow(uint64_t n) {
    int count = 0;
    while (n > 0) {
        if (n & 1) {
            count++;
        }
        n >>= 1;
    }
    return count;
}

static inline int popcount_better(uint64_t n) {
    int count = 0;
    while (n > 0) {
        n &= (n - 1);
        count++;
    }
    return count;
}

static inline int popcount_fast(uint64_t n) {
    return __builtin_popcountll(n);
}

// Using a macro to write functions is a bit awkward.
// The original version had me passing in a function pointer,
// but the overhead of this virtual function ended up as a
// significant driver of latency in the test.
// Printing the total bits at the end is important - else the compiler
// has a tendency to no-op optimize out the entire test at -O3
#define TIME_POPCOUNT(func, label) do { \
    struct timespec start, end; \
    uint64_t total_bits = 0; \
    clock_gettime(CLOCK_MONOTONIC, &start); \
    for (uint64_t i = 0; i < TEST_LIMIT; i++) { \
        total_bits += func(l1_buffer[i & (BUF_SIZE - 1)]); \
    } \
    clock_gettime(CLOCK_MONOTONIC, &end); \
    double elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; \
    printf("%-12s took %f seconds (total bits: %" PRIx64 ")\n", label, elapsed, total_bits); \
} while (0)

void test_popcount(uint64_t num) {
    printf("Number: %016" PRIx64 "\n", num);
    printf("  Slow:   %d\n", popcount_slow(num));
    printf("  Better: %d\n", popcount_better(num));
    printf("  Fast:   %d\n", popcount_fast(num));
}

int main() {
    test_popcount(0);
    test_popcount(0xF0F0F0F0F0F0F0F0);
    test_popcount(0xFFFFFFFFFFFFFFFF);

    fill_buffer();

    printf("\n--- Timing tests ---\n");

    TIME_POPCOUNT(popcount_slow,   "Slow");
    TIME_POPCOUNT(popcount_better, "Better");
    TIME_POPCOUNT(popcount_fast,   "Hardware");

    return 0;
}
