Benchmark Kernels: Full Code and Design Intent

This document explains all benchmark kernels in this repo, why each kernel exists, and what microarchitectural behavior it stresses.

Two points first:

  1. The active benchmark sweep (benchmark/run_benchmarks.py) currently runs this 10-kernel set:
    • alu_chain, mem_stream, ilp_mix
    • p2s_clean_ilp2, p2s_lane_dep_alu, p2s_lane_dep_mem
    • p2o_low_ilp_chain, p2o_mid_ilp_dual, p2o_high_ilp4, p2o_mem_overlap
  2. Additional kernels are present in benchmark/kernels for focused experiments (branch_mix, branch_wave, dep_hazard, load_store_hazard, dual_chain, mem_ring, p2s_ctrl_low_ilp).

Common Harness

All kernels include a shared harness that:

  • sets stack pointer
  • runs bench_main
  • writes signature words via bench_write(index, value)
  • terminates with ecall via bench_halt
#ifndef RV32I_BENCH_COMMON_H
#define RV32I_BENCH_COMMON_H

/*
 * Shared benchmark harness: bare-metal entry stub, signature writer,
 * and halt. Every kernel includes this header and defines bench_main().
 */

typedef unsigned int u32;

/* Base of the 4-word signature window that Spike and RTL are compared on. */
#ifndef OUT_BASE_ADDR
#define OUT_BASE_ADDR 0x00000100u
#endif

/* Initial stack pointer value installed by _start before bench_main runs. */
#ifndef BENCH_STACK_ADDR
#define BENCH_STACK_ADDR 0x00001ff0u
#endif

/* Two-level stringize so the macro argument is expanded before quoting. */
#define BENCH_STR1(x) #x
#define BENCH_STR(x) BENCH_STR1(x)

void bench_main(void);

/* Program entry: naked (no compiler prologue/epilogue) so the asm below is
 * the only code emitted. Loads sp, then tail-jumps to bench_main, which
 * never returns (every kernel ends in bench_halt + a spin loop). */
__attribute__((naked, section(".text")))
void _start(void) {
    __asm__ volatile(
        "li sp, " BENCH_STR(BENCH_STACK_ADDR) "\n"
        "j bench_main\n"
    );
}

/* Store one 32-bit signature word at OUT_BASE_ADDR + 4*index.
 * volatile ensures the store is actually performed and ordered. */
static inline void bench_write(u32 index, u32 value) {
    volatile u32 *out = (volatile u32 *)OUT_BASE_ADDR;
    out[index] = value;
}

/* Terminate the run; ecall is the simulation's halt trigger. */
static inline void bench_halt(void) {
    __asm__ volatile("ecall");
}

#endif

The signature words written via bench_write are the values compared between the Spike reference run and the RTL run to check correctness.

Active Benchmark Set (10 Kernels)

1) alu_chain

#include "common.h"

/*
 * alu_chain: tight RAW-dependent ALU loop (low-ILP baseline).
 * Every statement consumes the result of the previous one, so the loop
 * body is effectively serial and measures forwarding/wakeup latency.
 * NOTE: statement order is the benchmark — do not reorder or restyle.
 */
void bench_main(void) {
    u32 x = 0x13579bdfu;
    u32 y = 0x2468ace0u;
    u32 i;

    for (i = 0; i < 96u; i++) {
        x = x + y + i + 0x9e3779b9u;   /* reads prior x and y */
        y = y ^ x;                     /* reads the x just produced */
        x = x + 0x7f4a7c15u;           /* extends the dependence chain */
    }

    /* Signature: final state, a mix of both, and the trip count (96). */
    bench_write(0u, x);
    bench_write(1u, y);
    bench_write(2u, x ^ y);
    bench_write(3u, i);

    bench_halt();
    for (;;) {  /* safety spin in case execution continues past ecall */
    }
}

Why it exists:

  • Forces a tight RAW dependency chain.
  • Measures how well forwarding / wakeup pipelines keep a dependent ALU stream moving.
  • Good baseline for low-ILP behavior.

2) mem_stream

#include "common.h"

/*
 * mem_stream: sequential load/store stream with data mixing.
 * Pass 1 fills a 32-word buffer with a shift/xor generator; pass 2
 * reads each word, folds it into sum/chk, and stores a modified value
 * back, exercising address generation and load-use distance.
 */
void bench_main(void) {
    /* Scratch buffer 0x80 bytes past the signature window (no overlap). */
    volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x80u);
    u32 seed = 0x10203040u;
    u32 sum = 0u;
    u32 chk = 0u;
    u32 i;

    /* Fill pass: index-keyed pseudo-random pattern. */
    for (i = 0; i < 32u; i++) {
        seed = (seed << 1) ^ (seed >> 3) ^ (i + 0x9u);
        buf[i] = seed;
    }

    /* Read-modify-write pass; shift amount masked to 0..7 (no UB). */
    for (i = 0; i < 32u; i++) {
        u32 v = buf[i];
        sum = sum + v;
        chk = chk ^ (v >> (i & 7u));
        buf[i] = v ^ (sum << 1);
    }

    /* Signature: both accumulators plus first/last buffer words. */
    bench_write(0u, sum);
    bench_write(1u, chk);
    bench_write(2u, buf[0]);
    bench_write(3u, buf[31]);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Sequential load/store stream with data mixing.
  • Exercises memory datapath, address generation, and load-use distance.
  • Useful for seeing how memory pressure affects CPI.

3) ilp_mix

#include "common.h"

/*
 * ilp_mix: two arithmetic chains (a/b and c/d) with cross-coupling.
 * Offers moderate ILP — not fully independent, not fully serial —
 * to contrast in-order and out-of-order behavior.
 */
void bench_main(void) {
    u32 a = 0x11111111u;
    u32 b = 0x22222222u;
    u32 c = 0x33333333u;
    u32 d = 0x44444444u;
    u32 i;

    for (i = 0; i < 160u; i++) {
        a = a + (b ^ i);          /* chain 1 */
        c = c + (d + (i << 1));   /* chain 2, independent of chain 1 */
        b = b ^ (a >> 3);         /* feeds back into chain 1 */
        d = d ^ (c << 1);         /* feeds back into chain 2 */
    }

    bench_write(0u, a);
    bench_write(1u, b);
    bench_write(2u, c);
    bench_write(3u, d);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Mixes two arithmetic chains with some cross-coupling.
  • Gives moderate ILP, not purely independent and not purely serial.
  • Good general-purpose stress for in-order vs OOO behavior.

4) p2s_clean_ilp2

#include "common.h"

/*
 * p2s_clean_ilp2: best-case workload for a 2-lane in-order superscalar.
 * The body is two fully independent lanes — (a, b, t0) and (c, d, t1) —
 * so a dual-issue front end can pair instructions with no cross-lane
 * hazards. Measures peak dual-issue utilization.
 */
void bench_main(void) {
    u32 a = 0x10203040u;
    u32 b = 0x55667788u;
    u32 c = 0x89abcdefu;
    u32 d = 0x13579bdfu;
    u32 t0 = 1u;
    u32 t1 = 7u;
    u32 i;

    for (i = 0; i < 256u; i++) {
        a = a + (b ^ t0);      /* lane 0 */
        b = b + 0x11111111u;   /* lane 0 */
        c = c + (d ^ t1);      /* lane 1 */
        d = d + 0x01010101u;   /* lane 1 */
        t0 = t0 + 3u;          /* lane 0 induction */
        t1 = t1 + 5u;          /* lane 1 induction */
    }

    bench_write(0u, a);
    bench_write(1u, b);
    bench_write(2u, c);
    bench_write(3u, d);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Explicitly shaped for 2-lane in-order superscalar.
  • Two clean independent lanes reduce inter-lane hazards.
  • Used to measure best-case dual-issue utilization.

5) p2s_lane_dep_alu

#include "common.h"

/*
 * p2s_lane_dep_alu: lane-crossing ALU dependencies.
 * Each statement feeds the next (x -> y -> z -> x), so a dual-issue
 * machine cannot cleanly pair them; stresses hazard detection and
 * same-cycle forwarding decisions between issue lanes.
 */
void bench_main(void) {
    u32 x = 0x2468ace0u;
    u32 y = 0x13579bdfu;
    u32 z = 0x89abcdefu;
    u32 t = 1u;
    u32 i;

    for (i = 0; i < 320u; i++) {
        x = x + y + t;
        y = (x ^ z) + 0x9e3779b9u;      /* needs the new x */
        z = z + (y ^ (t << 1));         /* needs the new y */
        x = x ^ (z >> (i & 7u));        /* needs the new z; shift <= 7 */
        t = t + 1u;
    }

    bench_write(0u, x);
    bench_write(1u, y);
    bench_write(2u, z);
    bench_write(3u, x ^ y ^ z);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Creates lane-crossing ALU dependencies.
  • Stresses hazard detection and same-cycle forwarding choices.
  • Separates a true dual-issue engine from one that stalls/squashes often.

6) p2s_lane_dep_mem

#include "common.h"

/*
 * p2s_lane_dep_mem: 16-word ring buffer with immediate reuse.
 * Each iteration loads a slot, stores it back, then loads the adjacent
 * slot — exposing load-after-store ordering and memory dependence
 * handling (e.g. conservative same-pair memory issue policies).
 */
void bench_main(void) {
    /* Ring buffer 0x180 bytes past the signature window. */
    volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x180u);
    u32 prev = 0x12345678u;
    u32 acc = 0u;
    u32 i;

    /* Initialize the ring with an index-derived pattern. */
    for (i = 0; i < 16u; i++) {
        buf[i] = (i << 4) ^ 0x55aa00ffu;
    }

    for (i = 0; i < 192u; i++) {
        u32 idx = i & 15u;                  /* wrap within the ring */
        u32 a = buf[idx];                   /* load */
        u32 b = a + prev + i;
        buf[idx] = b;                       /* store to same slot */
        prev = buf[(idx + 1u) & 15u] + b;   /* load neighbor right after */
        acc = acc + prev;
    }

    bench_write(0u, acc);
    bench_write(1u, prev);
    bench_write(2u, buf[0]);
    bench_write(3u, buf[15]);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Short ring-buffer memory chain with immediate reuse.
  • Exposes load/store ordering constraints and memory dependencies.
  • Useful for testing conservative same-pair memory policies.

7) p2o_low_ilp_chain

#include "common.h"

/*
 * p2o_low_ilp_chain: very low-ILP serial chain (alu_chain variant with
 * a longer trip count plus a t induction). Used to measure what
 * overhead an OOO core carries when almost no parallelism exists.
 */
void bench_main(void) {
    u32 x = 0x13579bdfu;
    u32 y = 0x2468ace0u;
    u32 t = 0u;
    u32 i;

    for (i = 0; i < 384u; i++) {
        x = x + y + t + 0x9e3779b9u;   /* serial: reads prior x, y, t */
        y = y ^ x;                     /* reads the new x */
        x = x + 0x7f4a7c15u;           /* extends the chain */
        t = t + 1u;                    /* only independent op per iter */
    }

    /* Signature includes the trip count (i == 384 after the loop). */
    bench_write(0u, x);
    bench_write(1u, y);
    bench_write(2u, x ^ y);
    bench_write(3u, i);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Very low-ILP chain for pipe vs OOO comparison.
  • Tells you how much overhead OOO carries when little parallelism exists.

8) p2o_mid_ilp_dual

#include "common.h"

/*
 * p2o_mid_ilp_dual: medium ILP — two 2-op chains (a/b and c/d) that are
 * independent of each other but internally serial, plus free-running
 * inductions t0/t1. Measures how much an OOO scheduler window helps
 * once dependence is reduced but not eliminated.
 */
void bench_main(void) {
    u32 a = 0x10203040u;
    u32 b = 0x55667788u;
    u32 c = 0x89abcdefu;
    u32 d = 0x0f1e2d3cu;
    u32 t0 = 1u;
    u32 t1 = 7u;
    u32 i;

    for (i = 0; i < 320u; i++) {
        /* chain 1: b depends on the new a */
        a = a + (b ^ t0);
        b = b + ((a >> 2) ^ 0x1f1f1f1fu);

        /* chain 2: d depends on the new c; independent of chain 1 */
        c = c ^ (d + t1);
        d = d + ((c << 1) ^ 0x00ff00ffu);

        t0 = t0 + 3u;
        t1 = t1 + 5u;
    }

    bench_write(0u, a ^ b);
    bench_write(1u, c ^ d);
    bench_write(2u, a + c);
    bench_write(3u, b + d);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Medium ILP with two chains and moderate coupling.
  • Measures how much scheduler window helps once dependence is reduced but not eliminated.

9) p2o_high_ilp4

#include "common.h"

/*
 * p2o_high_ilp4: four almost-independent accumulator streams
 * (a/t0, b/t1, c/t2, d/t3) — a scheduler-friendly, high-ILP case
 * intended to expose how much parallelism the OOO window can extract.
 */
void bench_main(void) {
    u32 a = 0x11111111u;
    u32 b = 0x22222222u;
    u32 c = 0x33333333u;
    u32 d = 0x44444444u;
    u32 t0 = 1u;
    u32 t1 = 7u;
    u32 t2 = 11u;
    u32 t3 = 17u;
    u32 i;

    for (i = 0; i < 384u; i++) {
        /* four independent accumulations */
        a = a + t0;
        b = b + t1;
        c = c + t2;
        d = d + t3;

        /* four independent inductions */
        t0 = t0 + 3u;
        t1 = t1 + 5u;
        t2 = t2 + 9u;
        t3 = t3 + 13u;
    }

    bench_write(0u, a);
    bench_write(1u, b);
    bench_write(2u, c);
    bench_write(3u, d);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • High ILP with four almost-independent streams.
  • A scheduler-friendly case intended to expose window extraction potential.

10) p2o_mem_overlap

#include "common.h"

/*
 * p2o_mem_overlap: memory-heavy loop with independent arithmetic mixed
 * in, over a 32-word buffer. Checks whether the core can keep compute
 * flowing around loads and stores.
 */
void bench_main(void) {
    /* Scratch buffer 0x200 bytes past the signature window. */
    volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x200u);
    u32 sum = 0u;
    u32 chk = 0u;
    u32 pat = 0u;
    u32 i;

    /* Fill pass: regular stepped pattern. */
    for (i = 0; i < 32u; i++) {
        buf[i] = pat ^ 0x55aa00ffu;
        pat = pat + 0x01010101u;
    }

    /* Main loop: load, independent accumulations, store back. */
    for (i = 0; i < 256u; i++) {
        u32 idx = i & 31u;                /* wrap within 32 words */
        u32 v = buf[idx];
        sum = sum + v + i;
        chk = chk ^ (v >> (i & 7u));      /* shift <= 7, no UB */
        buf[idx] = v ^ (sum << 1);
    }

    bench_write(0u, sum);
    bench_write(1u, chk);
    bench_write(2u, buf[0]);
    bench_write(3u, buf[31]);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Memory-heavy loop with independent arithmetic mixed in.
  • Useful for seeing if core can keep compute moving around memory operations.

Additional Kernels in Repository (Not in Current 10-Kernel Sweep)

11) p2s_ctrl_low_ilp

#include "common.h"

/*
 * p2s_ctrl_low_ilp: control-heavy, low-ILP kernel — two data-dependent
 * branches per iteration with serial state updates between them.
 * Targets front-end redirect cost and control-hazard handling.
 */
void bench_main(void) {
    u32 state = 0x1f123bb5u;
    u32 acc = 0x76543210u;
    u32 taken0 = 0u;   /* times the first branch was taken */
    u32 taken1 = 0u;   /* times the second branch was taken */
    u32 t = 0u;
    u32 i;

    for (i = 0; i < 256u; i++) {
        state = state + 0x9e3779b9u + t;
        /* branch on the low bit of the evolving state */
        if (state & 1u) {
            taken0 = taken0 + 1u;
            acc = acc + state + t;
        } else {
            acc = acc ^ state;
        }

        /* second branch depends on acc, which the first branch changed */
        if (acc & 0x20u) {
            taken1 = taken1 + 1u;
            acc = acc + 0x1021u;
        } else {
            acc = acc ^ 0x00ff00ffu;
        }
        t = t + 1u;
    }

    bench_write(0u, acc);
    bench_write(1u, state);
    bench_write(2u, taken0);
    bench_write(3u, taken1);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Control-heavy, low-ILP branch behavior.
  • Good for front-end redirect and control hazard studies.

12) branch_mix

#include "common.h"

/*
 * branch_mix: mixed branch outcomes with arithmetic side effects.
 * Two state-dependent branches per iteration, where the first branch's
 * update to acc feeds the second branch's condition.
 */
void bench_main(void) {
    u32 state = 0x1f123bb5u;
    u32 acc = 0x76543210u;
    u32 taken0 = 0u;   /* taken count for branch 1 */
    u32 taken1 = 0u;   /* taken count for branch 2 */
    u32 i;

    for (i = 0; i < 192u; i++) {
        state = state + 0x9e3779b9u + i;

        /* branch 1: parity of the evolving state */
        if (state & 1u) {
            taken0++;
            acc = acc + state + i;
        } else {
            acc = acc ^ state;
        }

        /* branch 2: depends on acc as modified by branch 1 */
        if (acc & 0x20u) {
            taken1++;
            acc = acc + 0x1021u;
        } else {
            acc = acc ^ 0x00ff00ffu;
        }
    }

    bench_write(0u, acc);
    bench_write(1u, state);
    bench_write(2u, taken0);
    bench_write(3u, taken1);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Mixed branch outcomes with arithmetic side effects.
  • Captures branch-dependent dataflow behavior.

13) branch_wave

#include "common.h"

/*
 * branch_wave: branch outcomes that oscillate with state evolution.
 * Both branch conditions derive from bits of x and y, which every
 * branch arm itself rewrites — producing control-path turbulence
 * driven by dynamic bit patterns.
 */
void bench_main(void) {
    u32 x = 0x31415926u;
    u32 y = 0x27182818u;
    u32 flips = 0u;    /* taken count for the second branch */
    u32 taken = 0u;    /* taken count for the first branch */
    u32 i;

    for (i = 0; i < 224u; i++) {
        x = x + 0x9e3779b9u + (i << 1);
        y = y ^ (x >> (i & 7u));       /* shift <= 7, no UB */

        /* branch 1: tests a mid bit of the combined state */
        if ((x ^ y) & 0x10u) {
            taken = taken + 1u;
            x = x ^ (y + i);
        } else {
            x = x + (y ^ 0x00ff00ffu);
        }

        /* branch 2: parity of x as just rewritten above */
        if (x & 1u) {
            flips = flips + 1u;
            y = y + 0x1021u;
        } else {
            y = y ^ 0xa5a5a5a5u;
        }
    }

    bench_write(0u, x);
    bench_write(1u, y);
    bench_write(2u, taken);
    bench_write(3u, flips);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Produces branch behavior that oscillates with state evolution.
  • Good for studying control-path turbulence under dynamic bit patterns.

14) dep_hazard

#include "common.h"

/*
 * dep_hazard: dense data dependence on a single accumulator.
 * Four back-to-back updates of acc per iteration — a classic
 * hazard-stress pattern for bypass networks and scheduler wakeup.
 */
void bench_main(void) {
    u32 acc = 0x00010001u;
    u32 i;

    for (i = 0; i < 220u; i++) {
        acc = acc + 3u;              /* every op reads the previous acc */
        acc = acc ^ 0x00f0f0f0u;
        acc = acc + 0x00011111u;
        acc = acc + (i ^ 0x3cu);
    }

    /* Signature: full value, halves, and trip count (i == 220). */
    bench_write(0u, acc);
    bench_write(1u, acc >> 16);
    bench_write(2u, acc & 0xffffu);
    bench_write(3u, i);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Dense data dependence on one accumulator.
  • Classic hazard-stress pattern for bypassing and scheduler wakeup.

15) load_store_hazard

#include "common.h"

/*
 * load_store_hazard: load/store interaction with short reuse distance.
 * Same 16-word ring pattern as p2s_lane_dep_mem but with 128
 * iterations; targets load-after-store ordering behavior.
 */
void bench_main(void) {
    /* Ring buffer 0x100 bytes past the signature window. */
    volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x100u);
    u32 prev = 0x12345678u;
    u32 acc = 0u;
    u32 i;

    /* Initialize the ring with an index-derived pattern. */
    for (i = 0; i < 16u; i++) {
        buf[i] = (i << 4) ^ 0x55aa00ffu;
    }

    for (i = 0; i < 128u; i++) {
        u32 idx = i & 15u;              /* wrap within the ring */
        u32 a = buf[idx];               /* load */
        u32 b = a + prev + i;
        buf[idx] = b;                   /* store to same slot */
        {
            /* neighbor load issued right after the store above */
            u32 c = buf[(idx + 1u) & 15u];
            prev = c + b;
        }
        acc = acc + prev;
    }

    bench_write(0u, acc);
    bench_write(1u, prev);
    bench_write(2u, buf[0]);
    bench_write(3u, buf[15]);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Load-store interaction with short reuse distance.
  • Targets ordering and load-after-store behavior.

16) dual_chain

#include "common.h"

/*
 * dual_chain: two arithmetic chains (a/b and c/d) with moderate
 * internal coupling and index-derived inputs. Used for dual-lane
 * pairing and scheduler fairness studies.
 */
void bench_main(void) {
    u32 a = 0x10203040u;
    u32 b = 0x55667788u;
    u32 c = 0x89abcdefu;
    u32 d = 0x0f1e2d3cu;
    u32 i;

    for (i = 0; i < 256u; i++) {
        /* chain 1: b depends on the new a */
        a = a + (b ^ (i * 3u + 1u));
        b = b + ((a >> 2) ^ 0x1f1f1f1fu);

        /* chain 2: d depends on the new c; independent of chain 1 */
        c = c ^ (d + (i * 5u + 7u));
        d = d + ((c << 1) ^ 0x00ff00ffu);
    }

    bench_write(0u, a ^ b);
    bench_write(1u, c ^ d);
    bench_write(2u, a + c);
    bench_write(3u, b + d);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Two arithmetic chains with moderate coupling.
  • Good for dual-lane and scheduler fairness studies.

17) mem_ring

#include "common.h"

/*
 * mem_ring: 16-word ring buffer with offset reads.
 * Each iteration loads two slots (idx0 and idx0+5), stores one back,
 * then reads an adjacent slot — stressing index arithmetic plus
 * recurrent data reuse through memory.
 */
void bench_main(void) {
    /* Ring buffer 0x180 bytes past the signature window. */
    volatile u32 *ring = (volatile u32 *)(OUT_BASE_ADDR + 0x180u);
    u32 acc = 0x89abcdefu;
    u32 mix = 0x13579bdfu;
    u32 pat = 0u;
    u32 i;

    /* Initialize the ring with a stepped pattern. */
    for (i = 0; i < 16u; i++) {
        ring[i] = pat ^ 0x55aa33ccu;
        pat = pat + 0x01010101u;
    }

    for (i = 0; i < 192u; i++) {
        u32 idx0 = i & 15u;             /* primary slot */
        u32 idx1 = (idx0 + 5u) & 15u;   /* offset slot, 5 apart */
        u32 a = ring[idx0];
        u32 b = ring[idx1];
        u32 n = a + b + mix + i;
        ring[idx0] = n;                 /* store to the primary slot */
        mix = mix ^ (n >> (i & 3u));    /* shift <= 3, no UB */
        acc = acc + mix + ring[(idx0 + 1u) & 15u];  /* neighbor reload */
    }

    bench_write(0u, acc);
    bench_write(1u, mix);
    bench_write(2u, ring[0]);
    bench_write(3u, ring[15]);

    bench_halt();
    for (;;) {  /* safety spin after halt */
    }
}

Why it exists:

  • Ring-buffer memory pattern with offset reads.
  • Stresses memory index arithmetic plus recurrent data reuse.

Kernel Families and What They Measure

  • Dependency-heavy ALU: alu_chain, dep_hazard, p2o_low_ilp_chain
  • Medium/high ILP arithmetic: ilp_mix, dual_chain, p2o_mid_ilp_dual, p2o_high_ilp4, p2s_clean_ilp2
  • Memory interaction: mem_stream, mem_ring, load_store_hazard, p2s_lane_dep_mem, p2o_mem_overlap
  • Control stress: branch_mix, branch_wave, p2s_ctrl_low_ilp
  • Superscalar lane interaction focus: p2s_clean_ilp2, p2s_lane_dep_alu, p2s_lane_dep_mem
  • OOO extraction focus: p2o_low_ilp_chain, p2o_mid_ilp_dual, p2o_high_ilp4, p2o_mem_overlap

Why This Mix Is Useful

This kernel set is not random. It spans:

  • low ILP to high ILP
  • ALU-dominant to memory-dominant loops
  • low-control to high-control patterns
  • friendly and adversarial conditions for dual-issue and OOO scheduling

That spread is what makes pairwise variant comparison meaningful.