
Benchmark Kernels: Full Code and Design Intent
This document explains all benchmark kernels in this repo, why each kernel exists, and what microarchitectural behavior it stresses.
Two points first:
- The active benchmark sweep (`benchmark/run_benchmarks.py`) currently runs this 10-kernel set: `alu_chain`, `mem_stream`, `ilp_mix`, `p2s_clean_ilp2`, `p2s_lane_dep_alu`, `p2s_lane_dep_mem`, `p2o_low_ilp_chain`, `p2o_mid_ilp_dual`, `p2o_high_ilp4`, `p2o_mem_overlap`.
- Additional kernels are present in `benchmark/kernels` for focused experiments (`branch_mix`, `branch_wave`, `dep_hazard`, `load_store_hazard`, `dual_chain`, `mem_ring`, `p2s_ctrl_low_ilp`).
Common Harness
All kernels include a shared harness that:
- sets stack pointer
- runs `bench_main`
- writes signature words via `bench_write(index, value)`
- terminates with `ecall` via `bench_halt`
/* Shared harness header for the benchmark kernels: common word type, memory-map defaults, and helper macros. */
#ifndef RV32I_BENCH_COMMON_H
#define RV32I_BENCH_COMMON_H
/* Word type used by every kernel. NOTE(review): assumed to be 32 bits wide on the target; confirm for the toolchain in use. */
typedef unsigned int u32;
/* Base address of the signature words written by bench_write(). Can be overridden by defining it before this header is included. */
#ifndef OUT_BASE_ADDR
#define OUT_BASE_ADDR 0x00000100u
#endif
/* Value loaded into sp by _start. Can be overridden the same way as OUT_BASE_ADDR. */
#ifndef BENCH_STACK_ADDR
#define BENCH_STACK_ADDR 0x00001ff0u
#endif
/* Two-level stringification: BENCH_STR(x) expands x first, then BENCH_STR1 turns the result into a string literal. */
#define BENCH_STR1(x) #x
#define BENCH_STR(x) BENCH_STR1(x)
/* Kernel body; each benchmark source file defines exactly this function. */
void bench_main(void);
/*
 * Program entry point.
 * Declared naked so the compiler emits no prologue or epilogue around the asm;
 * the function body is exactly the two instructions below.
 * Placed in the ".text" section.
 */
__attribute__((naked, section(".text")))
void _start(void) {
__asm__ volatile(
/* Load sp with BENCH_STACK_ADDR; the macro value is stringified into the instruction text. */
"li sp, " BENCH_STR(BENCH_STACK_ADDR) "\n"
/* Plain jump, not a call: no return address is set up, so bench_main must never return. */
"j bench_main\n"
);
}
/*
 * Store one signature word.
 *
 * index: slot number, counted in u32 elements from OUT_BASE_ADDR
 *        (slot k lives k * sizeof(u32) bytes past the base).
 * value: word to store in that slot.
 *
 * The store goes through a volatile-qualified lvalue, so it is a volatile
 * access and is not elided by the compiler.
 */
static inline void bench_write(u32 index, u32 value) {
    *((volatile u32 *)OUT_BASE_ADDR + index) = value;
}
/*
 * End the benchmark by executing a single `ecall` instruction.
 * NOTE(review): how the simulator / testbench reacts to the ecall (stop, trap, dump signature)
 * is defined outside this header; confirm against the run scripts.
 */
static inline void bench_halt(void) {
__asm__ volatile("ecall");
}
#endif
The signature words are what Spike and RTL are compared against for correctness.
Active Benchmark Set (10 Kernels)
1) alu_chain
#include "common.h"
/*
 * alu_chain: 96 iterations of a serial ALU recurrence on x and y.
 * Every statement in the loop body reads the value written by the statement
 * before it, so the body is one read-after-write chain.
 * Signature: [0]=x, [1]=y, [2]=x^y, [3]=final loop counter (96).
 */
void bench_main(void) {
u32 x = 0x13579bdfu;
u32 y = 0x2468ace0u;
u32 i;
for (i = 0; i < 96u; i++) {
/* Reads x and y from the previous iteration plus the loop index. */
x = x + y + i + 0x9e3779b9u;
/* Reads the x just produced. */
y = y ^ x;
/* Second update of x; the next iteration's first add depends on it. */
x = x + 0x7f4a7c15u;
}
bench_write(0u, x);
bench_write(1u, y);
bench_write(2u, x ^ y);
bench_write(3u, i);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Forces a tight RAW dependency chain.
- Measures how well forwarding / wakeup pipelines keep a dependent ALU stream moving.
- Good baseline for low-ILP behavior.
2) mem_stream
#include "common.h"
/*
 * mem_stream: fill a 32-word buffer, then make one sequential
 * read-modify-write pass over it.
 * Signature: [0]=sum, [1]=chk, [2]=buf[0], [3]=buf[31].
 */
void bench_main(void) {
/* Scratch buffer begins 0x80 bytes past OUT_BASE_ADDR (the offset is added before the pointer cast). */
volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x80u);
u32 seed = 0x10203040u;
u32 sum = 0u;
u32 chk = 0u;
u32 i;
/* Pass 1: store-only stream; each word is the next value of a shift/xor recurrence on seed. */
for (i = 0; i < 32u; i++) {
seed = (seed << 1) ^ (seed >> 3) ^ (i + 0x9u);
buf[i] = seed;
}
/* Pass 2: load each word, fold it into sum and chk, then store back a value derived from the running sum. */
for (i = 0; i < 32u; i++) {
u32 v = buf[i];
sum = sum + v;
/* Shift amount cycles through 0..7. */
chk = chk ^ (v >> (i & 7u));
buf[i] = v ^ (sum << 1);
}
bench_write(0u, sum);
bench_write(1u, chk);
/* buf is volatile, so these two arguments are fresh loads of the final buffer contents. */
bench_write(2u, buf[0]);
bench_write(3u, buf[31]);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Sequential load/store stream with data mixing.
- Exercises memory datapath, address generation, and load-use distance.
- Useful for seeing how memory pressure affects CPI.
3) ilp_mix
#include "common.h"
/*
 * ilp_mix: 160 iterations updating two variable pairs, (a, b) and (c, d).
 * Within a pair each variable feeds the other; the two pairs never read each
 * other's values and share only the loop index i.
 * Signature: [0]=a, [1]=b, [2]=c, [3]=d.
 */
void bench_main(void) {
u32 a = 0x11111111u;
u32 b = 0x22222222u;
u32 c = 0x33333333u;
u32 d = 0x44444444u;
u32 i;
for (i = 0; i < 160u; i++) {
/* The a and c updates read only values from the previous iteration and i, so they do not depend on each other. */
a = a + (b ^ i);
c = c + (d + (i << 1));
/* b consumes the new a; d consumes the new c. */
b = b ^ (a >> 3);
d = d ^ (c << 1);
}
bench_write(0u, a);
bench_write(1u, b);
bench_write(2u, c);
bench_write(3u, d);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Mixes two arithmetic chains with some cross-coupling.
- Gives moderate ILP, not purely independent and not purely serial.
- Good general-purpose stress for in-order vs OOO behavior.
4) p2s_clean_ilp2
#include "common.h"
/*
 * p2s_clean_ilp2: 256 iterations over two variable groups, (a, b, t0) and
 * (c, d, t1). No statement in one group reads a variable of the other group.
 * Signature: [0]=a, [1]=b, [2]=c, [3]=d.
 */
void bench_main(void) {
u32 a = 0x10203040u;
u32 b = 0x55667788u;
u32 c = 0x89abcdefu;
u32 d = 0x13579bdfu;
u32 t0 = 1u;
u32 t1 = 7u;
u32 i;
for (i = 0; i < 256u; i++) {
/* Group 0: a accumulates b ^ t0, then b steps by a constant. */
a = a + (b ^ t0);
b = b + 0x11111111u;
/* Group 1: same shape on c, d, t1 with different constants. */
c = c + (d ^ t1);
d = d + 0x01010101u;
/* Per-group counters advance by 3 and 5. */
t0 = t0 + 3u;
t1 = t1 + 5u;
}
bench_write(0u, a);
bench_write(1u, b);
bench_write(2u, c);
bench_write(3u, d);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Explicitly shaped for 2-lane in-order superscalar.
- Two clean independent lanes reduce inter-lane hazards.
- Used to measure best-case dual-issue utilization.
5) p2s_lane_dep_alu
#include "common.h"
/*
 * p2s_lane_dep_alu: 320 iterations of a four-step chain x -> y -> z -> x.
 * Each of the first four statements reads the result of the one before it.
 * Signature: [0]=x, [1]=y, [2]=z, [3]=x^y^z.
 */
void bench_main(void) {
u32 x = 0x2468ace0u;
u32 y = 0x13579bdfu;
u32 z = 0x89abcdefu;
u32 t = 1u;
u32 i;
for (i = 0; i < 320u; i++) {
x = x + y + t;
/* Needs the x just written. */
y = (x ^ z) + 0x9e3779b9u;
/* Needs the y just written. */
z = z + (y ^ (t << 1));
/* Needs the z just written; shift amount cycles through 0..7. */
x = x ^ (z >> (i & 7u));
/* t only depends on its own previous value. */
t = t + 1u;
}
bench_write(0u, x);
bench_write(1u, y);
bench_write(2u, z);
bench_write(3u, x ^ y ^ z);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Creates lane-crossing ALU dependencies.
- Stresses hazard detection and same-cycle forwarding choices.
- Separates a true dual-issue engine from one that stalls/squashes often.
6) p2s_lane_dep_mem
#include "common.h"
/*
 * p2s_lane_dep_mem: 192 load/store/load iterations over a 16-word ring buffer,
 * with the value carried between iterations in prev.
 * Signature: [0]=acc, [1]=prev, [2]=buf[0], [3]=buf[15].
 */
void bench_main(void) {
/* Ring buffer begins 0x180 bytes past OUT_BASE_ADDR. */
volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x180u);
u32 prev = 0x12345678u;
u32 acc = 0u;
u32 i;
/* Seed the 16 slots with an index-derived pattern. */
for (i = 0; i < 16u; i++) {
buf[i] = (i << 4) ^ 0x55aa00ffu;
}
for (i = 0; i < 192u; i++) {
/* Slot index wraps every 16 iterations. */
u32 idx = i & 15u;
u32 a = buf[idx];
u32 b = a + prev + i;
/* Store to the slot that was just loaded. */
buf[idx] = b;
/* Load the neighbouring slot, which is the one the next iteration will load and overwrite. */
prev = buf[(idx + 1u) & 15u] + b;
acc = acc + prev;
}
bench_write(0u, acc);
bench_write(1u, prev);
bench_write(2u, buf[0]);
bench_write(3u, buf[15]);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Short ring-buffer memory chain with immediate reuse.
- Exposes load/store ordering constraints and memory dependencies.
- Useful for testing conservative same-pair memory policies.
7) p2o_low_ilp_chain
#include "common.h"
/*
 * p2o_low_ilp_chain: 384 iterations of a serial recurrence on x and y.
 * The x/y statements form one dependency chain; t is a separate counter that
 * starts at 0 and steps by 1, so it always equals i at the top of the body.
 * Signature: [0]=x, [1]=y, [2]=x^y, [3]=final loop counter (384).
 */
void bench_main(void) {
u32 x = 0x13579bdfu;
u32 y = 0x2468ace0u;
u32 t = 0u;
u32 i;
for (i = 0; i < 384u; i++) {
x = x + y + t + 0x9e3779b9u;
/* Reads the x just produced. */
y = y ^ x;
/* Second update of x; the next iteration's first add depends on it. */
x = x + 0x7f4a7c15u;
t = t + 1u;
}
bench_write(0u, x);
bench_write(1u, y);
bench_write(2u, x ^ y);
bench_write(3u, i);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Very low-ILP chain for pipe vs OOO comparison.
- Tells you how much overhead OOO carries when little parallelism exists.
8) p2o_mid_ilp_dual
#include "common.h"
/*
 * p2o_mid_ilp_dual: 320 iterations over two chains, (a, b, t0) and (c, d, t1).
 * Inside each chain the second statement reads the result of the first; the
 * two chains never read each other's variables inside the loop.
 * Signature: [0]=a^b, [1]=c^d, [2]=a+c, [3]=b+d.
 */
void bench_main(void) {
u32 a = 0x10203040u;
u32 b = 0x55667788u;
u32 c = 0x89abcdefu;
u32 d = 0x0f1e2d3cu;
u32 t0 = 1u;
u32 t1 = 7u;
u32 i;
for (i = 0; i < 320u; i++) {
/* Chain 0: b reads the a written on the line above. */
a = a + (b ^ t0);
b = b + ((a >> 2) ^ 0x1f1f1f1fu);
/* Chain 1: d reads the c written on the line above. */
c = c ^ (d + t1);
d = d + ((c << 1) ^ 0x00ff00ffu);
/* Per-chain counters advance by 3 and 5. */
t0 = t0 + 3u;
t1 = t1 + 5u;
}
/* The two chains are combined only here, when forming the signature words. */
bench_write(0u, a ^ b);
bench_write(1u, c ^ d);
bench_write(2u, a + c);
bench_write(3u, b + d);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Medium ILP with two chains and moderate coupling.
- Measures how much scheduler window helps once dependence is reduced but not eliminated.
9) p2o_high_ilp4
#include "common.h"
/*
 * p2o_high_ilp4: 384 iterations over four accumulators a..d, each fed by its
 * own counter t0..t3. No accumulator or counter reads a variable belonging to
 * another stream.
 * Signature: [0]=a, [1]=b, [2]=c, [3]=d.
 *
 * NOTE(review): every update is an addition of a value that itself steps by a
 * constant, and the trip count is a compile-time constant. An optimizing
 * compiler may be able to replace the loop with precomputed results; confirm
 * in the disassembly that the loop survives with the build flags in use.
 */
void bench_main(void) {
u32 a = 0x11111111u;
u32 b = 0x22222222u;
u32 c = 0x33333333u;
u32 d = 0x44444444u;
u32 t0 = 1u;
u32 t1 = 7u;
u32 t2 = 11u;
u32 t3 = 17u;
u32 i;
for (i = 0; i < 384u; i++) {
/* Four accumulator updates with no dependencies between them. */
a = a + t0;
b = b + t1;
c = c + t2;
d = d + t3;
/* Four counter updates with distinct strides. */
t0 = t0 + 3u;
t1 = t1 + 5u;
t2 = t2 + 9u;
t3 = t3 + 13u;
}
bench_write(0u, a);
bench_write(1u, b);
bench_write(2u, c);
bench_write(3u, d);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- High ILP with four almost-independent streams.
- A scheduler-friendly case intended to expose window extraction potential.
10) p2o_mem_overlap
#include "common.h"
/*
 * p2o_mem_overlap: seed a 32-word buffer, then run 256 read-modify-write
 * iterations over it (each slot is visited 8 times).
 * Signature: [0]=sum, [1]=chk, [2]=buf[0], [3]=buf[31].
 */
void bench_main(void) {
/* Buffer begins 0x200 bytes past OUT_BASE_ADDR. */
volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x200u);
u32 sum = 0u;
u32 chk = 0u;
u32 pat = 0u;
u32 i;
/* Seed: pat steps by 0x01010101 per slot and is xored with a fixed mask. */
for (i = 0; i < 32u; i++) {
buf[i] = pat ^ 0x55aa00ffu;
pat = pat + 0x01010101u;
}
for (i = 0; i < 256u; i++) {
/* Slot index wraps every 32 iterations. */
u32 idx = i & 31u;
u32 v = buf[idx];
/* sum and chk both consume the loaded word but do not read each other. */
sum = sum + v + i;
chk = chk ^ (v >> (i & 7u));
/* Write-back depends on the updated sum. */
buf[idx] = v ^ (sum << 1);
}
bench_write(0u, sum);
bench_write(1u, chk);
bench_write(2u, buf[0]);
bench_write(3u, buf[31]);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Memory-heavy loop with independent arithmetic mixed in.
- Useful for seeing if core can keep compute moving around memory operations.
Additional Kernels in Repository (Not in Current 10-Kernel Sweep)
11) p2s_ctrl_low_ilp
#include "common.h"
/*
 * p2s_ctrl_low_ilp: 256 iterations with two data-dependent branches per
 * iteration. acc is carried through both branches; t is a separate counter
 * that steps by 1.
 * Signature: [0]=acc, [1]=state, [2]=taken0, [3]=taken1.
 */
void bench_main(void) {
u32 state = 0x1f123bb5u;
u32 acc = 0x76543210u;
u32 taken0 = 0u;
u32 taken1 = 0u;
u32 t = 0u;
u32 i;
for (i = 0; i < 256u; i++) {
state = state + 0x9e3779b9u + t;
/* Branch 0: decided by the low bit of the freshly updated state. */
if (state & 1u) {
taken0 = taken0 + 1u;
acc = acc + state + t;
} else {
acc = acc ^ state;
}
/* Branch 1: decided by bit 5 of acc, i.e. by the outcome of branch 0. */
if (acc & 0x20u) {
taken1 = taken1 + 1u;
acc = acc + 0x1021u;
} else {
acc = acc ^ 0x00ff00ffu;
}
t = t + 1u;
}
bench_write(0u, acc);
bench_write(1u, state);
/* taken0 / taken1 count how many times the if-side of each branch ran. */
bench_write(2u, taken0);
bench_write(3u, taken1);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Control-heavy, low-ILP branch behavior.
- Good for front-end redirect and control hazard studies.
12) branch_mix
#include "common.h"
/*
 * branch_mix: 192 iterations with two data-dependent branches per iteration;
 * the loop index i is mixed into state and acc.
 * Signature: [0]=acc, [1]=state, [2]=taken0, [3]=taken1.
 */
void bench_main(void) {
u32 state = 0x1f123bb5u;
u32 acc = 0x76543210u;
u32 taken0 = 0u;
u32 taken1 = 0u;
u32 i;
for (i = 0; i < 192u; i++) {
state = state + 0x9e3779b9u + i;
/* Branch 0: decided by the low bit of the freshly updated state. */
if (state & 1u) {
taken0++;
acc = acc + state + i;
} else {
acc = acc ^ state;
}
/* Branch 1: decided by bit 5 of acc, i.e. by the outcome of branch 0. */
if (acc & 0x20u) {
taken1++;
acc = acc + 0x1021u;
} else {
acc = acc ^ 0x00ff00ffu;
}
}
bench_write(0u, acc);
bench_write(1u, state);
/* taken0 / taken1 count how many times the if-side of each branch ran. */
bench_write(2u, taken0);
bench_write(3u, taken1);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Mixed branch outcomes with arithmetic side effects.
- Captures branch-dependent dataflow behavior.
13) branch_wave
#include "common.h"
/*
 * branch_wave: 224 iterations in which x and y update each other and two
 * branches are decided by bits of the evolving x / y values.
 * Signature: [0]=x, [1]=y, [2]=taken, [3]=flips.
 */
void bench_main(void) {
u32 x = 0x31415926u;
u32 y = 0x27182818u;
u32 flips = 0u;
u32 taken = 0u;
u32 i;
for (i = 0; i < 224u; i++) {
x = x + 0x9e3779b9u + (i << 1);
/* Shift amount cycles through 0..7. */
y = y ^ (x >> (i & 7u));
/* Branch 0: decided by bit 4 of x ^ y; both sides rewrite x from y. */
if ((x ^ y) & 0x10u) {
taken = taken + 1u;
x = x ^ (y + i);
} else {
x = x + (y ^ 0x00ff00ffu);
}
/* Branch 1: decided by the low bit of the x chosen above; both sides rewrite y. */
if (x & 1u) {
flips = flips + 1u;
y = y + 0x1021u;
} else {
y = y ^ 0xa5a5a5a5u;
}
}
bench_write(0u, x);
bench_write(1u, y);
/* taken / flips count how many times the if-side of branch 0 / branch 1 ran. */
bench_write(2u, taken);
bench_write(3u, flips);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Produces branch behavior that oscillates with state evolution.
- Good for studying control-path turbulence under dynamic bit patterns.
14) dep_hazard
#include "common.h"
/*
 * dep_hazard: 220 iterations of four back-to-back updates to one accumulator.
 * Every statement in the body reads the acc written by the statement before it.
 * Signature: [0]=acc, [1]=upper 16 bits of acc, [2]=lower 16 bits of acc,
 * [3]=final loop counter (220).
 */
void bench_main(void) {
u32 acc = 0x00010001u;
u32 i;
for (i = 0; i < 220u; i++) {
acc = acc + 3u;
acc = acc ^ 0x00f0f0f0u;
acc = acc + 0x00011111u;
/* Only this term involves anything other than acc and a constant. */
acc = acc + (i ^ 0x3cu);
}
bench_write(0u, acc);
bench_write(1u, acc >> 16);
bench_write(2u, acc & 0xffffu);
bench_write(3u, i);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Dense data dependence on one accumulator.
- Classic hazard-stress pattern for bypassing and scheduler wakeup.
15) load_store_hazard
#include "common.h"
/*
 * load_store_hazard: 128 load/store/load iterations over a 16-word ring
 * buffer, with the value carried between iterations in prev.
 * Signature: [0]=acc, [1]=prev, [2]=buf[0], [3]=buf[15].
 */
void bench_main(void) {
/* Ring buffer begins 0x100 bytes past OUT_BASE_ADDR. */
volatile u32 *buf = (volatile u32 *)(OUT_BASE_ADDR + 0x100u);
u32 prev = 0x12345678u;
u32 acc = 0u;
u32 i;
/* Seed the 16 slots with an index-derived pattern. */
for (i = 0; i < 16u; i++) {
buf[i] = (i << 4) ^ 0x55aa00ffu;
}
for (i = 0; i < 128u; i++) {
/* Slot index wraps every 16 iterations. */
u32 idx = i & 15u;
u32 a = buf[idx];
u32 b = a + prev + i;
/* Store to the slot that was just loaded. */
buf[idx] = b;
{
/* Load the neighbouring slot, which is the one the next iteration will load and overwrite. */
u32 c = buf[(idx + 1u) & 15u];
prev = c + b;
}
acc = acc + prev;
}
bench_write(0u, acc);
bench_write(1u, prev);
bench_write(2u, buf[0]);
bench_write(3u, buf[15]);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Load-store interaction with short reuse distance.
- Targets ordering and load-after-store behavior.
16) dual_chain
#include "common.h"
/*
 * dual_chain: 256 iterations over two chains, (a, b) and (c, d). Inside each
 * chain the second statement reads the result of the first; the chains share
 * only the loop index i.
 * Signature: [0]=a^b, [1]=c^d, [2]=a+c, [3]=b+d.
 *
 * NOTE(review): the body multiplies i by 3 and by 5. If the target is plain
 * RV32I (as the harness header guard name suggests) there is no multiply
 * instruction, so the generated code depends on how the compiler lowers these
 * (shift/add, a running counter, or a library call); confirm in the disassembly.
 */
void bench_main(void) {
u32 a = 0x10203040u;
u32 b = 0x55667788u;
u32 c = 0x89abcdefu;
u32 d = 0x0f1e2d3cu;
u32 i;
for (i = 0; i < 256u; i++) {
/* Chain 0: b reads the a written on the line above. */
a = a + (b ^ (i * 3u + 1u));
b = b + ((a >> 2) ^ 0x1f1f1f1fu);
/* Chain 1: d reads the c written on the line above. */
c = c ^ (d + (i * 5u + 7u));
d = d + ((c << 1) ^ 0x00ff00ffu);
}
/* The two chains are combined only here, when forming the signature words. */
bench_write(0u, a ^ b);
bench_write(1u, c ^ d);
bench_write(2u, a + c);
bench_write(3u, b + d);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Two arithmetic chains with moderate coupling.
- Good for dual-lane and scheduler fairness studies.
17) mem_ring
#include "common.h"
/*
 * mem_ring: 192 iterations over a 16-word ring; each iteration performs three
 * loads (slots idx0, idx0+5, idx0+1, all modulo 16) and one store (slot idx0).
 * Signature: [0]=acc, [1]=mix, [2]=ring[0], [3]=ring[15].
 */
void bench_main(void) {
/* Ring begins 0x180 bytes past OUT_BASE_ADDR. */
volatile u32 *ring = (volatile u32 *)(OUT_BASE_ADDR + 0x180u);
u32 acc = 0x89abcdefu;
u32 mix = 0x13579bdfu;
u32 pat = 0u;
u32 i;
/* Seed: pat steps by 0x01010101 per slot and is xored with a fixed mask. */
for (i = 0; i < 16u; i++) {
ring[i] = pat ^ 0x55aa33ccu;
pat = pat + 0x01010101u;
}
for (i = 0; i < 192u; i++) {
/* idx0 wraps every 16 iterations; idx1 is five slots ahead of it. */
u32 idx0 = i & 15u;
u32 idx1 = (idx0 + 5u) & 15u;
u32 a = ring[idx0];
u32 b = ring[idx1];
u32 n = a + b + mix + i;
/* Overwrite the slot that supplied a. */
ring[idx0] = n;
/* Shift amount cycles through 0..3. */
mix = mix ^ (n >> (i & 3u));
/* Third load of the iteration: the slot that the next iteration will use as idx0. */
acc = acc + mix + ring[(idx0 + 1u) & 15u];
}
bench_write(0u, acc);
bench_write(1u, mix);
bench_write(2u, ring[0]);
bench_write(3u, ring[15]);
bench_halt();
/* Park here instead of returning: _start entered bench_main with a jump, not a call. */
for (;;) {
}
}
Why it exists:
- Ring-buffer memory pattern with offset reads.
- Stresses memory index arithmetic plus recurrent data reuse.
Kernel Families and What They Measure
- Dependency-heavy ALU: `alu_chain`, `dep_hazard`, `p2o_low_ilp_chain`
- Medium/high ILP arithmetic: `ilp_mix`, `dual_chain`, `p2o_mid_ilp_dual`, `p2o_high_ilp4`, `p2s_clean_ilp2`
- Memory interaction: `mem_stream`, `mem_ring`, `load_store_hazard`, `p2s_lane_dep_mem`, `p2o_mem_overlap`
- Control stress: `branch_mix`, `branch_wave`, `p2s_ctrl_low_ilp`
- Superscalar lane interaction focus: `p2s_clean_ilp2`, `p2s_lane_dep_alu`, `p2s_lane_dep_mem`
- OOO extraction focus: `p2o_low_ilp_chain`, `p2o_mid_ilp_dual`, `p2o_high_ilp4`, `p2o_mem_overlap`
Why This Mix Is Useful
This kernel set is not random. It spans:
- low ILP to high ILP
- ALU-dominant to memory-dominant loops
- low-control to high-control patterns
- friendly and adversarial conditions for dual-issue and OOO scheduling
That spread is what makes pairwise variant comparison meaningful.