
Commit 4830758

Merge pull request #711 from sysprog21/block-cache
Add L1 block cache and direct-mapped BHT
2 parents: 4b088c1 + f89a114

5 files changed: +147 additions, −66 deletions


src/decode.h

Lines changed: 12 additions & 23 deletions
@@ -366,49 +366,38 @@ _Static_assert(offsetof(opcode_fuse_t, opcode) == 7,
                "opcode_fuse_t.opcode must be at offset 7");
 
 #define HISTORY_SIZE 16
+/* Direct-mapped BHT requires power-of-2 size for mask calculation */
+_Static_assert((HISTORY_SIZE & (HISTORY_SIZE - 1)) == 0,
+               "HISTORY_SIZE must be a power of 2");
+
 typedef struct {
-    uint32_t PC[HISTORY_SIZE];
+    uint32_t PC[HISTORY_SIZE]; /**< PC tags for direct-mapped lookup */
 #if !RV32_HAS(JIT)
-    uint8_t idx;
-    struct rv_insn *target[HISTORY_SIZE];
+    struct rv_insn *target[HISTORY_SIZE]; /**< target IR pointers */
 #else
-    uint32_t times[HISTORY_SIZE];
+    uint32_t times[HISTORY_SIZE]; /**< access counts for JIT hotness */
 #if RV32_HAS(SYSTEM)
-    uint32_t satp[HISTORY_SIZE];
+    uint32_t satp[HISTORY_SIZE]; /**< SATP for address space matching */
 #endif
 #endif
 } branch_history_table_t;
 
 #if RV32_HAS(JIT)
 /* Find index with maximum times count in branch history table.
  * Used by JIT to identify the most frequently taken indirect jump target.
+ * Note: With direct-mapped BHT, zeros can appear at any index, so we must
+ * scan all entries (cannot break early on first zero).
  */
 static inline int bht_find_max_idx(const branch_history_table_t *bt)
 {
     int max_idx = 0;
-    for (int i = 0; i < HISTORY_SIZE; i++) {
-        if (!bt->times[i])
-            break;
-        if (bt->times[max_idx] < bt->times[i])
+    for (int i = 1; i < HISTORY_SIZE; i++) {
+        if (bt->times[i] > bt->times[max_idx])
             max_idx = i;
     }
     return max_idx;
 }
 
-/* Find index with minimum times count for LFU replacement.
- * Returns first empty slot if available, otherwise the least frequently used.
- */
-static inline int bht_find_min_idx(const branch_history_table_t *bt)
-{
-    int min_idx = 0;
-    for (int i = 0; i < HISTORY_SIZE; i++) {
-        if (!bt->times[i])
-            return i; /* empty slot found */
-        if (bt->times[min_idx] > bt->times[i])
-            min_idx = i;
-    }
-    return min_idx;
-}
 #endif
 
 typedef struct rv_insn {
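
The replacement-policy change is easiest to see in isolation. Below is a standalone sketch (illustrative only, not part of the commit; bht_index is a hypothetical helper) of the direct-mapped index computation, and of why the max-scan above can no longer break at the first zero: with direct mapping, empty slots may sit between live ones.

    #include <stdint.h>
    #include <stdio.h>

    #define HISTORY_SIZE 16 /* power of 2, as the _Static_assert enforces */

    /* Map a PC to its unique slot: the low bits above the alignment bits
     * select the entry, so lookup and replacement need no search loop. */
    static inline uint32_t bht_index(uint32_t pc)
    {
        return (pc >> 2) & (HISTORY_SIZE - 1);
    }

    int main(void)
    {
        uint32_t times[HISTORY_SIZE] = {0};

        /* 0x8000 and 0x8040 collide in slot 0; 0x8004 lands in slot 1. */
        times[bht_index(0x8000)] += 5;
        times[bht_index(0x8004)] += 9;
        times[bht_index(0x8040)] += 2;

        /* Slots 2..15 remain zero, yet slot 1 holds the hottest entry.
         * Breaking at the first zero would miss live entries whenever an
         * empty slot precedes them, hence the full scan in
         * bht_find_max_idx. */
        int max_idx = 0;
        for (int i = 1; i < HISTORY_SIZE; i++)
            if (times[i] > times[max_idx])
                max_idx = i;

        printf("hottest slot: %d (count %u)\n", max_idx,
               (unsigned) times[max_idx]);
        return 0;
    }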

src/emulate.c

Lines changed: 45 additions & 6 deletions
@@ -530,8 +530,18 @@ static block_t *block_alloc(riscv_t *rv)
 }
 
 #if !RV32_HAS(JIT)
+/* Update L1 direct-mapped block cache.
+ * Called after block insertion to enable fast lookup for hot paths.
+ */
+static inline void block_l1_update(riscv_t *rv, block_t *block)
+{
+    uint32_t idx = (block->pc_start >> BLOCK_L1_INDEX_SHIFT) & BLOCK_L1_MASK;
+    rv->block_l1.tags[idx] = block->pc_start;
+    rv->block_l1.ptrs[idx] = block;
+}
+
 /* insert a block into block map */
-static void block_insert(block_map_t *map, const block_t *block)
+static void block_insert(block_map_t *map, riscv_t *rv, const block_t *block)
 {
     assert(map && block);
     const uint32_t mask = map->block_capacity - 1;
@@ -545,6 +555,9 @@ static void block_insert(block_map_t *map, const block_t *block)
         }
     }
     map->size++;
+
+    /* update L1 cache for fast subsequent lookups */
+    block_l1_update(rv, (block_t *) block);
 }
 
 /* try to locate an already translated block in the block map */
@@ -565,6 +578,32 @@ static block_t *block_find(const block_map_t *map, const uint32_t addr)
     }
     return NULL;
 }
+
+/* Fast block lookup using L1 direct-mapped cache.
+ * Falls back to hash table on L1 miss.
+ * This is the hot path - optimized for tight loops.
+ *
+ * Separated arrays: tag array checked first (1KB), pointer loaded on hit.
+ * Benchmarked faster than interleaved on x86-64.
+ */
+static inline block_t *block_lookup_or_find(riscv_t *rv, uint32_t pc)
+{
+    /* L1 cache lookup - check tag first (avoids loading pointer on miss) */
+    uint32_t idx = (pc >> BLOCK_L1_INDEX_SHIFT) & BLOCK_L1_MASK;
+    if (likely(rv->block_l1.tags[idx] == pc))
+        return rv->block_l1.ptrs[idx];
+
+    /* L1 miss - fall back to hash table lookup */
+    block_t *block = block_find(&rv->block_map, pc);
+
+    /* Populate L1 cache on hash table hit for future lookups */
+    if (block) {
+        rv->block_l1.tags[idx] = pc;
+        rv->block_l1.ptrs[idx] = block;
+    }
+
+    return block;
+}
 #endif
 
 #if !RV32_HAS(EXT_C)
@@ -1808,8 +1847,8 @@ static block_t *block_find_or_translate(riscv_t *rv)
 {
 #if !RV32_HAS(JIT)
     block_map_t *map = &rv->block_map;
-    /* lookup the next block in the block map */
-    block_t *next_blk = block_find(map, rv->PC);
+    /* lookup the next block using L1 cache with hash table fallback */
+    block_t *next_blk = block_lookup_or_find(rv, rv->PC);
 #else
     /* lookup the next block in the block cache */
     block_t *next_blk = (block_t *) cache_get(rv->block_cache, rv->PC, true);
@@ -1861,8 +1900,8 @@ static block_t *block_find_or_translate(riscv_t *rv)
 #endif
 
 #if !RV32_HAS(JIT)
-    /* insert the block into block map */
-    block_insert(&rv->block_map, next_blk);
+    /* insert the block into block map and L1 cache */
+    block_insert(&rv->block_map, rv, next_blk);
 #else
     list_add(&next_blk->list, &rv->block_list);
 
@@ -2096,7 +2135,7 @@ void rv_step(void *arg)
     if (prev && prev->pc_start != last_pc) {
         /* update previous block */
 #if !RV32_HAS(JIT)
-        prev = block_find(&rv->block_map, last_pc);
+        prev = block_lookup_or_find(rv, last_pc);
 #else
         prev = cache_get(rv->block_cache, last_pc, false);
 #endif
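
As a minimal model of the two-level lookup introduced in this file, the sketch below reproduces the fast-path/fallback flow with simplified types; slow_find stands in for the real block_find, and the constants mirror BLOCK_L1_SIZE and BLOCK_L1_INDEX_SHIFT under the assumption of word-aligned PCs.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define L1_SIZE 256
    #define L1_MASK (L1_SIZE - 1)
    #define L1_SHIFT 2 /* word-aligned PCs; the commit uses 1 under EXT_C */

    typedef struct {
        uint32_t pc_start; /* translated-block payload omitted */
    } block_t;

    typedef struct {
        uint32_t tags[L1_SIZE];
        block_t *ptrs[L1_SIZE];
    } l1_cache_t;

    /* Stand-in for the hash-table path (block_find in the real code). */
    static block_t table[2] = {{0x10000}, {0x10004}};

    static block_t *slow_find(uint32_t pc)
    {
        for (size_t i = 0; i < 2; i++)
            if (table[i].pc_start == pc)
                return &table[i];
        return NULL;
    }

    /* Two-level lookup: O(1) tag probe first; the slow path fills the slot. */
    static block_t *lookup(l1_cache_t *l1, uint32_t pc)
    {
        uint32_t idx = (pc >> L1_SHIFT) & L1_MASK;
        if (l1->tags[idx] == pc) /* fast path: one compare, one load */
            return l1->ptrs[idx];

        block_t *b = slow_find(pc); /* miss: fall back to the hash table */
        if (b) { /* populate so the next lookup of this pc hits in L1 */
            l1->tags[idx] = pc;
            l1->ptrs[idx] = b;
        }
        return b;
    }

    int main(void)
    {
        static l1_cache_t l1;
        for (int i = 0; i < L1_SIZE; i++)
            l1.tags[i] = 0xFFFFFFFFu; /* invalid-tag sentinel, as in the commit */

        lookup(&l1, 0x10000);              /* miss -> hash table -> fill */
        block_t *b = lookup(&l1, 0x10000); /* hit: tag probe only */
        printf("second lookup: %s\n", b ? "L1 hit" : "miss");
        return 0;
    }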

src/riscv.c

Lines changed: 13 additions & 0 deletions
@@ -80,6 +80,14 @@ void block_map_clear(riscv_t *rv)
         map->map[i] = NULL;
     }
     map->size = 0;
+
+    /* clear L1 direct-mapped block cache - use invalid tags to avoid
+     * false hits on PC=0 edge case. Separated arrays: tags first for
+     * cache-efficient miss detection, ptrs zeroed with memset.
+     */
+    for (int i = 0; i < BLOCK_L1_SIZE; i++)
+        rv->block_l1.tags[i] = BLOCK_L1_INVALID_TAG;
+    memset(rv->block_l1.ptrs, 0, sizeof(rv->block_l1.ptrs));
 }
 
 static void block_map_destroy(riscv_t *rv)
@@ -872,6 +880,11 @@ riscv_t *rv_create(riscv_user_t rv_attr)
 #if !RV32_HAS(JIT)
     /* initialize the block map */
     block_map_init(&rv->block_map, BLOCK_MAP_CAPACITY_BITS);
+
+    /* initialize L1 block cache with invalid tags */
+    for (int i = 0; i < BLOCK_L1_SIZE; i++)
+        rv->block_l1.tags[i] = BLOCK_L1_INVALID_TAG;
+    memset(rv->block_l1.ptrs, 0, sizeof(rv->block_l1.ptrs));
 #else
     INIT_LIST_HEAD(&rv->block_list);
     rv->jit_state = jit_state_init(CODE_CACHE_SIZE);
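
Why not simply memset the tag array to zero? A small self-contained demonstration of the PC=0 edge case mentioned in the comment (assumes nothing beyond the constants above):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SIZE 256
    #define INVALID_TAG 0xFFFFFFFFu /* never a valid (aligned) RISC-V PC */

    int main(void)
    {
        uint32_t tags[SIZE];

        /* Zero-initialized tags would make every empty slot "match" PC 0,
         * handing the dispatcher a NULL block pointer. */
        memset(tags, 0, sizeof(tags));
        printf("PC 0 with zeroed tags:   %s\n",
               tags[0] == 0 ? "false hit" : "miss");

        /* The sentinel has its low bits set, so no word- or half-word-
         * aligned PC can equal it - every slot reliably misses until
         * filled. */
        for (int i = 0; i < SIZE; i++)
            tags[i] = INVALID_TAG;
        printf("PC 0 with sentinel tags: %s\n",
               tags[0] == 0 ? "false hit" : "miss");
        return 0;
    }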

src/riscv_private.h

Lines changed: 54 additions & 11 deletions
@@ -188,29 +188,72 @@ typedef struct {
     block_t **map; /**< block map */
 } block_map_t;
 
+/* L1 direct-mapped block cache for fast block lookup.
+ * Inspired by rvdbt's tcache.h design.
+ * Expected gain: 5-15% by avoiding hash table lookup for hot loops.
+ *
+ * Design: Separated tag/pointer arrays for cache efficiency.
+ * - Tag array (1KB) checked first - fits in L1, good for miss path
+ * - Pointer array (2KB) loaded only on hit
+ * - Benchmarked faster than interleaved on x86-64 (+11.9% vs +8.8%)
+ */
+#define BLOCK_L1_SIZE 256
+#define BLOCK_L1_MASK (BLOCK_L1_SIZE - 1)
+
+/* Index shift for L1 cache lookup.
+ * With EXT_C: PCs can be half-word aligned, shift by 1 to use bit 1.
+ * Without EXT_C: PCs are word-aligned, shift by 2.
+ * Using correct shift reduces conflict misses in compressed code.
+ */
+#if RV32_HAS(EXT_C)
+#define BLOCK_L1_INDEX_SHIFT 1
+#else
+#define BLOCK_L1_INDEX_SHIFT 2
+#endif
+
+/* Cache line size for alignment (typical x86/Arm64). */
+#define CACHE_LINE_SIZE 64
+
+/* Invalid tag sentinel - guaranteed never to match a valid PC.
+ * Valid RISC-V PCs are word-aligned (or half-word for C extension),
+ * so a value with low bits set is always invalid.
+ */
+#define BLOCK_L1_INVALID_TAG 0xFFFFFFFFu
+
+/* L1 block cache with separated arrays for cache efficiency.
+ * Tag array checked first (1KB), pointer loaded only on hit (2KB).
+ * Separated layout benchmarked faster than interleaved on x86-64.
+ */
+typedef struct {
+    uint32_t tags[BLOCK_L1_SIZE]; /**< PC tags for fast comparison */
+    block_t *ptrs[BLOCK_L1_SIZE]; /**< block pointers, loaded on tag hit */
+} block_l1_cache_t;
+
 /* clear all block in the block map */
 void block_map_clear(riscv_t *rv);
 
 struct riscv_internal {
-    bool halt; /* indicate whether the core is halted */
+    bool halt; /**< indicate whether the core is halted */
 
-    /* integer registers */
-    /*
-     * Aarch64 encoder only accepts 9 bits signed offset. Do not put this
-     * structure below the section.
-     */
+    /* Integer registers - Aarch64 encoder needs 9-bit signed offset access */
     riscv_word_t X[N_RV_REGS];
     riscv_word_t PC;
 
-    uint64_t timer; /* strictly increment timer */
+    uint64_t timer; /**< strictly increment timer */
 
 #if RV32_HAS(SYSTEM)
-    /* is_trapped must be within 256-byte offset for ARM64 JIT access.
-     * Placed early in struct to ensure accessibility from JIT-generated code.
-     */
+    /* is_trapped must be within 256-byte offset for ARM64 JIT access */
     bool is_trapped;
 #endif
 
+#if !RV32_HAS(JIT)
+    /* L1 block cache - tag/pointer separation for cache efficiency.
+     * Tags checked first (1KB), pointers loaded only on hit (2KB).
+     * Placed near hot fields for interpreter fast path.
+     */
+    block_l1_cache_t block_l1 __ALIGNED(CACHE_LINE_SIZE);
+#endif
+
 #if RV32_HAS(JIT) && RV32_HAS(SYSTEM)
     /*
      * Aarch64 encoder only accepts 9 bits signed offset. Do not put this
@@ -270,7 +313,7 @@ struct riscv_internal {
 
     bool compressed; /**< current instruction is compressed or not */
 #if !RV32_HAS(JIT)
-    block_map_t block_map; /**< basic block map */
+    block_map_t block_map; /**< basic block map (fallback on L1 miss) */
 #else
     struct cache *block_cache;
     struct list_head block_list; /**< list of all translated blocks */
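
The 1KB/2KB figures quoted in these comments follow directly from the constants; a compile-time sketch (assuming LP64 pointers, as on the x86-64 benchmark host) verifies the arithmetic:

    #include <stdint.h>

    #define BLOCK_L1_SIZE 256

    typedef struct block block_t; /* opaque here; fully defined in the header */

    typedef struct {
        uint32_t tags[BLOCK_L1_SIZE];
        block_t *ptrs[BLOCK_L1_SIZE];
    } block_l1_cache_t;

    /* 256 x 4-byte tags = 1 KiB: sixteen 64-byte cache lines holding 16
     * tags each, so the whole tag array sits comfortably in L1D and a
     * miss touches exactly one line. */
    _Static_assert(sizeof(((block_l1_cache_t *) 0)->tags) == 1024,
                   "tag array is 1 KiB");

    /* 256 x 8-byte pointers = 2 KiB on LP64 hosts; loaded only on a hit. */
    _Static_assert(sizeof(void *) != 8 ||
                       sizeof(((block_l1_cache_t *) 0)->ptrs) == 2048,
                   "pointer array is 2 KiB on 64-bit hosts");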

src/rv32_template.c

Lines changed: 23 additions & 26 deletions
@@ -104,7 +104,7 @@ RVOP(jal, {
 #if !RV32_HAS(JIT)
 #define LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE() \
     /* \
-     * lookup branch history table \
+     * Direct-mapped branch history table lookup. \
      * \
      * When handling trap, the branch history table should not be lookup since \
      * it causes return from the trap_handler. \
@@ -116,20 +116,18 @@ RVOP(jal, {
     { \
         IIF(RV32_HAS(SYSTEM)(if (!rv->is_trapped && !reloc_enable_mmu), )) \
         { \
-            for (int i = 0; i < HISTORY_SIZE; i++) { \
-                if (ir->branch_table->PC[i] == PC) { \
-                    MUST_TAIL return ir->branch_table->target[i]->impl( \
-                        rv, ir->branch_table->target[i], cycle, PC); \
-                } \
+            /* Direct-mapped lookup: O(1) instead of O(n) linear search */ \
+            const uint32_t bht_idx = (PC >> 2) & (HISTORY_SIZE - 1); \
+            if (ir->branch_table->PC[bht_idx] == PC && \
+                ir->branch_table->target[bht_idx]) { \
+                MUST_TAIL return ir->branch_table->target[bht_idx]->impl( \
+                    rv, ir->branch_table->target[bht_idx], cycle, PC); \
             } \
             block_t *block = block_find(&rv->block_map, PC); \
             if (block) { \
-                /* update branch history table */ \
-                ir->branch_table->PC[ir->branch_table->idx] = PC; \
-                ir->branch_table->target[ir->branch_table->idx] = \
-                    block->ir_head; \
-                ir->branch_table->idx = \
-                    (ir->branch_table->idx + 1) % HISTORY_SIZE; \
+                /* Direct replacement at computed index */ \
+                ir->branch_table->PC[bht_idx] = PC; \
+                ir->branch_table->target[bht_idx] = block->ir_head; \
                 MUST_TAIL return block->ir_head->impl(rv, block->ir_head, \
                                                       cycle, PC); \
             } \
@@ -141,23 +139,22 @@ RVOP(jal, {
     { \
         block_t *block = cache_get(rv->block_cache, PC, true); \
         if (block) { \
-            for (int i = 0; i < HISTORY_SIZE; i++) { \
-                if (ir->branch_table->PC[i] == PC) { \
-                    IIF(RV32_HAS(SYSTEM))( \
-                        if (ir->branch_table->satp[i] == rv->csr_satp), ) \
-                    { \
-                        ir->branch_table->times[i]++; \
-                        if (cache_hot(rv->block_cache, PC)) \
-                            goto end_op; \
-                    } \
+            /* Direct-mapped lookup: O(1) instead of O(n) linear search */ \
+            const uint32_t bht_idx = (PC >> 2) & (HISTORY_SIZE - 1); \
+            if (ir->branch_table->PC[bht_idx] == PC) { \
+                IIF(RV32_HAS(SYSTEM))( \
+                    if (ir->branch_table->satp[bht_idx] == rv->csr_satp), ) \
+                { \
+                    ir->branch_table->times[bht_idx]++; \
+                    if (cache_hot(rv->block_cache, PC)) \
+                        goto end_op; \
                 } \
             } \
-            /* update branch history table using LFU replacement */ \
-            int min_idx = bht_find_min_idx(ir->branch_table); \
-            ir->branch_table->times[min_idx] = 1; \
-            ir->branch_table->PC[min_idx] = PC; \
+            /* Direct replacement at computed index */ \
+            ir->branch_table->times[bht_idx] = 1; \
+            ir->branch_table->PC[bht_idx] = PC; \
             IIF(RV32_HAS(SYSTEM))( \
-                ir->branch_table->satp[min_idx] = rv->csr_satp, ); \
+                ir->branch_table->satp[bht_idx] = rv->csr_satp, ); \
            if (cache_hot(rv->block_cache, PC)) \
                 goto end_op; \
             MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, \
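
The cost of dropping LFU is that aliasing targets now evict each other. Below is a standalone sketch (bht_update is hypothetical, illustrative only) of the ping-pong worst case this trades away in exchange for O(1) updates:

    #include <stdint.h>
    #include <stdio.h>

    #define HISTORY_SIZE 16

    static uint32_t bht_pc[HISTORY_SIZE];

    /* Direct replacement: the incoming target always wins its slot, so a
     * hot entry can be evicted by an aliasing PC - the price paid for
     * dropping the O(n) LFU scan that bht_find_min_idx used to do. */
    static void bht_update(uint32_t pc)
    {
        uint32_t idx = (pc >> 2) & (HISTORY_SIZE - 1);
        if (bht_pc[idx] != pc) {
            if (bht_pc[idx]) /* slot occupied by an aliasing target */
                printf("slot %u: 0x%08x evicts 0x%08x\n", (unsigned) idx,
                       (unsigned) pc, (unsigned) bht_pc[idx]);
            bht_pc[idx] = pc;
        }
    }

    int main(void)
    {
        /* 0x1000 and 0x1040 alias: both map to slot 0 when HISTORY_SIZE
         * is 16, so alternating between them evicts on every update. */
        bht_update(0x1000);
        bht_update(0x1040); /* evicts 0x1000 */
        bht_update(0x1000); /* evicts 0x1040 - ping-pong under aliasing */
        return 0;
    }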
