-
Notifications
You must be signed in to change notification settings - Fork 0
/
iwram_code.s
467 lines (430 loc) · 14.4 KB
/
iwram_code.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
.cpu arm7tdmi
.section ".iwram", "ax", %progbits
.func isr_switchboard
.global isr_switchboard
.type isr_switchboard, %function
.extern __isr_table; @ void (*isr_table[16])(void)
.arm
.Lswitchboard_return:
pop {r0, r3, lr} @ {spsr_irq mixed, lr_irq, lr_usr}
msr cpsr_c, #0x92 @ nesting off, Back in IRQ mode
mov lr, r3
mov r0, r0, ror#16
msr spsr_cf, r0 @ the cf flag specifier will mask out embedded irq index
mov r0, #0x04000000
@ fallthrough to check for more interrupts,
@ to reduce BIOS context switching overhead.
isr_switchboard:
@ mov r0, #0x04000000 @ r0 already has REG_BASE from GBA BIOS
ldr r1, [r0, #0x0208] @ check if IME is cleared (can happen in called code
tst r1, #1 @ or when "timing agrees"). ignore all interrupts if so.
ldrne r1, [r0, #0x0200]! @ 0x04000200: IE, and 0x04000202: IF
andnes r1, r1, lsr #16 @ r1: IE & IF
bxeq lr @ exit if no more request flags to service (or ignored).
rsb r2, r1, #0x00
and r1, r2, r1 @ get lowest set bit with (x & -x)
@ r0: REG_IE, r1: chosen irq bit
strh r1, [r0, #0x2] @ Acknowledge IRQ
ldr r2, [r0, #-0x200-8] @ Acknowledge BIOS's copy of IRQ Flags
orr r2, r2, r1
str r2, [r0, #-0x200-8]!
@ compute isr address.
add r3, r0, #8-0x140 @ r2: REG_BASE-0x140 aka irq_table
@ compute log2(r1) for the table index via a 16 bit de-Bruijn string
adr r0, .Ldebruijn_tab
@ Note: the de-Bruijn string overlaps nesting types by 3 bits,
@ if the cart nesting type bit is 1 that'll make irq types
@ 14, and 15 invalid (which is fine because those don't exist anyway)
ldr r2, =0x0f2d3f79 @ mixed de-Bruijn string and isr nesting types
tst r1, r2 @ ~0x3f79: hblank, vcount, serial
mul r2, r1, r2 @ wider bit pattren on Rs so to be const timed
ldrb r0, [r0, r2, lsr #28]
ldr r12, [r3, r0, lsl #2]
@ r0: irq index, r1/r2: junk, r3: REG_BASE-0x140, r12: isr address
@ nesting type bit == 0: on IRQ stack, returns to BIOS, non nesting, low overhead.
bxeq r12
@ else nesting type bit == 1: prepare for nesting
@ get system clock kept at timer 2 and 3
add r1, r3, #0x140+0x108 @ 0x04000108: Timer 2 and 3 Counters
ldmia r1, {r1, r2} @ fetch hardware timers
add r1, #1 @ correct cycle diffrence between words
bic r1, #0x00ff0000
orr r1, r1, r2, lsl#16
add r3, r3, #(14*4) @ 0x03fffef8: saved 64 bit timer
ldr r2, [r3, #0]
cmp r2, r1 @ if (prev 32 bits)-(current 32 bits) > 0
ldr r2, [r3, #4]
adc r2, r2, #0 @ add that carry bit to high 32-bits
stmia r3, {r1, r2} @ save 64-bit timer for next check
@ push registers.
mrs r3, spsr @ ran out of room so stuff spsr fields
orr r0, r0, r3, ror#16 @ with irq index (8-bit)
mov r3, lr
msr cpsr_c, #0x1f @ switch to user stack, nesting IRQ are now enabled
push {r0, r3, lr} @ {spsr_irq mixed, lr_irq, lr_usr}
and r0, r0, #0x000000ff
adr lr, .Lswitchboard_return
@ r0: irq index, r1/r2: 64-bit system time, r3: lr_irq/0x138, r12: isr address
@ a check on r3 can distinguish between nesting and non nesting modes
@ "if (r3 < 0x4000) then is in nesting mode"
bx r12
@ in total about 5 μs (or 30% hblank) from hardware interrupt
.align
.Ldebruijn_tab:
.byte 0, 1, 8, 2, 14, 9, 11, 3, 15, 7, 13, 10, 6, 12, 5, 4
.pool
.size isr_switchboard,.-.Lswitchboard_return
.endfunc
.func get_system_clock
.global get_system_clock
.type get_system_clock, %function
@ u64 get_system_clock();
.arm
get_system_clock:
mov r12, #0x04000000
@ str r12, [r12, #0x0208]
add r0, r12, #0x108 @ 0x04000108: Timer 2 and 3 Counters
ldmia r0, {r0, r1} @ fetch hardware timers
add r0, #1 @ correct cycle diffrence between words
bic r0, #0x00ff0000
orr r0, r0, r1, lsl#16
add r3, r12, #-0x140+(14*4) @ 0x03fffef8: saved 64 bit timer
ldmia r3, {r1, r2}
cmp r1, r0 @ if (prev 32 bits)-(current 32 bits) > 0
adc r1, r2, #0 @ add that carry bit to high 32-bits
stmia r3, {r0, r1} @ save 64-bit timer for next check
@ mov r2, #1
@ str r12, [r2, #0x0208]
bx lr
.size get_system_clock,.-get_system_clock
.endfunc
.func unlz4_vram
.global unlz4_vram
.type unlz4_vram, %function
@ void unlz4_vram(void* dest, const void* src, size_t src_size);
@ GBA vram has a 8 bit read bus, but only a 16 bit write bus.
@ this routine works around that by keeping track of a halfword write buffer.
@ Use only for trusted input, as this will not check for invalid blocks
@ including things like a 0 byte src length, offsets of 0, read/write out of
@ buffer bounds.
.arm
unlz4_vram:
push {r4-r7, lr}
add r2, r2, r1
@ r0: dest, r1: src, r2: src end, r3: halfword write buffer
@ r4: copy ptr, r5: copy len, r6: token, r7: lsb of dest (aligned flag)
@ r12: read scratch
mov r3, #0
ands r7, r0, #1 @ if dest addr already misaligned
ldrneb r3, [r0, #-1]! @ then read low byte, and round down
b .Lget_token
.Lmatch_offset:
ldrb r4, [r1], #1
ldrb r12, [r1], #1
orr r4, r4, r12, LSL #8 @ read match offset
ands r5, r6, #0x0f
blne .Lget_len
add r5, r5, #4 @ add the min length
cmp r4, #1 @ if offset == 1
beq .Lwrite_run @ then do a run instead of a match copy
sub r4, r0, r4
add r4, r4, r7 @ copy ptr = dest + dest_flag - offset
bl .Lcopy_data
.Lget_token:
ldrb r6, [r1], #1
lsrs r5, r6, #4
blne .Lget_len @ "not equal" will skip literals if token == 0
mov r4, r1 @ .Lget_len normaly exits with Z=0, but
blne .Lcopy_data @ if it got skiped, then Z is left as 1
mov r1, r4
cmp r1, r2 @ check if at end of compressed stream
blt .Lmatch_offset
.Lend:
tst r7, #1 @ if a byte remains in buffer
ldrneb r12, [r0, #1] @ then load the upper dest byte,
orrne r3, r3, r12, LSL #8
strneh r3, [r0], #1 @ store it, and point dest to byte end
pop {r4-r7}
pop {lr}
bx lr
.Lget_len:
cmp r5, #15 @ if len == 15, then add next byte
.Lextend_len_loop:
bxne lr @ done if len != 15 or 255.
ldrb r12, [r1], #1
add r5, r5, r12
cmp r12, #255 @ if new byte == 255, add more.
b .Lextend_len_loop
.Lcopy_data:
ldrb r12, [r4], #1 @ read from literals/match offset
orr r3, r3, r12, LSL #8 @ into the write buffer.
eors r7, r7, #1 @ add 1 to dest + dest_flag
streqh r3, [r0], #2 @ if aligned write halfword
lsr r3, r3, #8 @ make room for next byte read
subs r5, r5, #1
bne .Lcopy_data
bx lr
.Lwrite_run:
orr r12, r3, r3, LSL #8
.Lrun_loop:
eors r7, r7, #1
streqh r12, [r0], #2
subs r5, r5, #1
bne .Lrun_loop
b .Lget_token
.size unlz4_vram,.-unlz4_vram
.endfunc
.func load_default_palette
.global load_default_palette
.type load_default_palette, %function
@ void load_default_palette(u16* memory);
@ print("""GIMP Palette
@ Name: RGBI Arranged.
@ Columns: 16
@ #""")
@ for i in range(256):
@ print(((((i>>4)&0xc)+i)&0xf)*17, (i&0xf)*17, ((((i>>2)&0xc)+i)&0xf)*17)
.arm
load_default_palette:
mov r12, #256*2
ldr r2, =0x781e63d8
ldr r3, =0x42107bde
1:
subs r12, #2 @ r12: .... ...7 6543 210.
mov r1, r12, lsl #26 @ 4321 0... .... .... .... .... .... ....
orr r1, r12, lsr #4 @ 4321 0... .... .... .... .... ...7 6543
orr r1, r1, lsl #12 @ 4321 0... .... ...7 6543 .... ...7 6543
orr r1, r1, lsr #21 @ 4321 0... .... ...7 6543 .432 10.7 6543
orr r1, r12, lsl #16 @ 4321 0..7 6543 2107 6543 .432 10.7 6543
and r1, r2 @ .321 0... ...3 210. .54. ..32 10.7 6...
add r1, r1, ror #16 @ + .54. ..32 10.7 6... .321 0... ...3 210.
@ xBBB B.GG GGxR RRR. xBBB B.GG GGxR RRR.
and r1, r3 @ .b.. ..g. ...r .... .BBB B.GG GG.R RRR.
orr r1, r1, lsr #20 @ .BBB BbGG GGgR RRRr
strh r1, [r0, r12]
bge 1b @ flags set by "subs" way above
bx lr
.pool
.align
.size load_default_palette,.-load_default_palette
.endfunc
.func load_default_8bit_bitmap
.global load_default_8bit_bitmap
.type load_default_8bit_bitmap, %function
@ void load_default_8bit_bitmap(void* memory);
.arm
load_default_8bit_bitmap:
push {r4-r5}
mov r12, #1
2:
ldr r1, =0x0c0c0c0c
mov r2, r1
mov r3, r1
sub r4, r1, #0x01000000
tst r12, #0x01
subeq r2, r2, #0x01000000
subeq r4, r4, #0x01000000
tst r12, #0x07
bne 3f
tst r12, #0x0f
ldreq r1, =0x0a0b0a0b
ldrne r1, =0x0b0c0b0c
mov r2, r1
mov r3, r1
moveq r4, r1
subne r4, r1, #0x01000000
3:
@ mov r5, #16
mov r5, #15
1:
stmia r0!, {r1-r4}
subs r5, #1
bne 1b
add r12, #1
@ cmp r12, #248
cmp r12, #160
ble 2b
pop {r4-r5}
bx lr
.pool
.align
.size load_default_8bit_bitmap,.-load_default_8bit_bitmap
.endfunc
.func load_default_16bit_bitmap
.global load_default_16bit_bitmap
.type load_default_16bit_bitmap, %function
@ void load_default_16bit_bitmap(void* memory);
@ writes a particular sequence of 60 KiB that's a gray grid
.arm
load_default_16bit_bitmap:
push {r4-r8, r10}
mov r12, #1
2:
tst r12, #0x07
ldreq r1, .Lgray_25_gray_23
ldrne r1, .Lgray_25_gray_25
tst r12, #0x0f
ldreq r1, .Lgray_23_gray_21
mov r2, r1
mov r3, r1
mov r4, r1
mov r5, r1
mov r6, r1
mov r7, r1
mov r8, r1
beq 3f
tst r12, #0x01
ldr r8, .Lgray_25_gray_23
moveq r4, r8
ldreq r8, .Lgray_25_gray_21
3:
@ mov r10, #12
mov r10, #15
1:
stmia r0!, {r1-r8}
subs r10, #1
bne 1b
add r12, #1
cmp r12, #160
ble 2b
pop {r4-r8, r10}
bx lr
.Lgray_25_gray_25:
.word 0x67396739
.Lgray_25_gray_23:
.word 0x5ef76739
.Lgray_25_gray_21:
.word 0x56b56739
.Lgray_23_gray_21:
.word 0x56b55ef7
.size load_default_16bit_bitmap,.-load_default_16bit_bitmap
.endfunc
.func plot_pixel
.global plot_pixel
.type plot_pixel, %function
@ struct pixel_plot_surface {
@ volatile void* buffer;
@ // bits are laied out all in little endian order. the top-left pixel is lsb.
@ u32 bpp; // bits to move right 1 pixel, must be 1, 2, 4, 8, or 16
@ u32 pitch; // bytes to move down 1 pixel
@ u32 width; // in pixels. exceeding this counts as a tile movement
@ u32 height; // ...
@ u32 tile_pitch_h; // bytes to move right 1 tile
@ u32 tile_pitch_v; // bytes to move down 1 tile
@ bool vram_bus; // buffer is pointing to VRAM which can't take byte writes
@ };
@ void plot_pixel (const pixel_plot_surface* surface, u32 x, u32 y, u32 color);
.arm
plot_pixel:
push {r4-r10, lr}
ldmia r0, {r4-r10, r12} @ r4: buffer / offset
cmp r5, #8
movhi r12, #1 @ bool write_halfword = (vram_bus || (bpp > 8))
mov r0, r1
movs r1, r7 @ flag for next branch
mov r7, r2
push {r3, r12}
moveq r1, r0 @ if (width == 0) skip divmod, and just use x
beq 1f
bl __aeabi_uidivmod
mla r4, r0, r9, r4 @ buf += (x / tile_width) * tile_pitch_h
1:
mul r2, r1, r5
and r9, r2, #7 @ offset_bits = ((x % tile_width) * bpp) % 8
add r4, r4, r2, lsr#3 @ buf += ((x % tile_width) * bpp) >> 3
mov r0, r7
movs r1, r8
moveq r1, r0 @ if (height == 0) skip divmod, and just use y
beq 1f
bl __aeabi_uidivmod
mla r4, r0, r10, r4 @ buf += (y / tile_height) * tile_pitch_v
1:
mla r4, r1, r6, r4 @ buf += (y % tile_height) * pitch
pop {r3, r12}
cmp r12, #0
andne r0, r4, #1
orrne r9, r9, r0, lsl#3 @ offset_bits |= (offset & 1) << 3;
eorne r4, r4, r0 @ offset &= ~1;
ldreqb r0, [r4]
ldrneh r0, [r4]
mov r1, #1
rsb r1, r1, r1, lsl r5 @ bitmask = (1 << bpp)-1
bic r0, r0, r1, lsl r9
and r3, r3, r1
orr r0, r0, r3, lsl r9
streqb r0, [r4]
strneh r0, [r4]
pop {r4-r10, lr}
bx lr
.size plot_pixel,.-plot_pixel
.endfunc
.func read_pixel
.global read_pixel
.type read_pixel, %function
@ u32 read_pixel (const pixel_plot_surface* surface, u32 x, u32 y);
.arm
read_pixel:
push {r4-r10, lr}
ldmia r0, {r4-r10, r12} @ r4: buffer / offset
cmp r5, #8
movhi r12, #1 @ bool write_halfword = (vram_bus || (bpp > 8))
mov r0, r1
movs r1, r7 @ flag for next branch
mov r7, r2
push {r12}
moveq r1, r0 @ if (width == 0) skip divmod, and just use x
beq 1f
bl __aeabi_uidivmod
mla r4, r0, r9, r4 @ buf += (x / tile_width) * tile_pitch_h
1:
mul r2, r1, r5
and r9, r2, #7 @ offset_bits = ((x % tile_width) * bpp) % 8
add r4, r4, r2, lsr#3 @ buf += ((x % tile_width) * bpp) >> 3
mov r0, r7
movs r1, r8
moveq r1, r0 @ if (height == 0) skip divmod, and just use y
beq 1f
bl __aeabi_uidivmod
mla r4, r0, r10, r4 @ buf += (y / tile_height) * tile_pitch_v
1:
mla r4, r1, r6, r4 @ buf += (y % tile_height) * pitch
pop {r12}
cmp r12, #0
andne r0, r4, #1
orrne r9, r9, r0, lsl#3 @ offset_bits |= (offset & 1) << 3;
eorne r4, r4, r0 @ offset &= ~1;
ldreqb r0, [r4]
ldrneh r0, [r4] @ val = (vram_bus) ? (u16*)*buf : (u8*)*buf;
mov r1, #1
rsb r1, r1, r1, lsl r5 @ bitmask = (1 << bpp)-1
and r0, r1, r0, lsr r9 @ return (val >> offset_bits) & ((1 << bpp)-1)
pop {r4-r10, lr}
bx lr
.size read_pixel,.-read_pixel
.endfunc
.func log2
.global log2
.type log2, %function
@ u32 log2(u32 val);
.thumb
log2:
neg r1, r0
beq .Lreturn_nan
and r0, r1
ldr r1, =0x077CB531
mul r0, r1
lsr r0, #27
adr r1, .Llog2_debruijn_table
ldrb r0, [r1, r0]
bx lr @ return table[((x & -x) * 0x077CB531) >> 27]
.Lreturn_nan:
sub r0, #1 @ return -1
bx lr
.align
.Llog2_debruijn_table:
.byte 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8
.byte 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
.pool
.size log2,.-log2
.endfunc
.end