diff options
Diffstat (limited to 'flashimg/cpu_hdmi/tmds_decode.S')
-rw-r--r-- | flashimg/cpu_hdmi/tmds_decode.S | 82 |
1 files changed, 82 insertions, 0 deletions
// flashimg/cpu_hdmi/tmds_decode.S - Decode TMDS-encoded scanlines
//
// Copyright (C) 2025  Luke T. Shumaker <lukeshu@lukeshu.com>
// SPDX-License-Identifier: AGPL-3.0-or-later

.syntax unified
.cpu cortex-m0plus
.thumb

// tmds_table is a LUT from 10-bit TMDS-encoded values to the 8-bit
// decoded values, shifted right 6 bits (so each entry is in 0..3).
// One byte per entry; the entries themselves come from
// tmds_decode_table.h, which invokes X(dec) once per table slot.
.section .scratch_x.tmds_table, "a"
tmds_table:
#define X(dec) .byte (dec)>>6
#include "tmds_decode_table.h"
#undef X

// void tmds_decode_line(uint16_t in_r[width],
//                       uint16_t in_g[width],
//                       uint16_t in_b[width],
//                       uint8_t  out[width/2],
//                       size_t   width);
//
// Read in `width` values from 10-bit-encoded TMDS streams of R/G/B
// channels of 24-bit RGB pixels; and write that out as RGB 2:3:2,
// scaled down horizontally by half.
//
// ABI:   AAPCS32 (Thumb, ARMv6-M).
// In:    r0 = in_r, r1 = in_g, r2 = in_b, r3 = out,
//        [sp] = width (5th argument, passed on the stack).
// Out:   nothing (void); out[0 .. width/2) is written.
// Clobb: r0-r3, flags.  r4-r7 are saved and restored.
// Stack: 20 bytes (r4-r7, lr).
//
// Output byte layout, as built below: red in bits 6:5, green in
// bits 4:3, blue in bits 1:0.  Since the LUT only yields 2 bits per
// channel, bits 7 and 2 are always zero.
//
// `width` must be either 640 or 720.  More precisely, the loop is
// unrolled 4x, so width/2 must be a non-zero multiple of 4; if it is
// not, up to 3 bytes past out[width/2] get written (and width < 2
// still writes 4 bytes).
//
// The input pointers are read one 32-bit word at a time (two
// samples), so in_r/in_g/in_b must be word-aligned.
//
// NOTE(review): the LUT is indexed with the full low halfword of
// each input word, not with a 10-bit-masked value.  This assumes the
// upper 6 bits of every input sample are zero - confirm against the
// producer.  Masking here would cost 3 more cycles per output pixel,
// which does not fit the cycle budget described below.
//
// BUG: Scaling down happens by discarding every other pixel, not by
// interpolation.
.section .scratch_x.tmds_decode_line, "ax"
.p2align 2
.global tmds_decode_line
.type tmds_decode_line,%function
.thumb_func
tmds_decode_line:
        // Of our supported screen formats, the one with the tightest
        // cycles/pixel requirement is 720x480p@60Hz, which has
        // H_active=720, H_total=858, meaning that we need to decode
        // 720 pixels in under 858*10 cycles, giving us a budget of
        // ~11.9 cycles per pixel.

        // Register roles.
#define iR   r0 /* read cursor into in_r                 */
#define iG   r1 /* read cursor into in_g                 */
#define iB   r2 /* read cursor into in_b                 */
#define oPtr r3 /* write cursor into out                 */
#define oEnd r4 /* width, then one-past-the-end of out   */
#define vTab r5 /* base address of tmds_table            */
#define vAcc r6 /* output byte being assembled           */
#define vTmp r7 /* scratch: input word, then LUT result  */

        push    {r4-r7, lr}             // ONCE += 1+5 cyc
        // Only the first four arguments arrive in r0-r3; `width` is
        // the 5th, so it sits on the stack just above the 5 words
        // (20 bytes) that were pushed on the line above.
        ldr     oEnd, [sp, #20]         // ONCE += 2 cyc
        lsrs    oEnd, oEnd, #1          // ONCE += 1 cyc ; width/2 output bytes
        adds    oEnd, oPtr, oEnd        // ONCE += 1 cyc ; oEnd = &out[width/2]
        ldr     vTab, =tmds_table       // ONCE += 2 cyc

.Lnext_group:
.rept 4
        // Each ldmia fetches two 16-bit samples and advances the
        // cursor past both; uxth keeps only the first one, which is
        // what implements the "discard every other pixel" downscale.

        // red -> bits 6:5
        ldmia   iR!, {vTmp}             // REPT += 1+1 cyc
        uxth    vTmp, vTmp              // REPT += 1 cyc
        ldrsb   vTmp, [vTab, vTmp]      // REPT += 2 cyc
        lsls    vAcc, vTmp, #5          // REPT += 1 cyc
        // green -> bits 4:3
        ldmia   iG!, {vTmp}             // REPT += 1+1 cyc
        uxth    vTmp, vTmp              // REPT += 1 cyc
        ldrsb   vTmp, [vTab, vTmp]      // REPT += 2 cyc
        lsls    vTmp, vTmp, #3          // REPT += 1 cyc
        orrs    vAcc, vAcc, vTmp        // REPT += 1 cyc
        // blue -> bits 1:0
        ldmia   iB!, {vTmp}             // REPT += 1+1 cyc
        uxth    vTmp, vTmp              // REPT += 1 cyc
        ldrsb   vTmp, [vTab, vTmp]      // REPT += 2 cyc
        orrs    vAcc, vAcc, vTmp        // REPT += 1 cyc
        // store
        strb    vAcc, [oPtr, #0]        // REPT += 2 cyc
        adds    oPtr, oPtr, #1          // REPT += 1 cyc
.endr
        // Unsigned "lower" rather than "not equal": identical for the
        // supported widths, but the loop still terminates if width/2
        // is not a multiple of the unroll factor.
        cmp     oPtr, oEnd              // LOOP += 1 cyc
        blo     .Lnext_group            // LOOP += 2 cyc ; ONCE -= 1 cyc
        pop     {r4-r7, pc}             // ONCE += 3+5 cyc
        // TOTAL  = ONCE+((720/2)/N)*LOOP+(720/2)*REPT cyc
        //        = 19  +(  360  /4)*  3 +  360  * 22  cyc
        //        = 8209 cyc
        // BUDGET = 8580 cyc
.size tmds_decode_line, . - tmds_decode_line
        // Literal pool for `ldr vTab, =tmds_table`, kept next to the
        // function so it stays within pc-relative load range.
.pool

#undef iR
#undef iG
#undef iB
#undef oPtr
#undef oEnd
#undef vTab
#undef vAcc
#undef vTmp