diff options
Diffstat (limited to 'flashimg/cpu_hdmi/tmds_decode-interp.S')
-rw-r--r-- | flashimg/cpu_hdmi/tmds_decode-interp.S | 95 |
1 files changed, 95 insertions, 0 deletions
// flashimg/cpu_hdmi/tmds_decode-interp.S - Variant of tmds_decode.S that does linear interpolation
//
// Copyright (C) 2025 Luke T. Shumaker <lukeshu@lukeshu.com>
// SPDX-License-Identifier: AGPL-3.0-or-later

.syntax unified
.cpu cortex-m0plus
.thumb

// tmds_table is a LUT from 10-bit TMDS-encoded values to the 8-bit
// decoded values.  It is emitted as one byte per X(...) row of
// tmds_decode_table.h.
.section .scratch_x.tmds_table, "a"
tmds_table:
#define X(dec) .byte dec
#include "tmds_decode_table.h"
#undef X

// void tmds_decode_line(uint16_t in_r[width],
//                       uint16_t in_g[width],
//                       uint16_t in_b[width],
//                       uint8_t  out[width/2],
//                       size_t   width);
//
// Read in `width` values from 10-bit-encoded TMDS streams of R/G/B
// channels of 24-bit RGB pixels; and write that out as RGB 2:3:2,
// scaled down horizontally by half.  Each output pixel is built from
// the sum of two horizontally adjacent input pixels (i.e. their
// average, before truncation to 2 or 3 bits).
//
// `width` must be either 640 or 720.
//
// ABI:   AAPCS (Thumb).
// In:    r0 = in_r, r1 = in_g, r2 = in_b, r3 = out,
//        [sp, #0] at entry = width (5th argument, passed on the stack).
// Out:   nothing is returned; width/2 bytes are written to `out`.
// Clobb: r0-r3, r12, flags.  r4-r7 are saved and restored.
//
// Notes:
//  - The inputs are read one 32-bit word (two samples) at a time with
//    `ldmia`, so in_r/in_g/in_b must be word-aligned.
//  - Each 16-bit sample is used directly as a byte offset into
//    tmds_table; this assumes the bits above the 10-bit symbol are
//    clear -- TODO confirm against the code that fills the buffers.
//  - The loop is unrolled 4x and only tests for termination once per
//    4 output bytes, so width/2 must be a non-zero multiple of 4
//    (true for both 640 and 720).
.section .scratch_x.tmds_decode_line, "ax"
.global tmds_decode_line
.type tmds_decode_line,%function
.thumb_func
tmds_decode_line:
    // Of our supported screen formats, the one with the tightest
    // cycles/pixel requirement is 720x480p@60Hz, which has
    // H_active=720, H_total=858, meaning that we need to decode
    // 720 pixels in under 858*10 cycles, giving us a budget of
    // ~11.9 cycles per pixel.

// Register roles.  `w` is only live during the prologue; its register
// (r4) is then reused for `vTab`.
#define iR    r0   // read cursor into in_r (advances 4 bytes per output pixel)
#define iG    r1   // read cursor into in_g
#define iB    r2   // read cursor into in_b
#define o     r3   // write cursor into out
#define w     r4   // width (prologue only)
#define vTab  r4   // base address of tmds_table
#define vAcc  r5   // output pixel being assembled
#define vTmp1 r6
#define vTmp2 r7
#define vOEnd r12  // one-past-the-end of out (= out + width/2)

    push    {r4-r7, lr}                 // ONCE += 1+5 cyc
    // `width` is the 5th argument, so it lives on the stack rather
    // than in a register; after pushing 5 words it is at sp+20.
    ldr     w, [sp, #20]                // ONCE += 2 cyc
    lsrs    w, w, #1                    // ONCE += 1 cyc // w = width/2 = number of output bytes
    adds    w, o, w                     // ONCE += 1 cyc // w = out + width/2
    mov     vOEnd, w                    // ONCE += 1 cyc
#undef w
    ldr     vTab, =tmds_table           // ONCE += 2 cyc
.Lloop:
.rept 4
    // red -> bits 6:5
    ldmia   iR!, {vTmp2}                // REPT += 1+1 cyc // fetch two 16-bit samples at once
    lsrs    vTmp1, vTmp2, #16           // REPT += 1 cyc   // upper sample
    uxth    vTmp2, vTmp2                // REPT += 1 cyc   // lower sample
    // The decoded values are unsigned 8-bit quantities, so they must
    // be zero-extended (ldrb), not sign-extended (ldrsb/ldsb);
    // otherwise values >= 0x80 go negative and the shifts below leak
    // set bits into the neighbouring colour fields.
    ldrb    vTmp1, [vTab, vTmp1]        // REPT += 2 cyc
    ldrb    vTmp2, [vTab, vTmp2]        // REPT += 2 cyc
    adds    vTmp1, vTmp1, vTmp2         // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
    lsrs    vTmp1, vTmp1, #7            // REPT += 1 cyc // ...shrink it from 9-bits to 2-bits
    lsls    vAcc, vTmp1, #5             // REPT += 1 cyc
    // green -> bits 4:2
    ldmia   iG!, {vTmp2}                // REPT += 1+1 cyc
    lsrs    vTmp1, vTmp2, #16           // REPT += 1 cyc
    uxth    vTmp2, vTmp2                // REPT += 1 cyc
    ldrb    vTmp1, [vTab, vTmp1]        // REPT += 2 cyc
    ldrb    vTmp2, [vTab, vTmp2]        // REPT += 2 cyc
    adds    vTmp1, vTmp1, vTmp2         // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
    lsrs    vTmp1, vTmp1, #6            // REPT += 1 cyc // ...shrink it from 9-bits to 3-bits
    lsls    vTmp1, vTmp1, #2            // REPT += 1 cyc
    orrs    vAcc, vAcc, vTmp1           // REPT += 1 cyc
    // blue -> bits 1:0
    ldmia   iB!, {vTmp2}                // REPT += 1+1 cyc
    lsrs    vTmp1, vTmp2, #16           // REPT += 1 cyc
    uxth    vTmp2, vTmp2                // REPT += 1 cyc
    ldrb    vTmp1, [vTab, vTmp1]        // REPT += 2 cyc
    ldrb    vTmp2, [vTab, vTmp2]        // REPT += 2 cyc
    adds    vTmp1, vTmp1, vTmp2         // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
    lsrs    vTmp1, vTmp1, #7            // REPT += 1 cyc // ...shrink it from 9-bits to 2-bits
    orrs    vAcc, vAcc, vTmp1           // REPT += 1 cyc
    // store
    strb    vAcc, [o, #0]               // REPT += 2 cyc
    adds    o, o, #1                    // REPT += 1 cyc
.endr
    cmp     o, vOEnd                    // LOOP += 1 cyc
    bne     .Lloop                      // LOOP += 2 cyc ; ONCE -= 1 cyc
    pop     {r4-r7, pc}                 // ONCE += 3+5 cyc
    // TOTAL  = ONCE+((720/2)/N)*LOOP+(720/2)*REPT cyc
    //        = 20  +(  360  /4)* 3  +  360  * 37  cyc
    //        = 13610 cyc
    // BUDGET = 8580 cyc
    //
    // NOTE: by the accounting above, this variant does not fit within
    // the 720-wide budget.
.size tmds_decode_line, .-tmds_decode_line

#undef iR
#undef iG
#undef iB
#undef o
#undef vTab
#undef vAcc
#undef vTmp1
#undef vTmp2
#undef vOEnd