1 files changed, 95 insertions, 0 deletions
diff --git a/flashimg/cpu_hdmi/tmds_decode-interp.S b/flashimg/cpu_hdmi/tmds_decode-interp.S
new file mode 100644
index 0000000..de1605a
--- /dev/null
+++ b/flashimg/cpu_hdmi/tmds_decode-interp.S
@@ -0,0 +1,95 @@
+// flashimg/cpu_hdmi/tmds_decode-interp.S - Variant of tmds_decode.S that does linear interpolation
+//
+// Copyright (C) 2025  Luke T. Shumaker <lukeshu@lukeshu.com>
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+.syntax unified
+.cpu cortex-m0plus
+.thumb
+
+// tmds_table is a LUT from 10-bit TMDS-encoded values to the 8-bit decoded
+// values; each X(dec) in the included header expands to one `.byte dec`.
+.section .scratch_x.tmds_table, "a"
+tmds_table:
+#define X(dec) .byte dec
+#include "tmds_decode_table.h"
+#undef X
+
+// void tmds_decode_line(uint16_t in_r[width],
+//                       uint16_t in_g[width],
+//                       uint16_t in_b[width],
+//                       uint8_t  out[width/2],
+//                       size_t width);
+//
+// Read in `width` values from 10-bit-encoded TMDS streams of R/G/B
+// channels of 24-bit RGB pixels; and write that out as RGB 2:3:2,
+// scaled down horizontally by half by averaging each pixel pair.
+// `width` must be either 640 or 720 (width/2 must be a multiple of
+// 4, the unroll factor); the in_* arrays must be 4-byte aligned.
+.section .scratch_x.tmds_decode_line, "ax"
+.global tmds_decode_line
+.type tmds_decode_line,%function
+.thumb_func
+tmds_decode_line:
+	// Of our supported screen formats, the one with the tightest
+	// cycles/pixel requirement is 720x480p@60Hz, which has
+	// H_active=720, H_total=858, meaning that we need to decode
+	// 720 pixels in under 858*10 cycles, giving us a budget of
+	// ~11.9 cycles per pixel.
+#define iR      r0
+#define iG      r1
+#define iB      r2
+#define o       r3
+#define vTab    r4
+#define vAcc    r5
+#define vTmp1   r6
+#define vTmp2   r7
+#define vOEnd   r12
+	push    {r4-r7, lr}             // ONCE += 1+5 cyc
+	// `width` is the 5th argument: AAPCS passes it on the stack, above our 5 pushed words.
+	ldr     vTmp1, [sp, #20]        // ONCE += 2 cyc // vTmp1 = width
+	lsrs    vTmp1, vTmp1, #1        // ONCE += 1 cyc // vTmp1 = width/2 = output byte count
+	add     vTmp1, vTmp1, o         // ONCE += 1 cyc // vTmp1 = &out[width/2]
+	mov     vOEnd, vTmp1            // ONCE += 1 cyc // park the end pointer in a high register
+	ldr     vTab, =tmds_table       // ONCE += 2 cyc
+.Ldecode_loop:
+.rept 4
+	// red
+	ldmia   iR!, {vTmp2}            // REPT += 1+1 cyc // one 32-bit word = two 16-bit samples
+	lsrs    vTmp1, vTmp2, #16       // REPT += 1 cyc // vTmp1 = high halfword
+	uxth    vTmp2, vTmp2            // REPT += 1 cyc // vTmp2 = low halfword
+	ldrb    vTmp1, [vTab, vTmp1]    // REPT += 2 cyc // zero-extend: the sum below must be unsigned
+	ldrb    vTmp2, [vTab, vTmp2]    // REPT += 2 cyc
+	add     vTmp1, vTmp1, vTmp2     // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
+	lsrs    vTmp1, vTmp1, #7        // REPT += 1 cyc // ...shrink it from 9-bits to 2-bits
+	lsls    vAcc, vTmp1, #5         // REPT += 1 cyc // red goes in bits 6:5 (also resets vAcc)
+	// green
+	ldmia   iG!, {vTmp2}            // REPT += 1+1 cyc
+	lsrs    vTmp1, vTmp2, #16       // REPT += 1 cyc
+	uxth    vTmp2, vTmp2            // REPT += 1 cyc
+	ldrb    vTmp1, [vTab, vTmp1]    // REPT += 2 cyc
+	ldrb    vTmp2, [vTab, vTmp2]    // REPT += 2 cyc
+	add     vTmp1, vTmp1, vTmp2     // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
+	lsrs    vTmp1, vTmp1, #6        // REPT += 1 cyc // ...shrink it from 9-bits to 3-bits
+	lsls    vTmp1, vTmp1, #2        // REPT += 1 cyc // green goes in bits 4:2
+	orrs    vAcc, vAcc, vTmp1       // REPT += 1 cyc
+	// blue
+	ldmia   iB!, {vTmp2}            // REPT += 1+1 cyc
+	lsrs    vTmp1, vTmp2, #16       // REPT += 1 cyc
+	uxth    vTmp2, vTmp2            // REPT += 1 cyc
+	ldrb    vTmp1, [vTab, vTmp1]    // REPT += 2 cyc
+	ldrb    vTmp2, [vTab, vTmp2]    // REPT += 2 cyc
+	add     vTmp1, vTmp1, vTmp2     // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
+	lsrs    vTmp1, vTmp1, #7        // REPT += 1 cyc // ...shrink it from 9-bits to 2-bits
+	orrs    vAcc, vAcc, vTmp1       // REPT += 1 cyc // blue goes in bits 1:0
+	// store
+	strb    vAcc, [o, #0]           // REPT += 2 cyc
+	adds    o, o, #1                // REPT += 1 cyc
+.endr
+	cmp     o, vOEnd                // LOOP += 1 cyc
+	bne     .Ldecode_loop           // LOOP += 2 cyc ; ONCE -= 1 cyc
+	pop     {r4-r7, pc}             // ONCE += 3+5 cyc
+	// TOTAL  = ONCE+((720/2)/N)*LOOP+(720/2)*REPT cyc
+	//        =  20 +(  360  /4)*  3 +  360  * 37  cyc
+	//        = 13610 cyc
+	// BUDGET =  8580 cyc (TOTAL exceeds this)