summaryrefslogtreecommitdiff
path: root/flashimg/cpu_hdmi/tmds_decode-interp.S
diff options
context:
space:
mode:
Diffstat (limited to 'flashimg/cpu_hdmi/tmds_decode-interp.S')
-rw-r--r--flashimg/cpu_hdmi/tmds_decode-interp.S95
1 files changed, 95 insertions, 0 deletions
diff --git a/flashimg/cpu_hdmi/tmds_decode-interp.S b/flashimg/cpu_hdmi/tmds_decode-interp.S
new file mode 100644
index 0000000..de1605a
--- /dev/null
+++ b/flashimg/cpu_hdmi/tmds_decode-interp.S
@@ -0,0 +1,95 @@
+// flashimg/cpu_hdmi/tmds_decode-interp.S - Variant of tmds_decode.S that does linear interpolation
+//
+// Copyright (C) 2025 Luke T. Shumaker <lukeshu@lukeshu.com>
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+.syntax unified // use unified (UAL) mnemonics and operand forms
+.cpu cortex-m0plus // only emit instructions this core implements
+.thumb // assemble Thumb encodings
+
+// tmds_table is a LUT from 10-bit TMDS-encoded values to the 8-bit
+// decoded values.
+//
+// It is one byte per entry: tmds_decode_table.h is expected to consist
+// of X(dec) invocations, each of which expands to `.byte dec` here, so
+// the decoder can index the table directly with the encoded value.
+// (Presumably there are 1024 entries, one per possible 10-bit code --
+// confirm against the header.)  X is defined only for the duration of
+// the #include.
+.section .scratch_x.tmds_table, "a"
+tmds_table:
+#define X(dec) .byte dec
+#include "tmds_decode_table.h"
+#undef X
+
+// void tmds_decode_line(uint16_t in_r[width],
+//                       uint16_t in_g[width],
+//                       uint16_t in_b[width],
+//                       uint8_t  out[width/2],
+//                       size_t   width);
+//
+// Read in `width` values from 10-bit-encoded TMDS streams of R/G/B
+// channels of 24-bit RGB pixels; and write that out as RGB 2:3:2,
+// scaled down horizontally by half.
+//
+// Each output pixel is made from one pair of adjacent input pixels:
+// per channel, the two decoded 8-bit values are added (giving a 9-bit
+// sum) and the top 2 (R, B) or 3 (G) bits of that sum are kept, i.e.
+// the pair is averaged before being truncated.  The output byte is
+//
+//     bit 7: 0   bits 6-5: R   bits 4-2: G   bits 1-0: B
+//
+// `width` must be either 640 or 720.
+//
+// Calling convention: AAPCS.
+//   In:    r0 = in_r, r1 = in_g, r2 = in_b, r3 = out,
+//          [sp, #0] = width (the 5th argument is passed on the stack)
+//   Out:   nothing
+//   Clobb: r0-r3, r12, flags (r4-r7 are saved and restored)
+//
+// The inputs are read one 32-bit word (= two pixels) at a time, so
+// in_r/in_g/in_b must be 4-byte aligned.  Each uint16_t is used as-is
+// as an index into tmds_table, so it is assumed to hold a bare 10-bit
+// symbol with the upper bits clear -- TODO confirm against whatever
+// fills these buffers.
+.section .scratch_x.tmds_decode_line, "ax"
+.global tmds_decode_line
+.type tmds_decode_line,%function
+.thumb_func
+tmds_decode_line:
+    // Of our supported screen formats, the one with the tightest
+    // cycles/pixel requirement is 720x480p@60Hz, which has
+    // H_active=720, H_total=858, meaning that we need to decode
+    // 720 pixels in under 858*10 cycles, giving us a budget of
+    // ~11.9 cycles per pixel.
+#define iR r0
+#define iG r1
+#define iB r2
+#define o r3
+#define w r4
+#define vTab r4
+#define vAcc r5
+#define vTmp1 r6
+#define vTmp2 r7
+#define vOEnd r12
+    push    {r4-r7, lr}                 // ONCE += 1+5 cyc
+    // `width` is the 5th argument, so it was passed on the stack; it
+    // now sits just above the 5 words (20 bytes) pushed above.
+    ldr     w, [sp, #20]                // ONCE += 2 cyc
+    lsrs    w, w, #1                    // ONCE += 1 cyc // w = width/2 = number of output bytes
+    add     w, o, w                     // ONCE += 1 cyc
+    mov     vOEnd, w                    // ONCE += 1 cyc // vOEnd = &out[width/2]
+#undef w
+    ldr     vTab, =tmds_table           // ONCE += 2 cyc
+.Ltmds_decode_line_loop:
+.rept 4
+    // red
+    ldmia   iR!, {vTmp2}                // REPT += 1+1 cyc // fetch 2 pixels at once
+    lsrs    vTmp1, vTmp2, #16           // REPT += 1 cyc // high halfword
+    uxth    vTmp2, vTmp2                // REPT += 1 cyc // low halfword
+    ldrb    vTmp1, [vTab, vTmp1]        // REPT += 2 cyc // zero-extend: table entries are unsigned
+    ldrb    vTmp2, [vTab, vTmp2]        // REPT += 2 cyc
+    add     vTmp1, vTmp1, vTmp2         // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
+    lsrs    vTmp1, vTmp1, #7            // REPT += 1 cyc // ...shrink it from 9-bits to 2-bits
+    lsls    vAcc, vTmp1, #5             // REPT += 1 cyc // (re)initializes vAcc
+    // green
+    ldmia   iG!, {vTmp2}                // REPT += 1+1 cyc
+    lsrs    vTmp1, vTmp2, #16           // REPT += 1 cyc
+    uxth    vTmp2, vTmp2                // REPT += 1 cyc
+    ldrb    vTmp1, [vTab, vTmp1]        // REPT += 2 cyc
+    ldrb    vTmp2, [vTab, vTmp2]        // REPT += 2 cyc
+    add     vTmp1, vTmp1, vTmp2         // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
+    lsrs    vTmp1, vTmp1, #6            // REPT += 1 cyc // ...shrink it from 9-bits to 3-bits
+    lsls    vTmp1, vTmp1, #2            // REPT += 1 cyc
+    orrs    vAcc, vAcc, vTmp1           // REPT += 1 cyc
+    // blue
+    ldmia   iB!, {vTmp2}                // REPT += 1+1 cyc
+    lsrs    vTmp1, vTmp2, #16           // REPT += 1 cyc
+    uxth    vTmp2, vTmp2                // REPT += 1 cyc
+    ldrb    vTmp1, [vTab, vTmp1]        // REPT += 2 cyc
+    ldrb    vTmp2, [vTab, vTmp2]        // REPT += 2 cyc
+    add     vTmp1, vTmp1, vTmp2         // REPT += 1 cyc // add 2 8-bit values, producing a 9-bit value...
+    lsrs    vTmp1, vTmp1, #7            // REPT += 1 cyc // ...shrink it from 9-bits to 2-bits
+    orrs    vAcc, vAcc, vTmp1           // REPT += 1 cyc
+    // store
+    strb    vAcc, [o, #0]               // REPT += 2 cyc
+    adds    o, o, #1                    // REPT += 1 cyc
+.endr
+    cmp     o, vOEnd                    // LOOP += 1 cyc
+    // Unsigned "lower" rather than "not equal": identical for the
+    // supported widths, but cannot run away through memory if width/2
+    // is ever not a multiple of the unroll factor.
+    blo     .Ltmds_decode_line_loop     // LOOP += 2 cyc ; ONCE -= 1 cyc
+    pop     {r4-r7, pc}                 // ONCE += 3+5 cyc
+    // TOTAL  = ONCE+((720/2)/N)*LOOP+(720/2)*REPT cyc
+    //        = 20  +(   360  /4)* 3  +   360  * 37  cyc
+    //        = 13610 cyc
+    // BUDGET = 8580 cyc
+    // (so this variant does not fit the budget for 720-pixel lines)
+.size tmds_decode_line, . - tmds_decode_line
+#undef iR
+#undef iG
+#undef iB
+#undef o
+#undef vTab
+#undef vAcc
+#undef vTmp1
+#undef vTmp2
+#undef vOEnd