summary | refs | log | tree | commit | diff
path: root/flashimg/cpu_hdmi/tmds_decode.S
diff options
context:
space:
mode:
Diffstat (limited to 'flashimg/cpu_hdmi/tmds_decode.S')
-rw-r--r--flashimg/cpu_hdmi/tmds_decode.S82
1 files changed, 82 insertions, 0 deletions
diff --git a/flashimg/cpu_hdmi/tmds_decode.S b/flashimg/cpu_hdmi/tmds_decode.S
new file mode 100644
index 0000000..888b5b9
--- /dev/null
+++ b/flashimg/cpu_hdmi/tmds_decode.S
@@ -0,0 +1,82 @@
+// flashimg/cpu_hdmi/tmds_decode.S - Decode TMDS-encoded scanlines
+//
+// Copyright (C) 2025 Luke T. Shumaker <lukeshu@lukeshu.com>
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+.syntax unified
+.cpu cortex-m0plus
+.thumb
+
+// tmds_table is a LUT from 10-bit TMDS-encoded values to the 8-bit
+// decoded values, shifted right 6 bits (i.e. only the top 2 bits of
+// each decoded value are kept).  One byte per entry; the entries are
+// generated from the X(...) lines of tmds_decode_table.h.
+.section .scratch_x.tmds_table, "a"
+.type tmds_table,%object
+tmds_table:
+// The macro argument is parenthesized so that a compound expression
+// passed to X() cannot mis-associate with the shift.
+#define X(dec) .byte (dec)>>6
+#include "tmds_decode_table.h"
+#undef X
+.size tmds_table, .-tmds_table
+
+// void tmds_decode_line(uint16_t in_r[width],
+//                       uint16_t in_g[width],
+//                       uint16_t in_b[width],
+//                       uint8_t  out[width/2],
+//                       size_t   width);
+//
+// Read in `width` values from 10-bit-encoded TMDS streams of R/G/B
+// channels of 24-bit RGB pixels; and write that out as RGB 2:3:2,
+// scaled down horizontally by half.
+//
+// `width` must be either 640 or 720.  The loop emits 4 output bytes
+// per iteration and only terminates when the output pointer is
+// exactly out+width/2, so width/2 must be a non-zero multiple of 4.
+//
+// Input samples are fetched with 32-bit loads, so in_r/in_g/in_b must
+// be 4-byte aligned.  The low halfword of each loaded word is used
+// as-is as a byte index into tmds_table.
+//
+// ABI: AAPCS.  in_r=r0, in_g=r1, in_b=r2, out=r3; `width` is the
+// fifth argument and therefore arrives on the stack, not in a
+// register.  r4-r7 are saved and restored.
+//
+// BUG: Scaling down happens by discarding every other pixel, not by
+// interpolation.
+.section .scratch_x.tmds_decode_line, "ax"
+.global tmds_decode_line
+.type tmds_decode_line,%function
+.thumb_func
+tmds_decode_line:
+    // Of our supported screen formats, the one with the tightest
+    // cycles/pixel requirement is 720x480p@60Hz, which has
+    // H_active=720, H_total=858, meaning that we need to decode
+    // 720 pixels in under 858*10 cycles, giving us a budget of
+    // ~11.9 cycles per pixel.
+#define iR r0
+#define iG r1
+#define iB r2
+#define o r3
+#define w r4
+#define vTab r5
+#define vAcc r6
+#define vTmp r7
+    push    {r4-r7, lr}             // ONCE += 1+5 cyc
+    // The push above moved sp down by 5 words, so the caller's
+    // first stack argument (`width`) now lives at [sp, #20].
+    ldr     w, [sp, #20]            // ONCE += 2 cyc
+    lsrs    w, w, #1                // ONCE += 1 cyc ; w = width/2
+    add     w, o, w                 // ONCE += 1 cyc ; w = end of out[]
+    ldr     vTab, =tmds_table       // ONCE += 2 cyc
+.Ltmds_decode_line_loop:
+.rept 4
+    // Each ldmia reads two 16-bit samples and advances the input
+    // pointer past both; uxth keeps only the first one, which is
+    // how every other pixel gets discarded.
+    // red
+    ldmia   iR!, {vTmp}             // REPT += 1+1 cyc
+    uxth    vTmp, vTmp              // REPT += 1 cyc
+    ldrsb   vTmp, [vTab, vTmp]      // REPT += 2 cyc
+    lsls    vAcc, vTmp, #5          // REPT += 1 cyc
+    // green
+    ldmia   iG!, {vTmp}             // REPT += 1+1 cyc
+    uxth    vTmp, vTmp              // REPT += 1 cyc
+    ldrsb   vTmp, [vTab, vTmp]      // REPT += 2 cyc
+    lsls    vTmp, vTmp, #3          // REPT += 1 cyc
+    orrs    vAcc, vAcc, vTmp        // REPT += 1 cyc
+    // blue
+    ldmia   iB!, {vTmp}             // REPT += 1+1 cyc
+    uxth    vTmp, vTmp              // REPT += 1 cyc
+    ldrsb   vTmp, [vTab, vTmp]      // REPT += 2 cyc
+    orrs    vAcc, vAcc, vTmp        // REPT += 1 cyc
+    // store: byte = (red<<5) | (green<<3) | blue
+    strb    vAcc, [o, #0]           // REPT += 2 cyc
+    adds    o, o, #1                // REPT += 1 cyc
+.endr
+    cmp     o, w                    // LOOP += 1 cyc
+    bne     .Ltmds_decode_line_loop // LOOP += 2 cyc ; ONCE -= 1 cyc
+    pop     {r4-r7, pc}             // ONCE += 3+5 cyc
+    // TOTAL  = ONCE+((720/2)/N)*LOOP+(720/2)*REPT cyc
+    //        = 19  +(  360  /4)* 3  +  360  * 22  cyc
+    //        = 8209 cyc
+    // BUDGET = 8580 cyc
+.ltorg
+.size tmds_decode_line, .-tmds_decode_line
+// Do not let the register aliases (especially the one-letter `o` and
+// `w`) leak into anything that follows.
+#undef iR
+#undef iG
+#undef iB
+#undef o
+#undef w
+#undef vTab
+#undef vAcc
+#undef vTmp