[vlc-devel] [RFC] [PATCH] deinterlace: rewrite ARM optimizations for 8-bits merge
Rémi Denis-Courmont
remi at remlab.net
Fri Aug 3 09:34:08 CEST 2012
- Assembler code out of line
- ARM NEON run-time detection
- Better choice of registers
- Prefetching
- ARMv6 SIMD optimizations where Advanced SIMD not available
(not yet in use)
Scheduling is not completely optimal.
16-bit merge could easily be added later.
---
modules/video_filter/Modules.am | 3 +
modules/video_filter/deinterlace/merge.c | 63 ++---------------
modules/video_filter/deinterlace/merge_arm.S | 94 ++++++++++++++++++++++++++
3 files changed, 102 insertions(+), 58 deletions(-)
create mode 100644 modules/video_filter/deinterlace/merge_arm.S
diff --git a/modules/video_filter/Modules.am b/modules/video_filter/Modules.am
index 2d2e2a2..1301123 100644
--- a/modules/video_filter/Modules.am
+++ b/modules/video_filter/Modules.am
@@ -28,6 +28,9 @@ libdeinterlace_plugin_la_SOURCES = \
deinterlace/yadif.h deinterlace/yadif_template.h \
deinterlace/algo_phosphor.c deinterlace/algo_phosphor.h \
deinterlace/algo_ivtc.c deinterlace/algo_ivtc.h
+if HAVE_NEON
+libdeinterlace_plugin_la_SOURCES += deinterlace/merge_arm.S
+endif
libdeinterlace_plugin_la_CFLAGS = $(AM_CFLAGS)
libdeinterlace_plugin_la_LIBADD = $(AM_LIBADD)
libdeinterlace_plugin_la_DEPENDENCIES =
diff --git a/modules/video_filter/deinterlace/merge.c b/modules/video_filter/deinterlace/merge.c
index 06c0334..be86450 100644
--- a/modules/video_filter/deinterlace/merge.c
+++ b/modules/video_filter/deinterlace/merge.c
@@ -243,64 +243,11 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
}
#endif
-#ifdef __ARM_NEON__
-void MergeNEON (void *restrict out, const void *in1,
- const void *in2, size_t n)
-{
- uint8_t *outp = out;
- const uint8_t *in1p = in1;
- const uint8_t *in2p = in2;
- size_t mis = __MIN((16 - ((uintptr_t)outp & 15)) & 15, n);
-
- if (mis)
- {
- Merge8BitGeneric (outp, in1p, in2p, mis);
- outp += mis;
- in1p += mis;
- in2p += mis;
- n -= mis;
- }
-
- uint8_t *end = outp + (n & ~15);
-
- if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
- while (outp < end)
- asm volatile (
- "vld1.u8 {q0-q1}, [%[in1]]!\n"
- "vld1.u8 {q2-q3}, [%[in2]]!\n"
- "vhadd.u8 q4, q0, q2\n"
- "vld1.u8 {q6-q7}, [%[in1]]!\n"
- "vhadd.u8 q5, q1, q3\n"
- "vld1.u8 {q8-q9}, [%[in2]]!\n"
- "vhadd.u8 q10, q6, q8\n"
- "vhadd.u8 q11, q7, q9\n"
- "vst1.u8 {q4-q5}, [%[out],:128]!\n"
- "vst1.u8 {q10-q11}, [%[out],:128]!\n"
- : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "memory");
- else
- while (outp < end)
- asm volatile (
- "vld1.u8 {q0-q1}, [%[in1],:128]!\n"
- "vld1.u8 {q2-q3}, [%[in2],:128]!\n"
- "vhadd.u8 q4, q0, q2\n"
- "vld1.u8 {q6-q7}, [%[in1],:128]!\n"
- "vhadd.u8 q5, q1, q3\n"
- "vld1.u8 {q8-q9}, [%[in2],:128]!\n"
- "vhadd.u8 q10, q6, q8\n"
- "vhadd.u8 q11, q7, q9\n"
- "vst1.u8 {q4-q5}, [%[out],:128]!\n"
- "vst1.u8 {q10-q11}, [%[out],:128]!\n"
- : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "memory");
- n &= 15;
- if (n)
- Merge8BitGeneric (outp, in1p, in2p, n);
-}
+#ifdef __arm__
+void MergeNEON(void *restrict out, const void *in1, const void *in2, size_t n)
+ asm("merge8_arm_neon"); /* must equal the .global symbol exported by merge_arm.S */
+void MergeARMv6(void *restrict out, const void *in1, const void *in2, size_t n)
+ asm("merge8_armv6"); /* must equal the .global symbol exported by merge_arm.S */
#endif
/*****************************************************************************
diff --git a/modules/video_filter/deinterlace/merge_arm.S b/modules/video_filter/deinterlace/merge_arm.S
new file mode 100644
index 0000000..80c652b
--- /dev/null
+++ b/modules/video_filter/deinterlace/merge_arm.S
@@ -0,0 +1,94 @@
+ @*****************************************************************************
+ @ merge_arm.S : ARM NEON and ARMv6 SIMD 8-bit merge for deinterlacing
+ @*****************************************************************************
+ @ Copyright (C) 2009-2012 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @*****************************************************************************
+
+ .syntax unified @ unified ARM/Thumb assembler syntax
+ .arm @ emit ARM (not Thumb) encodings
+ .arch armv6 @ baseline architecture for the integer SIMD routine
+ .fpu neon @ permit the Advanced SIMD (vld1/vhadd/vst1) instructions
+ .text
+
+#define DEST r0 /* 1st argument: pointer the averaged bytes are stored through */
+#define SRC1 r1 /* 2nd argument: first input pointer */
+#define SRC2 r2 /* 3rd argument: second input pointer */
+#define SIZE r3 /* 4th argument: number of bytes left to process */
+
+ .align 2
+ .global merge8_arm_neon @ NEON merge: out[i] = (a[i] + b[i]) >> 1 for every unsigned byte
+ .type merge8_arm_neon, %function @ r0 = destination, r1/r2 = sources, r3 = byte count
+ @ NOTE: Offset and pitch must be multiples of 16 bytes in VLC; a tail shorter than 16 bytes is not processed.
+merge8_arm_neon: @ touches only q0-q3 and q8-q11, which AAPCS lets a callee clobber
+ cmp SIZE, #64
+ blo 2f @ fewer than 64 bytes: skip the unrolled loop
+1: @ main loop: 64 bytes per iteration
+ pld [SRC1, #64] @ prefetch what the next iteration will load
+ vld1.u8 {q0-q1}, [SRC1,:128]! @ :128 asserts a 16-byte aligned address
+ pld [SRC2, #64]
+ vld1.u8 {q8-q9}, [SRC2,:128]!
+ vhadd.u8 q0, q0, q8 @ halving add: per-byte (a + b) >> 1, no overflow
+ sub SIZE, SIZE, #64 @ account for the 64 bytes of this iteration
+ vld1.u8 {q2-q3}, [SRC1,:128]!
+ vhadd.u8 q1, q1, q9
+ vld1.u8 {q10-q11}, [SRC2,:128]!
+ vhadd.u8 q2, q2, q10
+ cmp SIZE, #64 @ flags are consumed by the bhs that closes the loop
+ vhadd.u8 q3, q3, q11
+ vst1.u8 {q0-q1}, [DEST,:128]!
+ vst1.u8 {q2-q3}, [DEST,:128]!
+ bhs 1b @ at least 64 bytes remain: go again
+2: @ at most 63 bytes remain: optional 32-byte step
+ cmp SIZE, #32
+ blo 3f
+ vld1.u8 {q0-q1}, [SRC1,:128]!
+ sub SIZE, SIZE, #32
+ vld1.u8 {q8-q9}, [SRC2,:128]!
+ vhadd.u8 q0, q0, q8
+ vhadd.u8 q1, q1, q9
+ vst1.u8 {q0-q1}, [DEST,:128]!
+3: @ at most 31 bytes remain: optional 16-byte step
+ cmp SIZE, #16
+ bxlo lr @ fewer than 16 bytes left: return
+ vld1.u8 {q0}, [SRC1,:128]!
+ sub SIZE, SIZE, #16 @ the count is not read again before returning
+ vld1.u8 {q8}, [SRC2,:128]!
+ vhadd.u8 q0, q0, q8
+ vst1.u8 {q0}, [DEST,:128]!
+ bx lr
+
+ .align 2
+ .global merge8_armv6 @ ARMv6 SIMD merge: out[i] = (a[i] + b[i]) >> 1 for every unsigned byte
+ .type merge8_armv6, %function @ r0 = destination, r1/r2 = sources, r3 = byte count
+merge8_armv6: @ 16 bytes per iteration; a tail shorter than 16 bytes is not processed
+ push {r4-r9,lr} @ r4-r9 are callee-saved; lr doubles as a scratch register below
+1:
+ subs SIZE, SIZE, #16 @ claim the next 16-byte block; a borrow means none is left
+ poplo {r4-r9,pc} @ done: also covers a zero count and counts not divisible by 16
+ pld [SRC1, #64] @ prefetch ahead of the current read position
+ ldm SRC1!, {r4-r5}
+ pld [SRC2, #64]
+ ldm SRC2!, {r8-r9}
+ uhadd8 r4, r4, r8 @ four unsigned byte averages per 32-bit register
+ ldm SRC1!, {r6-r7}
+ uhadd8 r5, r5, r9
+ ldm SRC2!, {ip,lr}
+ uhadd8 r6, r6, ip
+ stm DEST!, {r4-r5}
+ uhadd8 r7, r7, lr
+ stm DEST!, {r6-r7}
+ b 1b
--
1.7.10.4
More information about the vlc-devel
mailing list