[vlc-commits] deinterlace: rewrite ARM optimizations for 8-bits merge
Rémi Denis-Courmont
git at videolan.org
Sat Aug 4 17:35:59 CEST 2012
vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Fri Aug 3 10:25:10 2012 +0300| [95eb79718122dcc8f586cbbd10f8b0feb2fa4fa4] | committer: Rémi Denis-Courmont
deinterlace: rewrite ARM optimizations for 8-bits merge
- Assembler code out of line
- ARM NEON run-time detection
- Better choice of registers
- Prefetching
- ARMv6 SIMD optimizations where Advanced SIMD is not available
Scheduling is not completely optimal.
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=95eb79718122dcc8f586cbbd10f8b0feb2fa4fa4
---
modules/video_filter/Modules.am | 3 +
modules/video_filter/deinterlace/deinterlace.c | 7 +-
modules/video_filter/deinterlace/merge.c | 61 ---------------
modules/video_filter/deinterlace/merge.h | 14 ++--
modules/video_filter/deinterlace/merge_arm.S | 94 ++++++++++++++++++++++++
5 files changed, 109 insertions(+), 70 deletions(-)
diff --git a/modules/video_filter/Modules.am b/modules/video_filter/Modules.am
index 2d2e2a2..1301123 100644
--- a/modules/video_filter/Modules.am
+++ b/modules/video_filter/Modules.am
@@ -28,6 +28,9 @@ libdeinterlace_plugin_la_SOURCES = \
deinterlace/yadif.h deinterlace/yadif_template.h \
deinterlace/algo_phosphor.c deinterlace/algo_phosphor.h \
deinterlace/algo_ivtc.c deinterlace/algo_ivtc.h
+if HAVE_NEON
+libdeinterlace_plugin_la_SOURCES += deinterlace/merge_arm.S
+endif
libdeinterlace_plugin_la_CFLAGS = $(AM_CFLAGS)
libdeinterlace_plugin_la_LIBADD = $(AM_LIBADD)
libdeinterlace_plugin_la_DEPENDENCIES =
diff --git a/modules/video_filter/deinterlace/deinterlace.c b/modules/video_filter/deinterlace/deinterlace.c
index ac83dfc..9d52f99 100644
--- a/modules/video_filter/deinterlace/deinterlace.c
+++ b/modules/video_filter/deinterlace/deinterlace.c
@@ -656,9 +656,12 @@ int Open( vlc_object_t *p_this )
}
else
#endif
-#if defined __ARM_NEON__ // FIXME: runtime detect support
+#if defined(__arm__)
if( chroma->pixel_size == 1 && vlc_CPU_ARM_NEON() )
- p_sys->pf_merge = MergeNEON;
+ p_sys->pf_merge = merge8_arm_neon;
+ else
+ if( chroma->pixel_size == 1 && vlc_CPU_ARMv6() )
+ p_sys->pf_merge = merge8_armv6;
else
#endif
{
diff --git a/modules/video_filter/deinterlace/merge.c b/modules/video_filter/deinterlace/merge.c
index 06c0334..0ab5608 100644
--- a/modules/video_filter/deinterlace/merge.c
+++ b/modules/video_filter/deinterlace/merge.c
@@ -7,7 +7,6 @@
* Author: Sam Hocevar <sam at zoy.org> (generic C routine)
* Sigmund Augdal Helberg <sigmunau at videolan.org> (MMXEXT, 3DNow, SSE2)
* Eric Petit <eric.petit at lapsus.org> (Altivec)
- * Rémi Denis-Courmont <remi at remlab.net> (ARM NEON)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -243,66 +242,6 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
}
#endif
-#ifdef __ARM_NEON__
-void MergeNEON (void *restrict out, const void *in1,
- const void *in2, size_t n)
-{
- uint8_t *outp = out;
- const uint8_t *in1p = in1;
- const uint8_t *in2p = in2;
- size_t mis = __MIN((16 - ((uintptr_t)outp & 15)) & 15, n);
-
- if (mis)
- {
- Merge8BitGeneric (outp, in1p, in2p, mis);
- outp += mis;
- in1p += mis;
- in2p += mis;
- n -= mis;
- }
-
- uint8_t *end = outp + (n & ~15);
-
- if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
- while (outp < end)
- asm volatile (
- "vld1.u8 {q0-q1}, [%[in1]]!\n"
- "vld1.u8 {q2-q3}, [%[in2]]!\n"
- "vhadd.u8 q4, q0, q2\n"
- "vld1.u8 {q6-q7}, [%[in1]]!\n"
- "vhadd.u8 q5, q1, q3\n"
- "vld1.u8 {q8-q9}, [%[in2]]!\n"
- "vhadd.u8 q10, q6, q8\n"
- "vhadd.u8 q11, q7, q9\n"
- "vst1.u8 {q4-q5}, [%[out],:128]!\n"
- "vst1.u8 {q10-q11}, [%[out],:128]!\n"
- : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "memory");
- else
- while (outp < end)
- asm volatile (
- "vld1.u8 {q0-q1}, [%[in1],:128]!\n"
- "vld1.u8 {q2-q3}, [%[in2],:128]!\n"
- "vhadd.u8 q4, q0, q2\n"
- "vld1.u8 {q6-q7}, [%[in1],:128]!\n"
- "vhadd.u8 q5, q1, q3\n"
- "vld1.u8 {q8-q9}, [%[in2],:128]!\n"
- "vhadd.u8 q10, q6, q8\n"
- "vhadd.u8 q11, q7, q9\n"
- "vst1.u8 {q4-q5}, [%[out],:128]!\n"
- "vst1.u8 {q10-q11}, [%[out],:128]!\n"
- : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "memory");
- n &= 15;
- if (n)
- Merge8BitGeneric (outp, in1p, in2p, n);
-}
-#endif
-
/*****************************************************************************
* EndMerge routines
*****************************************************************************/
diff --git a/modules/video_filter/deinterlace/merge.h b/modules/video_filter/deinterlace/merge.h
index 285c1a3..04634db 100644
--- a/modules/video_filter/deinterlace/merge.h
+++ b/modules/video_filter/deinterlace/merge.h
@@ -158,16 +158,16 @@ void Merge8BitSSE2( void *, const void *, const void *, size_t );
void Merge16BitSSE2( void *, const void *, const void *, size_t );
#endif
-#if defined __ARM_NEON__
+#ifdef __arm__
/**
* ARM NEON routine to blend pixels from two picture lines.
- *
- * @param _p_dest Target
- * @param _p_s1 Source line A
- * @param _p_s2 Source line B
- * @param i_bytes Number of bytes to merge
*/
-void MergeNEON (void *, const void *, const void *, size_t);
+void merge8_arm_neon (void *, const void *, const void *, size_t);
+
+/**
+ * ARMv6 SIMD routine to blend pixels from two picture lines.
+ *
+ * Arguments, in order: destination line, first source line, second source
+ * line, number of bytes to merge. Each output byte is the truncated mean
+ * (a + b) >> 1 of the two corresponding source bytes.
+ *
+ * NOTE(review): the implementation works on 16-byte groups with word
+ * loads/stores, so the byte count is presumably expected to be a multiple
+ * of 16 and the pointers word-aligned -- confirm against the callers.
+ */
+void merge8_armv6 (void *, const void *, const void *, size_t);
#endif
/*****************************************************************************
diff --git a/modules/video_filter/deinterlace/merge_arm.S b/modules/video_filter/deinterlace/merge_arm.S
new file mode 100644
index 0000000..80c652b
--- /dev/null
+++ b/modules/video_filter/deinterlace/merge_arm.S
@@ -0,0 +1,94 @@
+ @*****************************************************************************
+ @ merge_arm.S : ARM NEON and ARMv6 SIMD mean of two 8-bit pixel lines
+ @*****************************************************************************
+ @ Copyright (C) 2009-2012 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+ .syntax unified
+ .arm
+ .arch armv6
+ .fpu neon
+ .text
+
+#define DEST r0
+#define SRC1 r1
+#define SRC2 r2
+#define SIZE r3
+
+ .align 2
+ .global merge8_arm_neon
+ .type merge8_arm_neon, %function
+ @ merge8_arm_neon: store to DEST (r0) the per-byte unsigned halving add
+ @ (vhadd.u8, i.e. (a + b) >> 1) of the bytes at SRC1 (r1) and SRC2 (r2).
+ @ SIZE (r3) is the byte count. Work is done in steps of 64, then 32, then
+ @ 16 bytes; a remainder of fewer than 16 bytes is not processed.
+ @ Only q0-q3 and q8-q11 are used, so no register is saved or restored
+ @ (q4-q7 would have to be preserved under the ARM procedure call standard).
+ @ NOTE: Offset and pitch must be multiple of 16-bytes in VLC.
+ @ Every load and store below carries a :128 qualifier, i.e. it asserts that
+ @ the address is 16-byte aligned.
+merge8_arm_neon:
+ cmp SIZE, #64
+ blo 2f
+ @ Main loop: 64 bytes per iteration for as long as SIZE >= 64.
+1:
+ @ Prefetch hint for the data 64 bytes past the current source position.
+ pld [SRC1, #64]
+ vld1.u8 {q0-q1}, [SRC1,:128]!
+ pld [SRC2, #64]
+ vld1.u8 {q8-q9}, [SRC2,:128]!
+ vhadd.u8 q0, q0, q8
+ sub SIZE, SIZE, #64
+ vld1.u8 {q2-q3}, [SRC1,:128]!
+ vhadd.u8 q1, q1, q9
+ vld1.u8 {q10-q11}, [SRC2,:128]!
+ vhadd.u8 q2, q2, q10
+ @ The flags set here are consumed by the bhs at the bottom of the loop;
+ @ the NEON instructions in between do not modify the condition flags.
+ cmp SIZE, #64
+ vhadd.u8 q3, q3, q11
+ vst1.u8 {q0-q1}, [DEST,:128]!
+ vst1.u8 {q2-q3}, [DEST,:128]!
+ bhs 1b
+ @ Fewer than 64 bytes left: at most one 32-byte step...
+2:
+ cmp SIZE, #32
+ blo 3f
+ vld1.u8 {q0-q1}, [SRC1,:128]!
+ sub SIZE, SIZE, #32
+ vld1.u8 {q8-q9}, [SRC2,:128]!
+ vhadd.u8 q0, q0, q8
+ vhadd.u8 q1, q1, q9
+ vst1.u8 {q0-q1}, [DEST,:128]!
+ @ ...followed by at most one 16-byte step.
+3:
+ cmp SIZE, #16
+ @ Return straight away when fewer than 16 bytes remain.
+ bxlo lr
+ vld1.u8 {q0}, [SRC1,:128]!
+ @ SIZE is not read again after this update.
+ sub SIZE, SIZE, #16
+ vld1.u8 {q8}, [SRC2,:128]!
+ vhadd.u8 q0, q0, q8
+ vst1.u8 {q0}, [DEST,:128]!
+ bx lr
+
+ .align 2
+ .global merge8_armv6
+ .type merge8_armv6, %function
+ @ merge8_armv6: store to DEST (r0) the per-byte unsigned halving add
+ @ (uhadd8, i.e. (a + b) >> 1) of the bytes at SRC1 (r1) and SRC2 (r2).
+ @ SIZE (r3) is the byte count. Data is handled in groups of 16 bytes
+ @ (four words per source); exactly SIZE / 16 groups are processed and,
+ @ as in merge8_arm_neon, a trailing remainder of fewer than 16 bytes is
+ @ left untouched. In particular SIZE == 0 or a SIZE that is not a multiple
+ @ of 16 terminates normally instead of running past the buffers.
+ @ NOTE(review): ldm/stm are used, so the three pointers are assumed to be
+ @ word-aligned -- confirm against the callers.
+merge8_armv6:
+ @ Nothing to do for fewer than 16 bytes. This must happen before lr is
+ @ pushed and reused as a scratch register below.
+ cmp SIZE, #16
+ bxlo lr
+ push {r4-r9,lr}
+1:
+ @ Prefetch hint for the data 64 bytes past the current source position.
+ pld [SRC1, #64]
+ ldm SRC1!, {r4-r5}
+ pld [SRC2, #64]
+ ldm SRC2!, {r8-r9}
+ sub SIZE, SIZE, #16
+ uhadd8 r4, r4, r8
+ ldm SRC1!, {r6-r7}
+ uhadd8 r5, r5, r9
+ @ ip and lr serve as plain scratch registers; lr was saved by the push.
+ ldm SRC2!, {ip,lr}
+ uhadd8 r6, r6, ip
+ stm DEST!, {r4-r5}
+ uhadd8 r7, r7, lr
+ @ Unsigned test for another full group; stm below leaves the flags alone.
+ cmp SIZE, #16
+ stm DEST!, {r6-r7}
+ @ Restore the saved registers and return once fewer than 16 bytes remain.
+ poplo {r4-r9,pc}
+ b 1b
More information about the vlc-commits
mailing list