[vlc-devel] commit: deinterlace: preliminary merge ARM NEON optimization ( Rémi Denis-Courmont )
git version control
git at videolan.org
Sat Sep 12 18:03:29 CEST 2009
vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Sat Sep 12 19:00:49 2009 +0300| [a1204db23c3f2bea126c7cb7882d04a0d3236e72] | committer: Rémi Denis-Courmont
deinterlace: preliminary merge ARM NEON optimization
(It wouldn't hurt to make VLC align pixel lines more strongly, but I am
not sure if this is feasible)
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=a1204db23c3f2bea126c7cb7882d04a0d3236e72
---
modules/video_filter/deinterlace.c | 69 ++++++++++++++++++++++++++++++++++++
1 files changed, 69 insertions(+), 0 deletions(-)
diff --git a/modules/video_filter/deinterlace.c b/modules/video_filter/deinterlace.c
index 7a92d93..1de201e 100644
--- a/modules/video_filter/deinterlace.c
+++ b/modules/video_filter/deinterlace.c
@@ -93,6 +93,9 @@ static void EndMMX ( void );
#if defined(CAN_COMPILE_3DNOW)
static void End3DNow ( void );
#endif
+#if defined __ARM_NEON__
+static void MergeNEON (void *, const void *, const void *, size_t);
+#endif
static void SetFilterMethod( vout_thread_t *p_vout, const char *psz_method );
static vout_thread_t *SpawnRealVout( vout_thread_t *p_vout );
@@ -245,6 +248,14 @@ static int Create( vlc_object_t *p_this )
}
else
#endif
+#if defined __ARM_NEON__
+ if( vlc_CPU() & CPU_CAPABILITY_NEON )
+ {
+ p_sys->pf_merge = MergeNEON;
+ p_sys->pf_end_merge = NULL;
+ }
+ else
+#endif
{
p_sys->pf_merge = MergeGeneric;
p_sys->pf_end_merge = NULL;
@@ -1119,6 +1130,64 @@ static void MergeAltivec( void *_p_dest, const void *_p_s1,
}
#endif
+#ifdef __ARM_NEON__
+/**
+ * Merges two byte buffers into one using ARM NEON halving adds.
+ *
+ * Every output byte of the vectorized run is vhadd.u8 of the matching input
+ * bytes, i.e. (a + b) >> 1 without intermediate overflow. The unaligned head
+ * and the tail shorter than one vector iteration are delegated to
+ * MergeGeneric() (defined elsewhere in this file; presumably it computes the
+ * same average - confirm against its definition).
+ *
+ * @param out destination buffer, n bytes; must not alias in1 or in2
+ * @param in1 first source buffer, n bytes
+ * @param in2 second source buffer, n bytes
+ * @param n   number of bytes to merge
+ */
+static void MergeNEON (void *restrict out, const void *in1,
+                       const void *in2, size_t n)
+{
+    uint8_t *outp = out;
+    const uint8_t *in1p = in1;
+    const uint8_t *in2p = in2;
+    /* Number of bytes needed to bring outp up to the next 16-byte boundary
+     * (0 if it is already aligned). The stores below use the :128 alignment
+     * hint, so outp really has to be aligned once the head is done. */
+    size_t mis = (16 - (((uintptr_t)outp) & 15)) & 15;
+
+    /* Never consume more than the caller gave us (keeps n from wrapping). */
+    if (mis > n)
+        mis = n;
+
+    if (mis)
+    {
+        MergeGeneric (outp, in1p, in2p, mis);
+        outp += mis;
+        in1p += mis;
+        in2p += mis;
+        n -= mis;
+    }
+
+    /* One asm iteration loads 2x64 bytes and stores 64 bytes, so the
+     * vectorized run must be a multiple of 64 bytes, not 16. */
+    uint8_t *end = outp + (n & ~(size_t)63);
+
+    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
+    {
+        /* At least one source is not 16-byte aligned: no load hints. */
+        while (outp < end)
+            asm volatile (
+                "vld1.u8 {q0-q1}, [%[in1]]!\n"
+                "vld1.u8 {q2-q3}, [%[in2]]!\n"
+                "vhadd.u8 q4, q0, q2\n"
+                "vld1.u8 {q6-q7}, [%[in1]]!\n"
+                "vhadd.u8 q5, q1, q3\n"
+                "vld1.u8 {q8-q9}, [%[in2]]!\n"
+                "vhadd.u8 q10, q6, q8\n"
+                "vhadd.u8 q11, q7, q9\n"
+                "vst1.u8 {q4-q5}, [%[out],:128]!\n"
+                "vst1.u8 {q10-q11}, [%[out],:128]!\n"
+                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
+                :
+                /* Every vector register written above must be listed;
+                 * q4-q7 are callee-saved and would otherwise be corrupted
+                 * behind the compiler's back. */
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                  "q8", "q9", "q10", "q11", "memory");
+    }
+    else
+    {
+        /* Both sources are 16-byte aligned: hint the first pair of loads. */
+        while (outp < end)
+            asm volatile (
+                "vld1.u8 {q0-q1}, [%[in1],:128]!\n"
+                "vld1.u8 {q2-q3}, [%[in2],:128]!\n"
+                "vhadd.u8 q4, q0, q2\n"
+                "vld1.u8 {q6-q7}, [%[in1]]!\n"
+                "vhadd.u8 q5, q1, q3\n"
+                "vld1.u8 {q8-q9}, [%[in2]]!\n"
+                "vhadd.u8 q10, q6, q8\n"
+                "vhadd.u8 q11, q7, q9\n"
+                "vst1.u8 {q4-q5}, [%[out],:128]!\n"
+                "vst1.u8 {q10-q11}, [%[out],:128]!\n"
+                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                  "q8", "q9", "q10", "q11", "memory");
+    }
+
+    /* Whatever did not fill a whole 64-byte iteration. */
+    n &= 63;
+    if (n)
+        MergeGeneric (outp, in1p, in2p, n);
+}
+#endif
+
/*****************************************************************************
* RenderX: This algo works on a 8x8 block basic, it copies the top field
* and apply a process to recreate the bottom field :
More information about the vlc-devel mailing list