[vlc-devel] [PATCH 21/25] deinterlace: rewrite render_mean SSE2 asm

Victorien Le Couviour--Tuffet victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:32 CEST 2020


8-bit:
   before: 35443 cycles
    after: 34230 cycles
16-bit:
   before: 75695 cycles
    after: 68427 cycles
---
 modules/video_filter/deinterlace/algo_basic.c | 40 +++++++++++++++----
 .../deinterlace/algo_basic_x86.asm            | 23 +++++++++++
 2 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/modules/video_filter/deinterlace/algo_basic.c b/modules/video_filter/deinterlace/algo_basic.c
index 8ec649c096..45d06a99e3 100644
--- a/modules/video_filter/deinterlace/algo_basic.c
+++ b/modules/video_filter/deinterlace/algo_basic.c
@@ -315,16 +315,42 @@ static int RenderMean##bpc##Bit_##feature( filter_t *p_filter,              \
     return VLC_SUCCESS;                                                     \
 }
 
-#define RENDER_MEAN_SIMD(bpc, feature) \
-    RENDER_MEAN(Merge##bpc##Bit##feature, bpc, feature)
+#define RENDER_MEAN_SIMD(bpc, feature)                                      \
+/* Assembly kernel; presumably DEINT_MEAN in algo_basic_x86.asm. */         \
+void vlcpriv_deint_mean_##bpc##bit_##feature(uint8_t *dst,                  \
+                                             ptrdiff_t dst_stride,          \
+                                             uint8_t const *src,            \
+                                             ptrdiff_t src_stride,          \
+                                             unsigned int w, unsigned h);   \
+/* Run the kernel on every plane; w is in samples, h in output lines. */    \
+static int RenderMean##bpc##Bit_##feature(filter_t *filter,                 \
+                                          picture_t *opic,                  \
+                                          picture_t *ipic)                  \
+{                                                                           \
+    VLC_UNUSED(filter);                                                     \
+    for (int plane = 0 ; plane < ipic->i_planes ; ++plane)                  \
+    {                                                                       \
+        uint8_t *dst = opic->p[plane].p_pixels;                             \
+        const uint8_t *src = ipic->p[plane].p_pixels;                       \
+        ptrdiff_t dst_stride = opic->p[plane].i_pitch;                      \
+        ptrdiff_t src_stride = ipic->p[plane].i_pitch;                      \
+        unsigned int w = opic->p[plane].i_visible_pitch / ((bpc) / 8);      \
+        unsigned int h = opic->p[plane].i_visible_lines;                    \
+        vlcpriv_deint_mean_##bpc##bit_##feature(dst, dst_stride,            \
+                                                src, src_stride,            \
+                                                w, h);                      \
+    }                                                                       \
+    return VLC_SUCCESS;                                                     \
+}
+
 #define RENDER_MEAN_ARM(bpc, feature) \
     RENDER_MEAN(merge##bpc##_##feature, bpc, feature)
 
 RENDER_MEAN(Merge8BitGeneric, 8, c)
 RENDER_MEAN(Merge16BitGeneric, 16, c)
-#if defined(CAN_COMPILE_SSE2)
-RENDER_MEAN_SIMD(8, SSE2)
-RENDER_MEAN_SIMD(16, SSE2)
+#if defined(__i386__) || defined(__x86_64__)
+RENDER_MEAN_SIMD(8, sse2)
+RENDER_MEAN_SIMD(16, sse2)
 #endif
 #if defined(CAN_COMPILE_ARM)
 RENDER_MEAN_ARM(8, arm_neon)
@@ -343,9 +369,9 @@ RENDER_MEAN_ARM(16, arm64_neon)
 
 single_pic_renderer_t MeanRenderer(unsigned pixel_size)
 {
-#if defined(CAN_COMPILE_SSE2)
+#if defined(__i386__) || defined(__x86_64__)
     if (vlc_CPU_SSE2())
-        return pixel_size & 1 ? RenderMean8Bit_SSE2 : RenderMean16Bit_SSE2;
+        return pixel_size & 1 ? RenderMean8Bit_sse2 : RenderMean16Bit_sse2;
     else
 #endif
 #if defined(CAN_COMPILE_ARM)
diff --git a/modules/video_filter/deinterlace/algo_basic_x86.asm b/modules/video_filter/deinterlace/algo_basic_x86.asm
index bccef7aaf3..d792928786 100644
--- a/modules/video_filter/deinterlace/algo_basic_x86.asm
+++ b/modules/video_filter/deinterlace/algo_basic_x86.asm
@@ -85,8 +85,31 @@ cglobal deint_linear_%1bit, 7, 11, 2, dst, ds, src, ss, w, h, field, \
     RET
 %endmacro
 
+%macro DEINT_MEAN 1 ; arg: bits per sample (8 or 16); averages line pairs
+ %assign pxsize %1 / 8 ; bytes per sample
+cglobal deint_mean_%1bit, 6, 8, 2, dst, ds, src, ss, w, h, src1, x
+    lea              src1q, [srcq+ssq]      ; src1 = source line after src, set once
+.main_loop:                                 ; one output line per iteration
+    xor                 xd, xd               ; x = sample offset within the line
+.main_loop_x:
+    mova                m0, [srcq +xq*pxsize]
+    AVG                 m0, [src1q+xq*pxsize] ; AVG defined elsewhere (presumably pavgb/pavgw)
+    mova  [dstq+xq*pxsize], m0
+    add                 xd, mmsize / pxsize  ; advance by one vector of samples
+    cmp                 xd, wd               ; may run past w up to a vector boundary
+    jl .main_loop_x
+    lea               srcq, [srcq+ssq*2]     ; each output line consumes two source lines
+    lea              src1q, [src1q+ssq*2]    ; keeps src1 == src + ss for the next line
+    add               dstq, dsq
+    dec                 hd
+    jg .main_loop
+    RET
+%endmacro
+
 INIT_XMM sse2
 
 DEINT_LINEAR 8
+DEINT_MEAN 8
 
 DEINT_LINEAR 16
+DEINT_MEAN 16
-- 
2.24.1



More information about the vlc-devel mailing list