[vlc-devel] [PATCH 22/25] deinterlace: rewrite render_blend SSE2 asm

Victorien Le Couviour--Tuffet victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:33 CEST 2020


8-bit:
   before: 60067 cycles
    after: 57423 cycles
16-bit:
   before: 113490 cycles
    after: 111841 cycles
---
 modules/video_filter/deinterlace/algo_basic.c | 40 +++++++++++++++----
 .../deinterlace/algo_basic_x86.asm            | 26 ++++++++++++
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/modules/video_filter/deinterlace/algo_basic.c b/modules/video_filter/deinterlace/algo_basic.c
index 45d06a99e3..2ed07079eb 100644
--- a/modules/video_filter/deinterlace/algo_basic.c
+++ b/modules/video_filter/deinterlace/algo_basic.c
@@ -435,16 +435,42 @@ static int RenderBlend##bpc##Bit_##feature( filter_t *p_filter,             \
     return VLC_SUCCESS;                                                     \
 }
 
-#define RENDER_BLEND_SIMD(bpc, feature) \
-    RENDER_BLEND(Merge##bpc##Bit##feature, bpc, feature)
+#define RENDER_BLEND_SIMD(bpc, feature)                                     \
+/* External blend kernel: declared here, defined outside this macro. */     \
+void vlcpriv_deint_blend_##bpc##bit_##feature(uint8_t *dst,                 \
+                                              ptrdiff_t dst_stride,         \
+                                              uint8_t const *src,           \
+                                              ptrdiff_t src_stride,         \
+                                              unsigned int w, unsigned h);  \
+/* Wrapper: runs the kernel on each plane of ipic, writing into opic. */    \
+static int RenderBlend##bpc##Bit_##feature(filter_t *filter,                \
+                                           picture_t *opic,                 \
+                                           picture_t *ipic)                 \
+{ /* filter is unused; the result is always VLC_SUCCESS. */                 \
+    VLC_UNUSED(filter);                                                     \
+    for (int plane = 0 ; plane < ipic->i_planes ; ++plane)                  \
+    { /* w = visible pitch / (bpc / 8); presumably bytes -> samples */      \
+        uint8_t *dst = opic->p[plane].p_pixels;                             \
+        uint8_t *src = ipic->p[plane].p_pixels;                             \
+        ptrdiff_t dst_stride = opic->p[plane].i_pitch;                      \
+        ptrdiff_t src_stride = ipic->p[plane].i_pitch;                      \
+        unsigned int w = opic->p[plane].i_visible_pitch / (bpc / 8);        \
+        unsigned int h = opic->p[plane].i_visible_lines;                    \
+        vlcpriv_deint_blend_##bpc##bit_##feature(dst, dst_stride,           \
+                                                 src, src_stride,           \
+                                                 w, h);                     \
+    }                                                                       \
+    return VLC_SUCCESS;                                                     \
+}
+
 #define RENDER_BLEND_ARM(bpc, feature) \
     RENDER_BLEND(merge##bpc##_##feature, bpc, feature)
 
 RENDER_BLEND(Merge8BitGeneric, 8, c)
 RENDER_BLEND(Merge16BitGeneric, 16, c)
-#if defined(CAN_COMPILE_SSE2)
-RENDER_BLEND_SIMD(8, SSE2)
-RENDER_BLEND_SIMD(16, SSE2)
+#if defined(__i386__) || defined(__x86_64__)
+RENDER_BLEND_SIMD(8, sse2)
+RENDER_BLEND_SIMD(16, sse2)
 #endif
 #if defined(CAN_COMPILE_ARM)
 RENDER_BLEND_ARM(8, arm_neon)
@@ -463,9 +489,9 @@ RENDER_BLEND_ARM(16, arm64_neon)
 
 single_pic_renderer_t BlendRenderer(unsigned pixel_size)
 {
-#if defined(CAN_COMPILE_SSE2)
+#if defined(__i386__) || defined(__x86_64__)
     if (vlc_CPU_SSE2())
-        return pixel_size & 1 ? RenderBlend8Bit_SSE2 : RenderBlend16Bit_SSE2;
+        return pixel_size & 1 ? RenderBlend8Bit_sse2 : RenderBlend16Bit_sse2;
     else
 #endif
 #if defined(CAN_COMPILE_ARM)
diff --git a/modules/video_filter/deinterlace/algo_basic_x86.asm b/modules/video_filter/deinterlace/algo_basic_x86.asm
index d792928786..19db8ba08d 100644
--- a/modules/video_filter/deinterlace/algo_basic_x86.asm
+++ b/modules/video_filter/deinterlace/algo_basic_x86.asm
@@ -106,10 +106,36 @@ cglobal deint_mean_%1bit, 6, 8, 2, dst, ds, src, ss, w, h, src1, x
     RET
 %endmacro
 
+%macro DEINT_BLEND 1 ; %1: bit-depth suffix (instantiated with 8 and 16); pxsize = %1 / 8
+ %assign pxsize %1 / 8
+cglobal deint_blend_%1bit, 6, 8, 2, dst, ds, src, ss, w, h, src1, x
+    COPY_LINE first_line ; COPY_LINE is defined outside this hunk; presumably emits the first row as-is (TODO confirm)
+    add               dstq, dsq ; dst -> next output row
+    dec                 hd ; one row fewer left for the loop below
+.main_loop: ; per output row: dst = AVG(row at src, row at src + ss)
+    lea              src1q, [srcq+ssq] ; src1 = src advanced by one source stride
+    xor                 xd, xd ; x = 0
+.main_loop_x:
+    mova                m0, [srcq +xq*pxsize] ; load one vector from the row at src
+    AVG                 m0, [src1q+xq*pxsize] ; combine with the row at src1 (AVG macro is defined elsewhere)
+    mova  [dstq+xq*pxsize], m0
+    add                 xd, mmsize / pxsize ; step x by mmsize / pxsize elements
+    cmp                 xd, wd
+    jl .main_loop_x ; whole vectors only: may run past w when w is not a multiple of the step
+    add               srcq, ssq ; src -> next row
+    add              src1q, ssq ; NOTE(review): looks redundant, src1q is recomputed by the lea at .main_loop
+    add               dstq, dsq
+    dec                 hd
+    jg .main_loop ; NOTE(review): body runs before this check, so h <= 1 still processes one row; confirm callers pass h >= 2
+    RET
+%endmacro
+
 INIT_XMM sse2
 
 DEINT_LINEAR 8
 DEINT_MEAN 8
+DEINT_BLEND 8
 
 DEINT_LINEAR 16
 DEINT_MEAN 16
+DEINT_BLEND 16
-- 
2.24.1



More information about the vlc-devel mailing list