[vlc-devel] [PATCH 20/25] deinterlace: rewrite render_linear SSE2 asm

Victorien Le Couviour--Tuffet victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:31 CEST 2020


8-bit:
   before: 52606 cycles
    after: 47257 cycles
16-bit:
   before: 99002 cycles
    after: 95914 cycles
---
 modules/video_filter/Makefile.am              |  1 +
 modules/video_filter/deinterlace/algo_basic.c | 42 +++++++--
 .../deinterlace/algo_basic_x86.asm            | 92 +++++++++++++++++++
 test/Makefile.am                              |  2 +
 4 files changed, 130 insertions(+), 7 deletions(-)
 create mode 100644 modules/video_filter/deinterlace/algo_basic_x86.asm

diff --git a/modules/video_filter/Makefile.am b/modules/video_filter/Makefile.am
index 51ee92aff1..5b63c30f08 100644
--- a/modules/video_filter/Makefile.am
+++ b/modules/video_filter/Makefile.am
@@ -129,6 +129,7 @@ libdeinterlace_plugin_la_SOURCES = \
 	video_filter/deinterlace/merge.c video_filter/deinterlace/merge.h \
 	video_filter/deinterlace/helpers.c video_filter/deinterlace/helpers.h \
 	video_filter/deinterlace/algo_basic.c video_filter/deinterlace/algo_basic.h \
+	video_filter/deinterlace/algo_basic_x86.asm \
 	video_filter/deinterlace/algo_x.c video_filter/deinterlace/algo_x.h \
 	video_filter/deinterlace/algo_yadif.c video_filter/deinterlace/algo_yadif.h \
 	video_filter/deinterlace/yadif.h \
diff --git a/modules/video_filter/deinterlace/algo_basic.c b/modules/video_filter/deinterlace/algo_basic.c
index 2ef7698c31..8ec649c096 100644
--- a/modules/video_filter/deinterlace/algo_basic.c
+++ b/modules/video_filter/deinterlace/algo_basic.c
@@ -197,16 +197,44 @@ static int RenderLinear##bpc##Bit_##feature( filter_t *p_filter,            \
     return VLC_SUCCESS;                                                     \
 }
 
-#define RENDER_LINEAR_SIMD(bpc, feature) \
-    RENDER_LINEAR(Merge##bpc##Bit##feature, bpc, feature)
+#define RENDER_LINEAR_SIMD(bpc, feature)                                    \
+/* Kernel from algo_basic_x86.asm: strides in bytes, w in pixels. */        \
+void vlcpriv_deint_linear_##bpc##bit_##feature(uint8_t *dst,                \
+                                               ptrdiff_t dst_stride,        \
+                                               uint8_t const *src,          \
+                                               ptrdiff_t src_stride,        \
+                                               unsigned int w, unsigned h,  \
+                                               int field);                  \
+/* Per-plane wrapper; ipic and opic may have different line pitches. */     \
+static int RenderLinear##bpc##Bit_##feature(filter_t *filter,               \
+                                            picture_t *opic,                \
+                                            picture_t *ipic,                \
+                                            int order, int field)           \
+{                                                                           \
+    VLC_UNUSED(filter); VLC_UNUSED(order);                                  \
+    for (int plane = 0 ; plane < ipic->i_planes ; ++plane)                  \
+    {                                                                       \
+        void *dst = opic->p[plane].p_pixels;                                \
+        void *src = ipic->p[plane].p_pixels;                                \
+        ptrdiff_t dst_stride = opic->p[plane].i_pitch;                      \
+        ptrdiff_t src_stride = ipic->p[plane].i_pitch;                      \
+        unsigned int w = opic->p[plane].i_visible_pitch / (bpc / 8);        \
+        unsigned int h = opic->p[plane].i_visible_lines;                    \
+        vlcpriv_deint_linear_##bpc##bit_##feature(dst, dst_stride,          \
+                                                  src, src_stride,          \
+                                                  w, h, field);             \
+    }                                                                       \
+    return VLC_SUCCESS;                                                     \
+}
+
 #define RENDER_LINEAR_ARM(bpc, feature) \
     RENDER_LINEAR(merge##bpc##_##feature, bpc, feature)
 
 RENDER_LINEAR(Merge8BitGeneric, 8, c)
 RENDER_LINEAR(Merge16BitGeneric, 16, c)
-#if defined(CAN_COMPILE_SSE2)
-RENDER_LINEAR_SIMD(8, SSE2)
-RENDER_LINEAR_SIMD(16, SSE2)
+#if defined(__i386__) || defined(__x86_64__)
+RENDER_LINEAR_SIMD(8, sse2)
+RENDER_LINEAR_SIMD(16, sse2)
 #endif
 #if defined(CAN_COMPILE_ARM)
 RENDER_LINEAR_ARM(8, arm_neon)
@@ -225,9 +253,9 @@ RENDER_LINEAR_ARM(16, arm64_neon)
 
 ordered_renderer_t LinearRenderer(unsigned pixel_size)
 {
-#if defined(CAN_COMPILE_SSE2)
+#if defined(__i386__) || defined(__x86_64__)
     if (vlc_CPU_SSE2())
-        return pixel_size & 1 ? RenderLinear8Bit_SSE2 : RenderLinear16Bit_SSE2;
+        return pixel_size & 1 ? RenderLinear8Bit_sse2 : RenderLinear16Bit_sse2;
     else
 #endif
 #if defined(CAN_COMPILE_ARM)
diff --git a/modules/video_filter/deinterlace/algo_basic_x86.asm b/modules/video_filter/deinterlace/algo_basic_x86.asm
new file mode 100644
index 0000000000..bccef7aaf3
--- /dev/null
+++ b/modules/video_filter/deinterlace/algo_basic_x86.asm
@@ -0,0 +1,92 @@
+;*****************************************************************************
+;* algo_basic_x86.asm : Basic algorithms for the VLC deinterlacer
+;*****************************************************************************
+;* Copyright (C) 2019 VideoLAN
+;*
+;* Author: Victorien Le Couviour--Tuffet victorien at videolan.org
+;*
+;* This program is free software; you can redistribute it and/or modify it
+;* under the terms of the GNU Lesser General Public License as published by
+;* the Free Software Foundation; either version 2.1 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public License
+;* along with this program; if not, write to the Free Software Foundation,
+;* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION .text
+
+%macro AVG 2                        ; %1 = destination and first operand, %2 = second operand
+ %if pxsize == 1
+    pavgb               %1, %2      ; pxsize == 1: average per byte
+ %else
+    pavgw               %1, %2      ; any other pxsize: average per 16-bit word
+ %endif
+%endmacro
+
+%macro COPY_LINE 1                  ; copy one line from src to dst; %1 = name for the local loop label
+    xor                 xd, xd      ; x = 0 (pixel index within the line)
+.%1:
+    mova                m0, [srcq+xq*pxsize] ; NOTE(review): mova presumably requires aligned rows - confirm
+    mova  [dstq+xq*pxsize], m0
+    add                 xd, mmsize / pxsize ; advance by the number of pixels held in one register
+    cmp                 xd, wd
+    jl .%1                          ; body always runs once; copies whole registers until x >= w (signed)
+%endmacro
+
+%macro DEINT_LINEAR 1               ; %1 = bits per sample; NOTE(review): 11 GPRs requested - confirm this assembles on x86-32
+cglobal deint_linear_%1bit, 7, 11, 2, dst, ds, src, ss, w, h, field, \
+                                      dst_tmp, src_tmp, src2, x
+ %assign pxsize %1 / 8
+    or          fieldd, 0           ; 0 = keep top | 1 = keep bottom
+    jz .main_loop                   ; field == 0: go straight to the main loop
+    COPY_LINE skip_top_field
+    dec                 hd          ; the line copied above is done
+    add               srcq, ssq     ; strides are added to the pointers as byte offsets
+    add               dstq, dsq
+.main_loop:
+    sub                 hd, 2       ; each iteration writes two output lines
+    jle .last_line_start            ; two or fewer lines left: no more averaging
+    lea              src2q, [srcq+ssq*2] ; src2 = source line two rows below the current one
+    COPY_LINE keep_field
+    add               dstq, dsq     ; dst now points at the line to be interpolated
+    mov           src_tmpq, srcq
+    mov           dst_tmpq, dstq
+    xor                 xd, xd
+.main_loop_x:
+    mova                m0, [src_tmpq] ; current source line
+    AVG                 m0, [src2q]
+    mova        [dst_tmpq], m0      ; interpolated line = average of the lines at src and src + 2 * ss
+    add           src_tmpq, mmsize
+    add           dst_tmpq, mmsize
+    add              src2q, mmsize
+    add                 xd, mmsize / pxsize
+    cmp                 xd, wd
+    jl .main_loop_x                 ; loop while x < w (signed compare)
+    lea               srcq, [srcq+ssq*2] ; skip the source line that was not used
+    add               dstq, dsq
+    jmp .main_loop
+.last_line_start:
+    COPY_LINE last_line
+    or              fieldd, 0
+    jnz .ret                        ; field != 0: the line just copied was the last one
+    add               srcq, ssq     ; field == 0: one more line is copied straight from the source
+    add               dstq, dsq
+    COPY_LINE skip_bottom_field
+.ret:
+    RET
+%endmacro
+
+INIT_XMM sse2
+
+DEINT_LINEAR 8
+
+DEINT_LINEAR 16
diff --git a/test/Makefile.am b/test/Makefile.am
index fc4cb2dc22..bc6cdfde47 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -175,6 +175,7 @@ test_modules_demux_ts_pes_SOURCES = modules/demux/ts_pes.c \
 test_modules_vfilter_deinterlace_simd_SOURCES = \
 				modules/video_filter/deinterlace.c \
 				$(top_srcdir)/modules/video_filter/deinterlace/algo_basic.c \
+				$(top_srcdir)/modules/video_filter/deinterlace/algo_basic_x86.asm \
 				$(top_srcdir)/modules/video_filter/deinterlace/merge.c
 test_modules_vfilter_deinterlace_simd_LDADD = $(LIBVLCCORE)
 
@@ -183,6 +184,7 @@ test_bench_asm_SOURCES = bench_asm/main.c \
 						 bench_asm/deinterlacing.c \
 						 $(top_srcdir)/modules/packetizer/startcode.asm \
 						 $(top_srcdir)/modules/video_filter/deinterlace/algo_basic.c \
+						 $(top_srcdir)/modules/video_filter/deinterlace/algo_basic_x86.asm \
 						 $(top_srcdir)/modules/video_filter/deinterlace/merge.c
 test_bench_asm_LDADD = $(LIBVLCCORE)
 
-- 
2.24.1



More information about the vlc-devel mailing list