[vlc-devel] [PATCH 20/25] deinterlace: rewrite render_linear SSE2 asm
Victorien Le Couviour--Tuffet
victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:31 CEST 2020
8-bit:
before: 52606 cycles
after: 47257 cycles
16-bit:
before: 99002 cycles
after: 95914 cycles
---
modules/video_filter/Makefile.am | 1 +
modules/video_filter/deinterlace/algo_basic.c | 42 +++++++--
.../deinterlace/algo_basic_x86.asm | 92 +++++++++++++++++++
test/Makefile.am | 2 +
4 files changed, 130 insertions(+), 7 deletions(-)
create mode 100644 modules/video_filter/deinterlace/algo_basic_x86.asm
diff --git a/modules/video_filter/Makefile.am b/modules/video_filter/Makefile.am
index 51ee92aff1..5b63c30f08 100644
--- a/modules/video_filter/Makefile.am
+++ b/modules/video_filter/Makefile.am
@@ -129,6 +129,7 @@ libdeinterlace_plugin_la_SOURCES = \
video_filter/deinterlace/merge.c video_filter/deinterlace/merge.h \
video_filter/deinterlace/helpers.c video_filter/deinterlace/helpers.h \
video_filter/deinterlace/algo_basic.c video_filter/deinterlace/algo_basic.h \
+ video_filter/deinterlace/algo_basic_x86.asm \
video_filter/deinterlace/algo_x.c video_filter/deinterlace/algo_x.h \
video_filter/deinterlace/algo_yadif.c video_filter/deinterlace/algo_yadif.h \
video_filter/deinterlace/yadif.h \
diff --git a/modules/video_filter/deinterlace/algo_basic.c b/modules/video_filter/deinterlace/algo_basic.c
index 2ef7698c31..8ec649c096 100644
--- a/modules/video_filter/deinterlace/algo_basic.c
+++ b/modules/video_filter/deinterlace/algo_basic.c
@@ -197,16 +197,44 @@ static int RenderLinear##bpc##Bit_##feature( filter_t *p_filter, \
return VLC_SUCCESS; \
}
-#define RENDER_LINEAR_SIMD(bpc, feature) \
- RENDER_LINEAR(Merge##bpc##Bit##feature, bpc, feature)
+#define RENDER_LINEAR_SIMD(bpc, feature)                                     \
+    /* Prototype of the external kernel (see algo_basic_x86.asm). */         \
+void vlcpriv_deint_linear_##bpc##bit_##feature(uint8_t *dst,                 \
+                                               ptrdiff_t dst_stride,         \
+                                               uint8_t const *src,           \
+                                               ptrdiff_t src_stride,         \
+                                               unsigned int w, unsigned h,   \
+                                               int field);                   \
+    /* Runs the kernel on each plane; w is in samples, h in lines. */        \
+static int RenderLinear##bpc##Bit_##feature(filter_t *filter,                \
+                                            picture_t *opic,                 \
+                                            picture_t *ipic,                 \
+                                            int order, int field)            \
+{                                                                            \
+    VLC_UNUSED(filter); VLC_UNUSED(order);                                   \
+    for (int plane = 0 ; plane < ipic->i_planes ; ++plane)                   \
+    {                                                                        \
+        void *dst = opic->p[plane].p_pixels;                                 \
+        void *src = ipic->p[plane].p_pixels;                                 \
+        ptrdiff_t dst_stride = opic->p[plane].i_pitch;                       \
+        ptrdiff_t src_stride = ipic->p[plane].i_pitch; /* input pitch */     \
+        unsigned int w = opic->p[plane].i_visible_pitch / (bpc / 8);         \
+        unsigned int h = opic->p[plane].i_visible_lines;                     \
+        vlcpriv_deint_linear_##bpc##bit_##feature(dst, dst_stride,           \
+                                                  src, src_stride,           \
+                                                  w, h, field);              \
+    }                                                                        \
+    return VLC_SUCCESS;                                                      \
+}
+
#define RENDER_LINEAR_ARM(bpc, feature) \
RENDER_LINEAR(merge##bpc##_##feature, bpc, feature)
RENDER_LINEAR(Merge8BitGeneric, 8, c)
RENDER_LINEAR(Merge16BitGeneric, 16, c)
-#if defined(CAN_COMPILE_SSE2)
-RENDER_LINEAR_SIMD(8, SSE2)
-RENDER_LINEAR_SIMD(16, SSE2)
+#if defined(__i386__) || defined(__x86_64__)
+RENDER_LINEAR_SIMD(8, sse2)
+RENDER_LINEAR_SIMD(16, sse2)
#endif
#if defined(CAN_COMPILE_ARM)
RENDER_LINEAR_ARM(8, arm_neon)
@@ -225,9 +253,9 @@ RENDER_LINEAR_ARM(16, arm64_neon)
ordered_renderer_t LinearRenderer(unsigned pixel_size)
{
-#if defined(CAN_COMPILE_SSE2)
+#if defined(__i386__) || defined(__x86_64__)
if (vlc_CPU_SSE2())
- return pixel_size & 1 ? RenderLinear8Bit_SSE2 : RenderLinear16Bit_SSE2;
+ return pixel_size & 1 ? RenderLinear8Bit_sse2 : RenderLinear16Bit_sse2;
else
#endif
#if defined(CAN_COMPILE_ARM)
diff --git a/modules/video_filter/deinterlace/algo_basic_x86.asm b/modules/video_filter/deinterlace/algo_basic_x86.asm
new file mode 100644
index 0000000000..bccef7aaf3
--- /dev/null
+++ b/modules/video_filter/deinterlace/algo_basic_x86.asm
@@ -0,0 +1,92 @@
+;*****************************************************************************
+;* algo_basic_x86.asm : Basic algorithms for the VLC deinterlacer
+;*****************************************************************************
+;* Copyright (C) 2019 VideoLAN
+;*
+;* Author: Victorien Le Couviour--Tuffet victorien at videolan.org
+;*
+;* This program is free software; you can redistribute it and/or modify it
+;* under the terms of the GNU Lesser General Public License as published by
+;* the Free Software Foundation; either version 2.1 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public License
+;* along with this program; if not, write to the Free Software Foundation,
+;* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION .text
+
+%macro AVG 2 ; %1 = packed average of %1 and %2, lane width chosen by pxsize
+ %if pxsize == 1
+ pavgb %1, %2 ; one byte per sample: average byte lanes
+ %else
+ pavgw %1, %2 ; otherwise (two bytes per sample here): average word lanes
+ %endif
+%endmacro
+
+%macro COPY_LINE 1 ; copy one line from [srcq] to [dstq]; %1 = name for the loop's local label
+ xor xd, xd ; x = 0 (index in samples)
+.%1:
+ mova m0, [srcq+xq*pxsize] ; load one full vector; NOTE(review): mova presumably needs mmsize-aligned rows - confirm
+ mova [dstq+xq*pxsize], m0 ; whole-vector store, so a row may be written past w up to the next vector boundary
+ add xd, mmsize / pxsize ; advance by the number of samples held in one vector
+ cmp xd, wd
+ jl .%1 ; repeat while x < w (signed compare); the body always runs at least once
+%endmacro
+
+%macro DEINT_LINEAR 1 ; emits deint_linear_<%1>bit for one plane; %1 = bits per sample (8 or 16)
+cglobal deint_linear_%1bit, 7, 11, 2, dst, ds, src, ss, w, h, field, \
+ dst_tmp, src_tmp, src2, x ; NOTE(review): 11 GPRs requested - x86inc exposes only 7 on x86-32; confirm the i386 build
+ %assign pxsize %1 / 8 ; bytes per sample, consumed by AVG and COPY_LINE
+ or fieldd, 0 ; only sets ZF from field: 0 = keep top | 1 = keep bottom
+ jz .main_loop
+ COPY_LINE skip_top_field ; field != 0: the first output line is a plain copy of the first source line
+ dec hd
+ add srcq, ssq
+ add dstq, dsq
+.main_loop: ; each pass writes two output lines: one copied, one interpolated
+ sub hd, 2
+ jle .last_line_start ; two or fewer lines were left: finish with plain copies
+ lea src2q, [srcq+ssq*2] ; src2 = source line two rows below the current one
+ COPY_LINE keep_field ; copied line: current source line as-is
+ add dstq, dsq
+ mov src_tmpq, srcq ; walking pointers for the interpolated line
+ mov dst_tmpq, dstq
+ xor xd, xd
+.main_loop_x:
+ mova m0, [src_tmpq]
+ AVG m0, [src2q] ; interpolated samples = average of the source lines at src and src + 2*ss
+ mova [dst_tmpq], m0
+ add src_tmpq, mmsize
+ add dst_tmpq, mmsize
+ add src2q, mmsize
+ add xd, mmsize / pxsize
+ cmp xd, wd
+ jl .main_loop_x ; signed compare of x against w
+ lea srcq, [srcq+ssq*2] ; source advances two lines per pass
+ add dstq, dsq
+ jmp .main_loop
+.last_line_start:
+ COPY_LINE last_line
+ or fieldd, 0 ; re-test field (ZF only)
+ jnz .ret ; field != 0: nothing more to write
+ add srcq, ssq
+ add dstq, dsq
+ COPY_LINE skip_bottom_field ; field == 0: one more line is copied straight from the source
+.ret:
+ RET
+%endmacro
+
+INIT_XMM sse2 ; x86inc setup for the functions below; the C caller links them with an _sse2 suffix
+
+DEINT_LINEAR 8 ; instantiate deint_linear_8bit (one byte per sample)
+
+DEINT_LINEAR 16 ; instantiate deint_linear_16bit (two bytes per sample)
diff --git a/test/Makefile.am b/test/Makefile.am
index fc4cb2dc22..bc6cdfde47 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -175,6 +175,7 @@ test_modules_demux_ts_pes_SOURCES = modules/demux/ts_pes.c \
test_modules_vfilter_deinterlace_simd_SOURCES = \
modules/video_filter/deinterlace.c \
$(top_srcdir)/modules/video_filter/deinterlace/algo_basic.c \
+ $(top_srcdir)/modules/video_filter/deinterlace/algo_basic_x86.asm \
$(top_srcdir)/modules/video_filter/deinterlace/merge.c
test_modules_vfilter_deinterlace_simd_LDADD = $(LIBVLCCORE)
@@ -183,6 +184,7 @@ test_bench_asm_SOURCES = bench_asm/main.c \
bench_asm/deinterlacing.c \
$(top_srcdir)/modules/packetizer/startcode.asm \
$(top_srcdir)/modules/video_filter/deinterlace/algo_basic.c \
+ $(top_srcdir)/modules/video_filter/deinterlace/algo_basic_x86.asm \
$(top_srcdir)/modules/video_filter/deinterlace/merge.c
test_bench_asm_LDADD = $(LIBVLCCORE)
--
2.24.1
More information about the vlc-devel
mailing list