[vlc-devel] [PATCH 22/25] deinterlace: rewrite render_blend SSE2 asm
Victorien Le Couviour--Tuffet
victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:33 CEST 2020
8-bit:
before: 60067 cycles
after: 57423 cycles
16-bit:
before: 113490 cycles
after: 111841 cycles
---
modules/video_filter/deinterlace/algo_basic.c | 40 +++++++++++++++----
.../deinterlace/algo_basic_x86.asm | 26 ++++++++++++
2 files changed, 59 insertions(+), 7 deletions(-)
diff --git a/modules/video_filter/deinterlace/algo_basic.c b/modules/video_filter/deinterlace/algo_basic.c
index 45d06a99e3..2ed07079eb 100644
--- a/modules/video_filter/deinterlace/algo_basic.c
+++ b/modules/video_filter/deinterlace/algo_basic.c
@@ -435,16 +435,42 @@ static int RenderBlend##bpc##Bit_##feature( filter_t *p_filter, \
return VLC_SUCCESS; \
}
-#define RENDER_BLEND_SIMD(bpc, feature) \
- RENDER_BLEND(Merge##bpc##Bit##feature, bpc, feature)
+/*
+ * RENDER_BLEND_SIMD(bpc, feature): declares the external routine
+ * vlcpriv_deint_blend_<bpc>bit_<feature>() and defines the wrapper
+ * RenderBlend<bpc>Bit_<feature>() that calls it once per plane of the
+ * input picture.
+ *
+ * For every plane the routine receives the output and input pixel
+ * pointers, their pitches, the visible width expressed in samples (visible
+ * pitch divided by the sample size, bpc / 8) and the visible line count.
+ * Width and height are taken from the output picture. The filter argument
+ * is unused and the wrapper always returns VLC_SUCCESS.
+ *
+ * NOTE(review): the routine is presumably the deint_blend_<bpc>bit symbol
+ * emitted by algo_basic_x86.asm with the build's "vlcpriv" prefix - confirm.
+ */
+#define RENDER_BLEND_SIMD(bpc, feature)                                       \
+                                                                              \
+void vlcpriv_deint_blend_##bpc##bit_##feature(uint8_t *dst,                   \
+                                              ptrdiff_t dst_stride,           \
+                                              uint8_t const *src,             \
+                                              ptrdiff_t src_stride,           \
+                                              unsigned int w,                 \
+                                              unsigned int h);                \
+                                                                              \
+static int RenderBlend##bpc##Bit_##feature(filter_t *filter,                  \
+                                           picture_t *opic,                   \
+                                           picture_t *ipic)                   \
+{                                                                             \
+    VLC_UNUSED(filter);                                                       \
+    for (int plane = 0 ; plane < ipic->i_planes ; ++plane)                    \
+    {                                                                         \
+        uint8_t *dst = opic->p[plane].p_pixels;                               \
+        /* The input picture is only read: keep the pointer const. */        \
+        uint8_t const *src = ipic->p[plane].p_pixels;                         \
+        ptrdiff_t dst_stride = opic->p[plane].i_pitch;                        \
+        ptrdiff_t src_stride = ipic->p[plane].i_pitch;                        \
+        unsigned int w = opic->p[plane].i_visible_pitch / (bpc / 8);          \
+        unsigned int h = opic->p[plane].i_visible_lines;                      \
+        vlcpriv_deint_blend_##bpc##bit_##feature(dst, dst_stride,             \
+                                                 src, src_stride,             \
+                                                 w, h);                       \
+    }                                                                         \
+    return VLC_SUCCESS;                                                       \
+}
+
#define RENDER_BLEND_ARM(bpc, feature) \
RENDER_BLEND(merge##bpc##_##feature, bpc, feature)
RENDER_BLEND(Merge8BitGeneric, 8, c)
RENDER_BLEND(Merge16BitGeneric, 16, c)
-#if defined(CAN_COMPILE_SSE2)
-RENDER_BLEND_SIMD(8, SSE2)
-RENDER_BLEND_SIMD(16, SSE2)
+#if defined(__i386__) || defined(__x86_64__)
+RENDER_BLEND_SIMD(8, sse2)
+RENDER_BLEND_SIMD(16, sse2)
#endif
#if defined(CAN_COMPILE_ARM)
RENDER_BLEND_ARM(8, arm_neon)
@@ -463,9 +489,9 @@ RENDER_BLEND_ARM(16, arm64_neon)
single_pic_renderer_t BlendRenderer(unsigned pixel_size)
{
-#if defined(CAN_COMPILE_SSE2)
+#if defined(__i386__) || defined(__x86_64__)
if (vlc_CPU_SSE2())
- return pixel_size & 1 ? RenderBlend8Bit_SSE2 : RenderBlend16Bit_SSE2;
+ return pixel_size & 1 ? RenderBlend8Bit_sse2 : RenderBlend16Bit_sse2;
else
#endif
#if defined(CAN_COMPILE_ARM)
diff --git a/modules/video_filter/deinterlace/algo_basic_x86.asm b/modules/video_filter/deinterlace/algo_basic_x86.asm
index d792928786..19db8ba08d 100644
--- a/modules/video_filter/deinterlace/algo_basic_x86.asm
+++ b/modules/video_filter/deinterlace/algo_basic_x86.asm
@@ -106,10 +106,36 @@ cglobal deint_mean_%1bit, 6, 8, 2, dst, ds, src, ss, w, h, src1, x
RET
%endmacro
+; DEINT_BLEND <bits per sample>
+; Emits deint_blend_<bits>bit(dst, ds, src, ss, w, h).
+; The first output line is produced by COPY_LINE (defined elsewhere in this
+; file; presumably a plain copy of the first source line - confirm). Every
+; following output line is the AVG of two vertically adjacent source lines:
+; the one at srcq and the one at srcq + ss.
+; w is counted in samples; each inner iteration handles mmsize / pxsize of
+; them using aligned (mova) loads and stores.
+%macro DEINT_BLEND 1
+    %assign pxsize %1 / 8
+cglobal deint_blend_%1bit, 6, 8, 2, dst, ds, src, ss, w, h, src1, x
+    COPY_LINE first_line
+    add           dstq, dsq
+    dec             hd
+    jle .blend_done               ; no second line: nothing left to blend
+.main_loop:
+    lea          src1q, [srcq+ssq] ; lower line of the pair, recomputed per row
+    xor             xd, xd
+.main_loop_x:
+    mova            m0, [srcq +xq*pxsize]
+    AVG             m0, [src1q+xq*pxsize]
+    mova [dstq+xq*pxsize], m0
+    add             xd, mmsize / pxsize
+    cmp             xd, wd
+    jl .main_loop_x
+    add           srcq, ssq
+    add           dstq, dsq
+    dec             hd
+    jg .main_loop
+.blend_done:
+    RET
+%endmacro
+
INIT_XMM sse2
DEINT_LINEAR 8
DEINT_MEAN 8
+DEINT_BLEND 8
DEINT_LINEAR 16
DEINT_MEAN 16
+DEINT_BLEND 16
--
2.24.1
More information about the vlc-devel mailing list