[vlc-devel] [PATCH 21/25] deinterlace: rewrite render_mean SSE2 asm
Victorien Le Couviour--Tuffet
victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:32 CEST 2020
8-bit:
before: 35443 cycles
after: 34230 cycles
16-bit:
before: 75695 cycles
after: 68427 cycles
---
modules/video_filter/deinterlace/algo_basic.c | 40 +++++++++++++++----
.../deinterlace/algo_basic_x86.asm | 23 +++++++++++
2 files changed, 56 insertions(+), 7 deletions(-)
diff --git a/modules/video_filter/deinterlace/algo_basic.c b/modules/video_filter/deinterlace/algo_basic.c
index 8ec649c096..45d06a99e3 100644
--- a/modules/video_filter/deinterlace/algo_basic.c
+++ b/modules/video_filter/deinterlace/algo_basic.c
@@ -315,16 +315,42 @@ static int RenderMean##bpc##Bit_##feature( filter_t *p_filter, \
return VLC_SUCCESS; \
}
-#define RENDER_MEAN_SIMD(bpc, feature) \
- RENDER_MEAN(Merge##bpc##Bit##feature, bpc, feature)
+#define RENDER_MEAN_SIMD(bpc, feature) \
+/* Prototype of the external routine called by the wrapper below (NOTE(review): presumably the deint_mean_* asm added by this patch - confirm the vlcpriv_ symbol prefix). */ \
+void vlcpriv_deint_mean_##bpc##bit_##feature(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t const *src, \
+ ptrdiff_t src_stride, \
+ unsigned int w, unsigned h); \
+/* Per-plane wrapper: passes each plane's pixel pointers and pitches, the output visible pitch divided by (bpc / 8) as width, and the output visible line count as height; always returns VLC_SUCCESS. */ \
+static int RenderMean##bpc##Bit_##feature(filter_t *filter, \
+ picture_t *opic, \
+ picture_t *ipic) \
+{ \
+ VLC_UNUSED(filter); \
+ for (int plane = 0 ; plane < ipic->i_planes ; ++plane) \
+ { \
+ uint8_t *dst = opic->p[plane].p_pixels; \
+ uint8_t *src = ipic->p[plane].p_pixels; \
+ ptrdiff_t dst_stride = opic->p[plane].i_pitch; \
+ ptrdiff_t src_stride = ipic->p[plane].i_pitch; \
+ unsigned int w = opic->p[plane].i_visible_pitch / (bpc / 8); \
+ unsigned int h = opic->p[plane].i_visible_lines; \
+ vlcpriv_deint_mean_##bpc##bit_##feature(dst, dst_stride, \
+ src, src_stride, \
+ w, h); \
+ } \
+ return VLC_SUCCESS; \
+}
+
#define RENDER_MEAN_ARM(bpc, feature) \
RENDER_MEAN(merge##bpc##_##feature, bpc, feature)
RENDER_MEAN(Merge8BitGeneric, 8, c)
RENDER_MEAN(Merge16BitGeneric, 16, c)
-#if defined(CAN_COMPILE_SSE2)
-RENDER_MEAN_SIMD(8, SSE2)
-RENDER_MEAN_SIMD(16, SSE2)
+#if defined(__i386__) || defined(__x86_64__)
+RENDER_MEAN_SIMD(8, sse2)
+RENDER_MEAN_SIMD(16, sse2)
#endif
#if defined(CAN_COMPILE_ARM)
RENDER_MEAN_ARM(8, arm_neon)
@@ -343,9 +369,9 @@ RENDER_MEAN_ARM(16, arm64_neon)
single_pic_renderer_t MeanRenderer(unsigned pixel_size)
{
-#if defined(CAN_COMPILE_SSE2)
+#if defined(__i386__) || defined(__x86_64__)
if (vlc_CPU_SSE2())
- return pixel_size & 1 ? RenderMean8Bit_SSE2 : RenderMean16Bit_SSE2;
+ return pixel_size & 1 ? RenderMean8Bit_sse2 : RenderMean16Bit_sse2;
else
#endif
#if defined(CAN_COMPILE_ARM)
diff --git a/modules/video_filter/deinterlace/algo_basic_x86.asm b/modules/video_filter/deinterlace/algo_basic_x86.asm
index bccef7aaf3..d792928786 100644
--- a/modules/video_filter/deinterlace/algo_basic_x86.asm
+++ b/modules/video_filter/deinterlace/algo_basic_x86.asm
@@ -85,8 +85,31 @@ cglobal deint_linear_%1bit, 7, 11, 2, dst, ds, src, ss, w, h, field, \
RET
%endmacro
+; DEINT_MEAN <bits per sample>: each dst row = AVG(src row, following src row), for h dst rows
+%macro DEINT_MEAN 1
+ %assign pxsize %1 / 8
+cglobal deint_mean_%1bit, 6, 8, 2, dst, ds, src, ss, w, h, src1, x
+.main_loop:
+ lea src1q, [srcq+ssq] ; src1 = row after src; recomputed here for every output row
+ xor xd, xd ; x = sample index within the current row
+.main_loop_x:
+ mova m0, [srcq +xq*pxsize]
+ AVG m0, [src1q+xq*pxsize] ; AVG is a macro defined outside this hunk
+ mova [dstq+xq*pxsize], m0
+ add xd, mmsize / pxsize ; advance by the number of samples in one vector
+ cmp xd, wd
+ jl .main_loop_x
+ lea srcq, [srcq+ssq*2] ; two source rows are consumed per output row
+ add dstq, dsq
+ dec hd
+ jg .main_loop
+ RET
+%endmacro
+
INIT_XMM sse2
DEINT_LINEAR 8
+DEINT_MEAN 8
DEINT_LINEAR 16
+DEINT_MEAN 16
--
2.24.1
More information about the vlc-devel mailing list