[x265] [PATCH] asm: avx2 code for weight_sp() for 8bpp
chen
chenm003 at 163.com
Thu Apr 2 12:02:36 CEST 2015
We can use this version for now and improve it in the future.
Please flag it in the list so that we remember to improve it.
At 2015-04-02 17:51:08,sumalatha at multicorewareinc.com wrote:
># HG changeset patch
># User Sumalatha Polureddy
># Date 1427968258 -19800
># Thu Apr 02 15:20:58 2015 +0530
># Node ID 7f976e1e89c5940a8bb2f5b965ebd9ed6e6948a6
># Parent ac85c775620f1dcb0df056874633cbf916098bd2
>asm: avx2 code for weight_sp() for 8bpp
>
>sse4
>weight_sp 16.40x 7768.71 127369.20
>
>avx2
>weight_sp 25.83x 4918.74 127040.17
>
>diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Mar 31 20:04:28 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Thu Apr 02 15:20:58 2015 +0530
>@@ -1604,6 +1604,7 @@
>
> p.scale1D_128to64 = x265_scale1D_128to64_avx2;
> p.weight_pp = x265_weight_pp_avx2;
>+ p.weight_sp = x265_weight_sp_avx2;
>
> // intra_pred functions
> p.cu[BLOCK_8x8].intra_pred[3] = x265_intra_pred_ang8_3_avx2;
>diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm Tue Mar 31 20:04:28 2015 -0500
>+++ b/source/common/x86/pixel-util8.asm Thu Apr 02 15:20:58 2015 +0530
>@@ -1492,6 +1492,84 @@
> dec r5d
> jnz .loopH
> RET
>+
>+%if ARCH_X86_64
>+INIT_YMM avx2
>+cglobal weight_sp, 6, 9, 7 ; AVX2 weight_sp: weights 16-bit src samples and stores them as 8-bit pixels in dst; first 6 args in r0-r5, the rest read via r6m-r9m
>+ mov r7d, r7m ; r7d = round
>+ shl r7d, 16 ; move round into the upper 16 bits
>+ or r7d, r6m ; lower 16 bits = w0 (NOTE(review): assumes w0 fits in 16 bits, otherwise it overlaps round - confirm against callers)
>+ vpbroadcastd m0, r7d ; m0 = times 8 dw w0, round (NOTE(review): AVX2 vpbroadcastd normally takes an xmm/m32 source, not a GPR - confirm this assembles)
>+ movd xm1, r8m ; xm1 = shift count consumed by psrad below
>+ vpbroadcastd m2, r9m ; m2 = times 8 dd offset
>+ vpbroadcastw m3, [pw_1] ; m3 = pw_1 in every word; paired with each sample so pmaddwd multiplies it by round (presumably pw_1 is 1 - defined outside this view)
>+ vpbroadcastw m4, [pw_2000] ; m4 = pw_2000 in every word, added to each source sample (presumably 0x2000 - defined outside this view)
>+
>+ add r2d, r2d ; 2 * srcstride (src samples are 16-bit, so presumably converts an element stride to bytes)
>+
>+ mov r7, r0 ; r7 = start of the current src row
>+ mov r8, r1 ; r8 = start of the current dst row
>+.loopH:
>+ mov r6d, r4d ; r6d = width still to process in this row
>+
>+ ; r0/r1 walk along the row; r7/r8 keep the row start pointers
>+ mov r0, r7 ; src
>+ mov r1, r8 ; dst
>+.loopW:
>+ movu m5, [r0] ; load 16 samples (always reads 32 bytes, even when fewer than 16 remain in the row)
>+ paddw m5, m4 ; add m4 (pw_2000) to every sample
>+
>+ punpcklwd m6,m5, m3 ; low 4 samples of each 128-bit lane interleaved with m3 -> (sample, m3) word pairs
>+ pmaddwd m6, m0 ; sample * w0 + m3 * round, as 32-bit sums
>+ psrad m6, xm1 ; arithmetic >> shift
>+ paddd m6, m2 ; + offset
>+
>+ punpckhwd m5, m3 ; same for the high 4 samples of each lane
>+ pmaddwd m5, m0 ; sample * w0 + m3 * round
>+ psrad m5, xm1 ; arithmetic >> shift
>+ paddd m5, m2 ; + offset
>+
>+ packssdw m6, m5 ; 32 -> 16 bit with signed saturation (per lane, samples back in order)
>+ packuswb m6, m6 ; 16 -> 8 bit with unsigned saturation (clamps to 0..255)
>+ vpermq m6, m6, 10001000b ; gather qwords 0 and 2 so xm6 holds the 16 pixels contiguously
>+
>+ sub r6d, 16 ; consume 16 pixels; the flags drive both branches below
>+ jl .width8 ; fewer than 16 were left -> partial store
>+ movu [r1], xm6 ; store 16 pixels (the store leaves the flags from sub intact)
>+ je .nextH ; exactly 16 were left -> row finished
>+ add r0, 32 ; advance src by 16 samples
>+ add r1, 16 ; advance dst by 16 pixels
>+ jmp .loopW
>+
>+.width8:
>+ add r6d, 16 ; undo the subtraction: r6d = pixels left (< 16)
>+ cmp r6d, 8
>+ jl .width4 ; fewer than 8 left
>+ movq [r1], xm6 ; store 8 pixels
>+ je .nextH ; exactly 8 were left -> row finished
>+ psrldq m6, 8 ; shift pixels 8..15 down into the low qword
>+ sub r6d, 8
>+ add r1, 8
>+
>+.width4:
>+ cmp r6d, 4
>+ jl .width2 ; fewer than 4 left
>+ movd [r1], xm6 ; store 4 pixels
>+ je .nextH ; exactly 4 were left -> row finished
>+ add r1, 4
>+ pshufd m6, m6, 1 ; bring dword 1 (the next 4 pixels) down to dword 0
>+
>+.width2:
>+ pextrw [r1], xm6, 0 ; unconditional 2-byte store for whatever is left; no narrower case is handled
>+
>+.nextH:
>+ lea r7, [r7 + r2] ; next src row (r2 = doubled srcStride)
>+ lea r8, [r8 + r3] ; next dst row (r3 = dstStride)
>+
>+ dec r5d ; r5d = rows remaining (height)
>+ jnz .loopH
>+ RET
>+%endif ; ARCH_X86_64
> %endif ; end of (HIGH_BIT_DEPTH == 0)
>
>
>diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Tue Mar 31 20:04:28 2015 -0500
>+++ b/source/common/x86/pixel.h Thu Apr 02 15:20:58 2015 +0530
>@@ -272,6 +272,7 @@
> int x265_psyCost_ss_16x16_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
> int x265_psyCost_ss_32x32_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
> int x265_psyCost_ss_64x64_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
>+void x265_weight_sp_avx2(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>
> #undef DECL_PIXELS
> #undef DECL_HEVC_SSD
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150402/1e452a6f/attachment.html>
More information about the x265-devel
mailing list