[x265] [PATCH] asm: avx2 code for weight_sp() 16bpp
Aasaipriya Chandran
aasaipriya at multicorewareinc.com
Tue Jun 30 14:15:25 CEST 2015
Kindly don't push this patch; I still need to add an ARCH_X86_64 guard.
Thanks,
Aasaipriya
On Tue, Jun 30, 2015 at 5:11 PM, <aasaipriya at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
> # Date 1435664485 -19800
> # Tue Jun 30 17:11:25 2015 +0530
> # Node ID 0cc8a97207523ab1d1c14ee5bcd8c808be66f446
> # Parent b1301944894051b9641006797e4d6253b277f3e4
> asm: avx2 code for weight_sp() 16bpp
>
> avx2: weight_sp 12.10x 4537.14 54879.57
> sse4: weight_sp 6.48x 8163.87 52870.36
>
> diff -r b13019448940 -r 0cc8a9720752 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Jun 29 17:19:07 2015
> +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Jun 30 17:11:25 2015
> +0530
> @@ -1522,6 +1522,7 @@
> p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
> p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
> p.weight_pp = PFX(weight_pp_avx2);
> + p.weight_sp = PFX(weight_sp_avx2);
> p.sign = PFX(calSign_avx2);
> p.planecopy_cp = PFX(upShift_8_avx2);
>
> diff -r b13019448940 -r 0cc8a9720752 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Mon Jun 29 17:19:07 2015 +0530
> +++ b/source/common/x86/pixel-util8.asm Tue Jun 30 17:11:25 2015 +0530
> @@ -1669,8 +1669,128 @@
> dec r5d
> jnz .loopH
> RET
> -
> -%if ARCH_X86_64
> +%endif
> +
> +
> +%if HIGH_BIT_DEPTH
> +INIT_YMM avx2 ; x86inc setup: assemble the following function with 256-bit ymm registers (mN = ymmN, xmN = its low 128 bits)
> +cglobal weight_sp, 6,10,9 ; weighted prediction, 16-bit source -> 16-bit pixels; x86inc counts: 6 args in regs, 10 GPRs, 9 vector regs. NOTE(review): body only touches r0-r6, and m8/xm8 exists only in 64-bit mode
> + mova m1, [pw_1023] ; m1 = upper clamp applied by pminuw below (presumably 16 x word 1023, the 10-bit pixel max -- confirm)
> + mova m2, [pw_1] ; m2 = words interleaved with the source so pmaddwd can add the rounding term (presumably 16 x word 1)
> + mov r6d, r7m ; r6d = 8th argument (round, per the m3 note below)
> + shl r6d, 16 ; move it into the high word
> + or r6d, r6m ; low word = 7th argument (w0); assumes w0 has no bits above bit 15 -- TODO confirm
> + vpbroadcastd m3, r6d ; m3 = [round w0] in every dword. NOTE(review): plain AVX2 VPBROADCASTD has no GPR-source form; presumably a project macro handles this -- confirm
> + movd xm4, r8m ; m4 = [shift], the count used by psrad
> + vpbroadcastd m5, r9m ; m5 = [offset] in every dword
> +
> + ; correct row stride: double both strides, then remove the bytes .loopW advances for whole vectors, so .nextH can simply add them
> + add r3d, r3d ; dst stride doubled (dst is written as 2-byte words). NOTE(review): 32-bit op zeroes the upper half of r3 -- confirm strides are never negative
> + add r2d, r2d ; src stride doubled (src is read as 2-byte words)
> + mov r6d, r4d ; r4d = width
> + and r6d, ~(mmsize / SIZEOF_PIXEL - 1) ; r6d = width rounded down to a whole number of vectors
> + sub r3d, r6d ; subtracted twice = 2 bytes per pixel covered by full-vector iterations
> + sub r3d, r6d
> + sub r2d, r6d ; same correction for the source stride
> + sub r2d, r6d
> +
> + ; generate partial width mask (MUST BE IN YMM0) -- NOTE(review): m0 is never read again in this function; confirm the mask is still needed
> + mov r6d, r4d
> + and r6d, (mmsize / SIZEOF_PIXEL - 1) ; r6d = leftover pixels after the whole vectors
> + movd xm0, r6d
> + pshuflw m0, m0, 0 ; replicate the count into the low four words
> + punpcklqdq m0, m0 ; ...into all eight words of the low lane
> + vinserti128 m0, m0, xm0, 1 ; ...and into the upper 128-bit lane
> + pcmpgtw m0, [pw_0_15] ; word = all-ones where count > table entry (pw_0_15 presumably holds 0..15)
> +
> +.loopH: ; one iteration per row; r5d counts the rows left
> + mov r6d, r4d ; r6d = pixels left in this row
> +
> +.loopW: ; one iteration per vector of the row
> + movu m6, [r0] ; always loads a full vector of source words, even for a partial tail
> + paddw m6, [pw_2000] ; add the pw_2000 constant to each source word (presumably the 0x2000 internal offset -- confirm)
> +
> + punpcklwd m7, m6, m2 ; dword = [src word, m2 word] for the low four words of each lane
> + pmaddwd m7, m3 ; src * w0 + (m2 word) * round
> + psrad m7, xm4 ; arithmetic shift right by shift
> + paddd m7, m5 ; add offset
> +
> + punpckhwd m6, m2 ; same computation for the high four words of each lane
> + pmaddwd m6, m3
> + psrad m6, xm4
> + paddd m6, m5
> +
> + packusdw m7, m6 ; pack to words with unsigned saturation; packing is per lane, so the words end up in source order
> + pminuw m7, m1 ; clamp to the maximum held in m1
> +
> + sub r6d, (mmsize / SIZEOF_PIXEL) ; consume one vector; these flags feed both the jl and the je below
> + jl .width14 ; less than a full vector was left: partial store
> + movu [r1], m7 ; full-vector store
> + lea r0, [r0 + mmsize] ; lea is used so the flags from the sub survive
> + lea r1, [r1 + mmsize]
> + je .nextH ; exactly zero pixels left: row finished
> + jmp .loopW
> +
> +.width14: ; tail ladder: each rung stores its width, then je (flags still from its cmp, SIMD stores leave them alone) exits on an exact match
> + add r6d, 16 ; undo the sub: r6d = leftover pixel count (16 is hard-coded here rather than mmsize / SIZEOF_PIXEL)
> + cmp r6d, 14
> + jl .width12
> + movu [r1], xm7 ; pixels 0-7
> + vextracti128 xm8, m7, 1 ; xm8 = pixels 8-15
> + movq [r1 + 16], xm8 ; pixels 8-11
> + pextrd [r1 + 24], xm8, 2 ; pixels 12-13
> + je .nextH ; a count above 14 falls through to the next rung
> +
> +.width12:
> + cmp r6d, 12
> + jl .width10
> + movu [r1], xm7 ; pixels 0-7
> + vextracti128 xm8, m7, 1
> + movq [r1 + 16], xm8 ; pixels 8-11
> + je .nextH
> +
> +.width10:
> + cmp r6d, 10
> + jl .width8
> + movu [r1], xm7 ; pixels 0-7
> + vextracti128 xm8, m7, 1
> + movd [r1 + 16], xm8 ; pixels 8-9
> + je .nextH
> +
> +.width8:
> + cmp r6d, 8
> + jl .width6
> + movu [r1], xm7 ; pixels 0-7
> + je .nextH
> +
> +.width6 ; NOTE(review): the only label here without a trailing colon; NASM may warn about a colon-less label on its own line -- confirm
> + cmp r6d, 6
> + jl .width4
> + movq [r1], xm7 ; pixels 0-3
> + pextrd [r1 + 8], xm7, 2 ; pixels 4-5
> + je .nextH
> +
> +.width4:
> + cmp r6d, 4
> + jl .width2
> + movq [r1], xm7 ; pixels 0-3
> + je .nextH ; exactly 4 pixels: done
> + add r1, 4 ; only reached when the count is 5. NOTE(review): this overwrites the cmp flags and the r1 change is not undone before .nextH
> + pshufd m6, m6, 1 ; NOTE(review): m6 is not stored after this point -- looks like leftover code, confirm
> + je .nextH ; tests the flags of the add above, not of the cmp
> +
> +.width2:
> + movd [r1], xm7 ; pixels 0-1; reached for counts below 4 and by fall-through
> +
> +.nextH:
> + add r0, r2 ; corrected strides move both pointers to the start of the next row
> + add r1, r3
> +
> + dec r5d ; r5d = rows remaining
> + jnz .loopH
> + RET
> +
> +%else
> INIT_YMM avx2
> cglobal weight_sp, 6, 9, 7
> mov r7d, r7m
> @@ -1747,8 +1867,6 @@
> jnz .loopH
> RET
> %endif
> -%endif ; end of (HIGH_BIT_DEPTH == 0)
> -
>
> ;-----------------------------------------------------------------
> ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150630/22226988/attachment.html>
More information about the x265-devel
mailing list