[x265] [PATCH] weighted prediction pixel, avx2 asm code as per new interface

chen chenm003 at 163.com
Mon Oct 20 17:37:11 CEST 2014


 

At 2014-10-20 19:37:06,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1413805016 -19800
># Node ID 2293c5759f2e0af36141125a97b7a479e023619b
># Parent  3366be6ef59eec3d3ca69ed52942708b5d1b3bc6
>weighted prediction pixel, avx2 asm code as per new interface
>
>diff -r 3366be6ef59e -r 2293c5759f2e source/common/x86/pixel-util.h
>--- a/source/common/x86/pixel-util.h	Mon Oct 20 13:53:09 2014 +0530
>+++ b/source/common/x86/pixel-util.h	Mon Oct 20 17:06:56 2014 +0530
>@@ -58,6 +58,7 @@
> int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
> 
> void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
>+void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
> 
> void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
>diff -r 3366be6ef59e -r 2293c5759f2e source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm	Mon Oct 20 13:53:09 2014 +0530
>+++ b/source/common/x86/pixel-util8.asm	Mon Oct 20 17:06:56 2014 +0530
>@@ -1363,6 +1363,56 @@
>     jnz         .loopH
>     RET
> 
>+INIT_YMM avx2
>+cglobal weight_pp, 6, 7, 6
>+
>+    shl          r5d, 6            ; m0 = [w0<<6]
>+    mov          r6d, r6m
>+    shl          r6d, 16
>+    or           r6d, r5d          ; assuming both (w0<<6) and round are using maximum of 16 bits each.
>+    movd         xm0, r6d
>+    pshufd       xm0, xm0, 0       ; m0 = [w0<<6, round]
>+    vinserti128  m0, m0, xm0, 1    ; document says (pshufd + vinserti128) can be replaced with vpbroadcastd m0, xm0, but having build problem, need to investigate

Please use "vpbroadcastd m0, xm0" here; it replaces the pshufd + vinserti128 pair above.
>+    movd         xm1, r7m
>+    vpbroadcastd m2, r8m
>+    mova         m5, [pw_1]
>+    sub          r2d, r3d
>+    shr          r3d, 4
>+
>+.loopH:
>+    mov          r5d, r3d
>+
>+.loopW:
>+    pmovzxbw    m4, [r0]
>+    punpcklwd   m3, m4, m5
>+    pmaddwd     m3, m0
>+    psrad       m3, xm1
>+    paddd       m3, m2
>+
>+    punpckhwd   m4, m5
>+    pmaddwd     m4, m0
>+    psrad       m4, xm1
>+    paddd       m4, m2
>+
>+    packssdw    m3, m4
>+    vpermq      m4, m3, 11101110b  ;[1, 2, 1, 2]

You only need the high 128 bits here, so how about using vextracti128 instead?
>+    packuswb    m3, m4

Do you need a 256-bit operation here? Only the low 128 bits (xm3) are stored below.
>+    movu        [r1], xm3
>+
>+    add         r0, 16
>+    add         r1, 16
>+
>+    dec         r5d
>+    jnz         .loopW
>+
>+    lea         r0, [r0 + r2]
>+    lea         r1, [r1 + r2]
>+
>+    dec         r4d
>+    jnz         .loopH
>+    RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141020/62f62ba8/attachment.html>


More information about the x265-devel mailing list