[x265] [PATCH] asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c

chen chenm003 at 163.com
Thu Nov 20 22:06:28 CET 2014


 

At 2014-11-20 15:17:45,"Divya Manivannan" <divya at multicorewareinc.com> wrote:
># HG changeset patch
># User Divya Manivannan <divya at multicorewareinc.com>
># Date 1416467833 -19800
>#      Thu Nov 20 12:47:13 2014 +0530
># Node ID 02bc16b116ebfdb61c91a516291f1b19b259bcbf
># Parent  a6e1b125424acc727f9ba464ccc530550203d407
>asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c
>
>diff -r a6e1b125424a -r 02bc16b116eb source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Thu Nov 20 12:23:05 2014 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 20 12:47:13 2014 +0530
>@@ -1802,6 +1802,8 @@
>         p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
>         p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
>         p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
>+        p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
>+        p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
>     }
> #endif // if HIGH_BIT_DEPTH
> }
>diff -r a6e1b125424a -r 02bc16b116eb source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm	Thu Nov 20 12:23:05 2014 +0530
>+++ b/source/common/x86/ipfilter8.asm	Thu Nov 20 12:47:13 2014 +0530
>@@ -3729,6 +3729,53 @@
>     movhps          [r2 + r4], xm4
>     RET
> 
>+%macro FILTER_VER_LUMA_AVX2_8xN 2
>+INIT_YMM avx2
>+cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
>+    mov             r4d, r4m
>+    shl             r4d, 7
>+
>+%ifdef PIC
>+    lea             r5, [tab_LumaCoeffVer_32]
>+    add             r5, r4
>+%else
>+    lea             r5, [tab_LumaCoeffVer_32 + r4]
>+%endif
>+
>+    lea             r4, [r1 * 3]
>+    sub             r0, r4
>+    lea             r6, [r1 * 4]
>+    mov             word [rsp], %2 / 8
>+    mova            m7, [pw_512]
>+
>+.loop:
>+    PROCESS_LUMA_AVX2_W8_8R
>+    pmulhrsw        m5, m7                          ; m5 = word: row 0, row 1
>+    pmulhrsw        m2, m7                          ; m2 = word: row 2, row 3
>+    pmulhrsw        m1, m7                          ; m1 = word: row 4, row 5
>+    pmulhrsw        m4, m7                          ; m4 = word: row 6, row 7
>+    packuswb        m5, m2
>+    packuswb        m1, m4
>+    vextracti128    xm2, m5, 1
>+    vextracti128    xm4, m1, 1
>+    movq            [r2], xm5
>+    movq            [r2 + r3], xm2
>+    lea             r2, [r2 + r3 * 2]

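Use a free register to hold r3 * 3: then the four rows of each group can be stored at [r2], [r2 + r3], [r2 + r3 * 2] and [r2 + r3 * 3], and r2 only needs one "lea r2, [r2 + r3 * 4]" per four rows instead of an update after every two.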
>+    movhps          [r2], xm5
>+    movhps          [r2 + r3], xm2
>+    lea             r2, [r2 + r3 * 2]
>+    movq            [r2], xm1
>+    movq            [r2 + r3], xm4
>+    lea             r2, [r2 + r3 * 2]
>+    movhps          [r2], xm1
>+    movhps          [r2 + r3], xm4
>+    lea             r2, [r2 + r3 * 2]
>+    sub             r0, r6
>+    dec             word [rsp]
>+    jnz             .loop
>+    RET
>+%endmacro
>+
> ;-------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-------------------------------------------------------------------------------------------------------------
>@@ -3743,11 +3790,13 @@
> ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-------------------------------------------------------------------------------------------------------------
> FILTER_VER_LUMA_8xN 8, 16, pp
>+FILTER_VER_LUMA_AVX2_8xN 8, 16
> 
> ;-------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-------------------------------------------------------------------------------------------------------------
> FILTER_VER_LUMA_8xN 8, 32, pp
>+FILTER_VER_LUMA_AVX2_8xN 8, 32
> 
> ;-------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141121/00fb2099/attachment.html>


More information about the x265-devel mailing list