[x265] [PATCH] asm-avx2: filter_p2s[4x4](2.26x), filter_p2s[4x8](3.01x), filter_p2s[4x16](3.00x)

chen chenm003 at 163.com
Mon Mar 9 20:02:30 CET 2015


 

At 2015-03-09 14:54:45,rajesh at multicorewareinc.com wrote:
># HG changeset patch
># User Rajesh Paulraj<rajesh at multicorewareinc.com>
># Date 1425884017 -19800
>#      Mon Mar 09 12:23:37 2015 +0530
># Node ID 7e7f03a4089255c3d2b46d536ff203653510bf3d
># Parent  38ea9788d3e652d6fd53518b3943b636d55bb0b4
>asm-avx2: filter_p2s[4x4](2.26x), filter_p2s[4x8](3.01x), filter_p2s[4x16](3.00x)
>
>diff -r 38ea9788d3e6 -r 7e7f03a40892 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Mon Mar 09 11:09:50 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 09 12:23:37 2015 +0530
>@@ -1479,6 +1479,10 @@
>         p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
>         p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
> 
>+        p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_avx2;
>+        p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_avx2;
>+        p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_avx2;
>+
>         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
>         p.weight_pp = x265_weight_pp_avx2;
> 
>diff -r 38ea9788d3e6 -r 7e7f03a40892 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm	Mon Mar 09 11:09:50 2015 +0530
>+++ b/source/common/x86/ipfilter8.asm	Mon Mar 09 12:23:37 2015 +0530
>@@ -5875,6 +5875,51 @@
> PIXEL_WH_4xN 4, 4
> PIXEL_WH_4xN 4, 8
> PIXEL_WH_4xN 4, 16
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+%macro P2S_H_4xN_avx2 1
>+INIT_YMM avx2
>+cglobal pixelToShort_4x%1, 3, 5, 6
>+
>+    ; load height
>+    mov             r3d, %1
>+
>+    ; load constant
>+    vbroadcasti128  m4, [pb_128]
>+    vbroadcasti128  m5, [tab_c_64_n64]
>+
>+%rep %1 / 4
>+    movd            xm0, [r0]
>+    movd            xm1, [r0 + r1]
>+    movd            xm2, [r0 + r1 * 2]
>+    lea             r4, [r0 + r1 * 2]
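Why not compute r1 * 3 into r4 once, before the %rep block, and load the fourth row from [r0 + r4]? That would avoid recomputing the address with lea in every iteration.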

>+    movd            xm3, [r4 + r1]
>+
>+    punpckldq       m0, m0, m1
>+    punpckldq       m2, m2 , m3
>+    punpcklbw       m0, m4
>+    punpcklbw       m2, m4
>+    vinserti128     m2, m0, xm2, 1
>+    pmaddubsw       m2,m5  
>+
>+    movq            [r2 + FENC_STRIDE * 0], xm2
>+    movhps          [r2 + FENC_STRIDE * 2], xm2
>+    vextracti128    xm2, m2, 1
>+    movq            [r2 + FENC_STRIDE * 4], xm2
>+    movhps          [r2 + FENC_STRIDE * 6], xm2
>+
>+    lea             r0, [r0 + r1 * 4]
>+    add             r2, FENC_STRIDE * 8
>+
>+%endrep
>+    RET
>+%endmacro
>+P2S_H_4xN_avx2 4
>+P2S_H_4xN_avx2 8
>+P2S_H_4xN_avx2 16
>+
> ;-----------------------------------------------------------------------------
> ; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
> ;-----------------------------------------------------------------------------
>diff -r 38ea9788d3e6 -r 7e7f03a40892 source/common/x86/ipfilter8.h
>--- a/source/common/x86/ipfilter8.h	Mon Mar 09 11:09:50 2015 +0530
>+++ b/source/common/x86/ipfilter8.h	Mon Mar 09 12:23:37 2015 +0530
>@@ -646,6 +646,10 @@
> void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> 
>+void x265_pixelToShort_4x4_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
>+void x265_pixelToShort_4x8_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
>+void x265_pixelToShort_4x16_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
>+
> #undef LUMA_FILTERS
> #undef LUMA_SP_FILTERS
> #undef LUMA_SS_FILTERS
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150310/3f58d041/attachment.html>


More information about the x265-devel mailing list