[x265] [PATCH] asm: avx2 10bit code for luma_hpp[16xN]

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Tue May 19 10:57:45 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1432019267 -19800
#      Tue May 19 12:37:47 2015 +0530
# Node ID d0f54566d1f457f00fc071c47cbb04186e4da99e
# Parent  712f3f1950098d1603a662944359978e19e39752
asm: avx2 10bit code for luma_hpp[16xN]

avx2:
luma_hpp[ 16x4]         7.81x    955.42          7466.10
luma_hpp[ 16x8]         8.23x    1847.69         15211.04
luma_hpp[16x12]         8.35x    2747.88         22941.08
luma_hpp[16x16]         8.64x    3651.71         31536.71
luma_hpp[16x32]         8.17x    7369.20         60188.46
luma_hpp[16x64]         8.49x    14626.30        124106.34

sse4:
luma_hpp[ 16x4]         4.40x    1700.57         7484.82
luma_hpp[ 16x8]         4.60x    3332.67         15319.88
luma_hpp[16x12]         5.14x    4922.31         25296.77
luma_hpp[16x16]         4.56x    6548.32         29836.87
luma_hpp[16x32]         4.65x    12974.12        60339.52
luma_hpp[16x64]         5.03x    25374.62        127527.44

diff -r 712f3f195009 -r d0f54566d1f4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue May 19 12:24:04 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue May 19 12:37:47 2015 +0530
@@ -1392,6 +1392,12 @@
         p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2;
         p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2;
         p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2;
+        p.pu[LUMA_16x4].luma_hpp = x265_interp_8tap_horiz_pp_16x4_avx2;
+        p.pu[LUMA_16x8].luma_hpp = x265_interp_8tap_horiz_pp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_hpp = x265_interp_8tap_horiz_pp_16x12_avx2;
+        p.pu[LUMA_16x16].luma_hpp = x265_interp_8tap_horiz_pp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_hpp = x265_interp_8tap_horiz_pp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_hpp = x265_interp_8tap_horiz_pp_16x64_avx2;
 
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = x265_scanPosLast_avx2_bmi2;
diff -r 712f3f195009 -r d0f54566d1f4 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue May 19 12:24:04 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Tue May 19 12:37:47 2015 +0530
@@ -958,6 +958,104 @@
 FILTER_HOR_LUMA_W8 16
 FILTER_HOR_LUMA_W8 32
 
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W16 1
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_16x%1, 4,6,8
+    add              r1d, r1d
+    add              r3d, r3d
+    sub              r0, 6
+    mov              r4d, r4m
+    shl              r4d, 4
+%ifdef PIC
+    lea              r5, [tab_LumaCoeff]
+    vpbroadcastq     m0, [r5 + r4]
+    vpbroadcastq     m1, [r5 + r4 + 8]
+%else
+    vpbroadcastq     m0, [tab_LumaCoeff + r4]
+    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
+%endif
+    mova             m3, [interp8_hpp_shuf]
+    mova             m7, [pd_32]
+    pxor             m2, m2
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+
+    mov              r4d, %1
+
+.loop:
+    vbroadcasti128   m4, [r0]
+    vbroadcasti128   m5, [r0 + 8]
+    pshufb           m4, m3
+    pshufb           m5, m3
+
+    pmaddwd          m4, m0
+    pmaddwd          m5, m1
+    paddd            m4, m5
+
+    vbroadcasti128   m5, [r0 + 8]
+    vbroadcasti128   m6, [r0 + 16]
+    pshufb           m5, m3
+    pshufb           m6, m3
+
+    pmaddwd          m5, m0
+    pmaddwd          m6, m1
+    paddd            m5, m6
+
+    phaddd           m4, m5
+    vpermq           m4, m4, q3120
+    paddd            m4, m7
+    psrad            m4, 6
+
+    packusdw         m4, m4
+    vpermq           m4, m4, q2020
+    CLIPW            m4, m2, [pw_pixel_max]
+    movu             [r2], xm4
+
+    vbroadcasti128   m4, [r0 + 16]
+    vbroadcasti128   m5, [r0 + 24]
+    pshufb           m4, m3
+    pshufb           m5, m3
+
+    pmaddwd          m4, m0
+    pmaddwd          m5, m1
+    paddd            m4, m5
+
+    vbroadcasti128   m5, [r0 + 24]
+    vbroadcasti128   m6, [r0 + 32]
+    pshufb           m5, m3
+    pshufb           m6, m3
+
+    pmaddwd          m5, m0
+    pmaddwd          m6, m1
+    paddd            m5, m6
+
+    phaddd           m4, m5
+    vpermq           m4, m4, q3120
+    paddd            m4, m7
+    psrad            m4, 6
+
+    packusdw         m4, m4
+    vpermq           m4, m4, q2020
+    CLIPW            m4, m2, [pw_pixel_max]
+    movu             [r2 + 16], xm4
+
+    add              r2, r3
+    add              r0, r1
+    dec              r4d
+    jnz              .loop
+    RET
+%endmacro
+FILTER_HOR_LUMA_W16 4
+FILTER_HOR_LUMA_W16 8
+FILTER_HOR_LUMA_W16 12
+FILTER_HOR_LUMA_W16 16
+FILTER_HOR_LUMA_W16 32
+FILTER_HOR_LUMA_W16 64
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list