[x265] [PATCH] asm: avx2 10bit code for luma_hpp[8xN]

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Tue May 19 10:57:08 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1432018444 -19800
#      Tue May 19 12:24:04 2015 +0530
# Node ID 712f3f1950098d1603a662944359978e19e39752
# Parent  d7b100e51e828833eee006f1da93e499ac161d28
asm: avx2 10bit code for luma_hpp[8xN]

avx2:
luma_hpp[  8x4]         7.30x    507.64          3706.06
luma_hpp[  8x8]         7.64x    982.30          7503.45
luma_hpp[ 8x16]         7.78x    1898.72         14779.64
luma_hpp[ 8x32]         7.93x    3778.05         29954.26

sse4:
luma_hpp[  8x4]         4.34x    877.69          3806.35
luma_hpp[  8x8]         4.45x    1702.32         7569.03
luma_hpp[ 8x16]         4.44x    3335.36         14812.65
luma_hpp[ 8x32]         4.39x    6785.18         29815.67

diff -r d7b100e51e82 -r 712f3f195009 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue May 19 12:24:04 2015 +0530
@@ -1388,6 +1388,11 @@
         p.pu[LUMA_4x8].luma_hps = x265_interp_8tap_horiz_ps_4x8_avx2;
         p.pu[LUMA_4x16].luma_hps = x265_interp_8tap_horiz_ps_4x16_avx2;
 
+        p.pu[LUMA_8x4].luma_hpp = x265_interp_8tap_horiz_pp_8x4_avx2;
+        p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2;
+
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
diff -r d7b100e51e82 -r 712f3f195009 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/const-a.asm	Tue May 19 12:24:04 2015 +0530
@@ -110,7 +110,7 @@
 const pd_4,                 times  4 dd 4
 const pd_8,                 times  4 dd 8
 const pd_16,                times  4 dd 16
-const pd_32,                times  4 dd 32
+const pd_32,                times  8 dd 32     ; 8 dwords (32 bytes) so the constant can be loaded as one full ymm register
 const pd_64,                times  4 dd 64
 const pd_128,               times  4 dd 128
 const pd_256,               times  4 dd 256
diff -r d7b100e51e82 -r 712f3f195009 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/ipfilter16.asm	Tue May 19 12:24:04 2015 +0530
@@ -115,6 +115,9 @@
 
 const interp8_hps_shuf,     dd 0, 4, 1, 5, 2, 6, 3, 7
 
+const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9            ; first 16 bytes: source words 0-3, then words 1-4
+                            db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13      ; second 16 bytes: source words 2-5, then words 3-6
+
 SECTION .text
 cextern pd_32
 cextern pw_pixel_max
@@ -859,6 +862,102 @@
     movhps      [r2 + r3],  m3
 %endmacro
 
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W8 1                     ; %1 = block height in rows (width is fixed at 8)
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x%1, 4,6,8
+    add              r1d, r1d                   ; srcStride *= 2; NOTE(review): 32-bit add zero-extends, assumes a non-negative stride - confirm
+    add              r3d, r3d                   ; dstStride *= 2 (same 32-bit caveat as above)
+    sub              r0, 6                      ; step src back 6 bytes (3 words) so the 8-word window starts left of the output position
+    mov              r4d, r4m                   ; r4 = coeffIdx (5th argument)
+    shl              r4d, 4                     ; coeffIdx * 16 = byte offset of a 16-byte coefficient row
+%ifdef PIC
+    lea              r5, [tab_LumaCoeff]
+    vpbroadcastq     m0, [r5 + r4]              ; m0 = first 8 bytes of the row (4 words) in every qword
+    vpbroadcastq     m1, [r5 + r4 + 8]          ; m1 = second 8 bytes of the row (4 words) in every qword
+%else
+    vpbroadcastq     m0, [tab_LumaCoeff + r4]           ; m0 = first 8 bytes of the row (4 words) in every qword
+    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]       ; m1 = second 8 bytes of the row (4 words) in every qword
+%endif
+    mova             m3, [interp8_hpp_shuf]     ; NOTE(review): aligned 32-byte load - confirm the table is 32-byte aligned
+    mova             m7, [pd_32]                ; rounding term added before the shift right by 6
+    pxor             m2, m2                     ; all-zero register, passed to CLIPW as its second operand
+
+    ; register map: m0, m1 = coefficient words 0-3 / 4-7; m2 = zero; m3 = interp8_hpp_shuf; m7 = pd_32
+    ; m4, m5, m6 = per-row temporaries; r4d = loop counter
+
+    mov              r4d, %1/2                  ; each loop iteration produces two rows
+
+.loop:
+    vbroadcasti128   m4, [r0]                   ; row 0: source bytes 0-15 copied into both lanes
+    vbroadcasti128   m5, [r0 + 8]               ; row 0: source bytes 8-23 copied into both lanes
+    pshufb           m4, m3                     ; four overlapping 4-word windows starting at words 0,1,2,3
+    pshufb           m5, m3                     ; same windows, 4 words further along
+
+    pmaddwd          m4, m0                     ; multiply by coefficient words 0-3, pairwise add into dwords
+    pmaddwd          m5, m1                     ; multiply by coefficient words 4-7, pairwise add into dwords
+    paddd            m4, m5                     ; two partial dword sums per output for outputs 0-3
+
+    vbroadcasti128   m5, [r0 + 8]               ; reloaded: the previous copy in m5 was overwritten by pmaddwd
+    vbroadcasti128   m6, [r0 + 16]              ; row 0: source bytes 16-31 copied into both lanes
+    pshufb           m5, m3
+    pshufb           m6, m3
+
+    pmaddwd          m5, m0
+    pmaddwd          m6, m1
+    paddd            m5, m6                     ; two partial dword sums per output for outputs 4-7
+
+    phaddd           m4, m5                     ; add the partial pairs: one dword per output, split across lanes
+    vpermq           m4, m4, q3120              ; swap the two middle qwords so the 8 dwords are in output order
+    paddd            m4, m7                     ; + 32 (rounding)
+    psrad            m4, 6                      ; arithmetic shift right by 6
+
+    packusdw         m4, m4                     ; dwords -> words with unsigned saturation (per lane)
+    vpermq           m4, m4, q2020              ; gather qwords 0 and 2 so the low 128 bits hold all 8 words
+    CLIPW            m4, m2, [pw_pixel_max]     ; presumably clamps each word to [0, pw_pixel_max] - CLIPW is defined elsewhere
+    movu             [r2], xm4                  ; store 16 bytes to dst row 0
+
+    vbroadcasti128   m4, [r0 + r1]              ; row 1: identical sequence, one source stride further
+    vbroadcasti128   m5, [r0 + r1 + 8]
+    pshufb           m4, m3
+    pshufb           m5, m3
+
+    pmaddwd          m4, m0
+    pmaddwd          m5, m1
+    paddd            m4, m5                     ; partial sums for outputs 0-3 of row 1
+
+    vbroadcasti128   m5, [r0 + r1 + 8]          ; reloaded for the same reason as in row 0
+    vbroadcasti128   m6, [r0 + r1 + 16]
+    pshufb           m5, m3
+    pshufb           m6, m3
+
+    pmaddwd          m5, m0
+    pmaddwd          m6, m1
+    paddd            m5, m6                     ; partial sums for outputs 4-7 of row 1
+
+    phaddd           m4, m5
+    vpermq           m4, m4, q3120
+    paddd            m4, m7
+    psrad            m4, 6
+
+    packusdw         m4, m4
+    vpermq           m4, m4, q2020
+    CLIPW            m4, m2, [pw_pixel_max]
+    movu             [r2 + r3], xm4             ; store 16 bytes to dst row 1
+
+    lea              r2, [r2 + 2 * r3]          ; dst += 2 rows
+    lea              r0, [r0 + 2 * r1]          ; src += 2 rows
+    dec              r4d
+    jnz              .loop                      ; repeat until %1/2 row pairs are done
+    RET
+%endmacro
+FILTER_HOR_LUMA_W8 4
+FILTER_HOR_LUMA_W8 8
+FILTER_HOR_LUMA_W8 16
+FILTER_HOR_LUMA_W8 32
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list