[x265] [PATCH] asm: avx2 code for intra_planar_32x32

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Apr 3 10:30:36 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1428041153 -19800
#      Fri Apr 03 11:35:53 2015 +0530
# Node ID cef7834897bc0d53981e5dfe8790bc207deb7346
# Parent  d9deeef2cdd8f33d543090459f2f8b3e14be296e
asm: avx2 code for intra_planar_32x32

AVX2:
intra_planar_32x32      19.93x   1813.34         36132.20

SSE4:
intra_planar_32x32      12.25x   2951.42         36140.76

diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 03 11:35:53 2015 +0530
@@ -1472,6 +1472,7 @@
     if (cpuMask & X265_CPU_AVX2)
     {
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
 
         p.idst4x4 = x265_idst4_avx2;
         p.dst4x4 = x265_dst4_avx2;
diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/const-a.asm	Fri Apr 03 11:35:53 2015 +0530
@@ -62,6 +62,7 @@
 const pw_8,                 times  8 dw 8
 const pw_16,                times 16 dw 16
 const pw_15,                times 16 dw 15
+const pw_31,                times 16 dw 31
 const pw_32,                times 16 dw 32
 const pw_64,                times  8 dw 64
 const pw_128,               times 16 dw 128
@@ -87,10 +88,11 @@
 const pw_pmmpzzzz,          times  1 dw   1,  -1,  -1,   1,   0,   0,   0,   0
 const multi_2Row,           times  1 dw   1,   2,   3,   4,   1,   2,   3,   4
 const multiH,               times  1 dw   9,  10,  11,  12,  13,  14,  15,  16
-const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24
 const multiH3,              times  1 dw  25,  26,  27,  28,  29,  30,  31,  32
 const multiL,               times  1 dw   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16
+const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
+const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
 
 
 ;; 32-bit constants
diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/intrapred.h	Fri Apr 03 11:35:53 2015 +0530
@@ -44,6 +44,7 @@
 void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 
 #define DECL_ANG(bsize, mode, cpu) \
     void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Fri Apr 03 11:35:53 2015 +0530
@@ -570,6 +570,7 @@
 cextern pw_8
 cextern pw_16
 cextern pw_15
+cextern pw_31
 cextern pw_32
 cextern pw_257
 cextern pw_1024
@@ -583,6 +584,7 @@
 cextern multi_2Row
 cextern trans8_shuf
 cextern pw_planar16_mul
+cextern pw_planar32_mul
 
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
@@ -2595,6 +2597,91 @@
     jnz             .loop
     RET
 
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar32(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int) -- AVX2, 32x32 block, 8-bit pixels; r0 = dst, r1 = dstStride, r2 = srcPix; the two int args are not read
+;---------------------------------------------------------------------------------------
+%if ARCH_X86_64 == 1                            ; body uses m8-m10, which only exist in 64-bit mode
+INIT_YMM avx2
+cglobal intra_pred_planar32, 3,4,11             ; x86inc: 3 args, 4 GPRs, 11 vector regs (m0-m10). NOTE(review): only r0-r2 are referenced below
+    mova            m6, [pw_00ff]               ; per-word mask used below to keep the low byte of a broadcast word
+    vpbroadcastw    m3, [r2 + 33]               ; topRight   = above[32] (word load; high byte masked off below)
+    vpbroadcastw    m2, [r2 + 97]               ; bottomLeft = left[32]  (word load; high byte masked off below)
+    pand            m3, m6                      ; m3 = topRight in every word
+    pand            m2, m6                      ; m2 = bottomLeft in every word
+
+    pmullw          m0, m3, [multiL]            ; (x + 1) * topRight, x = 0..15
+    pmullw          m3, [multiH2]               ; (x + 1) * topRight, x = 16..31
+
+    paddw           m0, m2                      ; + bottomLeft (the (y + 1) * bottomLeft term for y = 0)
+    paddw           m3, m2
+    paddw           m0, [pw_32]                 ; + 32, i.e. half of 1 << 6, the shift applied to every row below
+    paddw           m3, [pw_32]
+
+    pmovzxbw        m4, [r2 + 1]                ; m4 = above[0..15] widened to words
+    pmovzxbw        m1, [r2 + 17]               ; m1 = above[16..31] widened to words
+    pmullw          m5, m4, [pw_31]             ; 31 * above[0..15]
+    paddw           m0, m5                      ; m0 = row-0 accumulator for x = 0..15 (everything except the left[y] term)
+    psubw           m5, m2, m4                  ; bottomLeft - above[0..15]
+    psubw           m2, m1                      ; m2 = bottomLeft - above[16..31]: per-row step for x = 16..31
+    pmullw          m1, [pw_31]                 ; 31 * above[16..31]
+    paddw           m3, m1                      ; m3 = row-0 accumulator for x = 16..31
+    mova            m1, m5                      ; m1 = bottomLeft - above[0..15]: per-row step for x = 0..15
+
+    add             r2, 65                      ; (2 * blkSize + 1): r2 now points at left[0]
+    mova            m9, [pw_planar32_mul]       ; weights 31..16 applied to left[y] for x = 0..15
+    mova            m10, [pw_planar16_mul]      ; weights 15..0  applied to left[y] for x = 16..31
+
+%macro INTRA_PRED_PLANAR32_AVX2 0               ; one expansion produces two output rows (y and y + 1)
+    vpbroadcastw    m4, [r2]                    ; each word = left[y] (low byte) | left[y + 1] (high byte)
+    vpsrlw          m7, m4, 8                   ; m7 = left[y + 1] in every word
+    pand            m4, m6                      ; m4 = left[y] in every word
+
+    pmullw          m5, m4, m9                  ; row y: weighted left[y], x = 0..15
+    pmullw          m4, m4, m10                 ; row y: weighted left[y], x = 16..31
+    paddw           m5, m0                      ; add the running accumulator for this row
+    paddw           m4, m3
+    paddw           m0, m1                      ; step accumulators to the next row (+ bottomLeft - above[x])
+    paddw           m3, m2
+    psraw           m5, 6                       ; >> 6
+    psraw           m4, 6
+    packuswb        m5, m4                      ; words -> bytes; the pack works per 128-bit lane, so qwords come out interleaved
+    pmullw          m8, m7, m9                  ; row y + 1: weighted left[y + 1], x = 0..15
+    pmullw          m7, m7, m10                 ; row y + 1: weighted left[y + 1], x = 16..31
+    vpermq          m5, m5, 11011000b           ; qword order 0,2,1,3 puts row y back in linear x order
+    paddw           m8, m0
+    paddw           m7, m3
+    paddw           m0, m1                      ; step accumulators again for the row after y + 1
+    paddw           m3, m2
+    psraw           m8, 6
+    psraw           m7, 6
+    packuswb        m8, m7
+    add             r2, 2                       ; two left[] pixels consumed
+    vpermq          m8, m8, 11011000b           ; same lane fix-up for row y + 1
+
+    movu            [r0], m5                    ; store row y (32 bytes)
+    movu            [r0 + r1], m8               ; store row y + 1
+    lea             r0, [r0 + r1 * 2]           ; dst += 2 * dstStride
+%endmacro
+    INTRA_PRED_PLANAR32_AVX2                    ; rows  0-1
+    INTRA_PRED_PLANAR32_AVX2                    ; rows  2-3
+    INTRA_PRED_PLANAR32_AVX2                    ; rows  4-5
+    INTRA_PRED_PLANAR32_AVX2                    ; rows  6-7
+    INTRA_PRED_PLANAR32_AVX2                    ; rows  8-9
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 10-11
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 12-13
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 14-15
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 16-17
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 18-19
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 20-21
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 22-23
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 24-25
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 26-27
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 28-29
+    INTRA_PRED_PLANAR32_AVX2                    ; rows 30-31
+%undef INTRA_PRED_PLANAR32_AVX2                 ; NOTE(review): %undef targets single-line macros; a %macro is normally removed with %unmacro -- confirm this has the intended effect
+    RET
+%endif ;; ARCH_X86_64 == 1
+
 ;-----------------------------------------------------------------------------------------
 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------


More information about the x265-devel mailing list