[x265] [PATCH] asm: avx2 code for intrapred_planar16x16

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Apr 3 10:29:29 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1428039707 -19800
#      Fri Apr 03 11:11:47 2015 +0530
# Node ID d9deeef2cdd8f33d543090459f2f8b3e14be296e
# Parent  dd62c4e924ba5fee00180331a6a9dfd3964a6713
asm: avx2 code for intrapred_planar16x16

AVX2:
intra_planar_16x16      16.24x   583.48          9475.36

SSE4:
intra_planar_16x16      11.54x   820.01          9466.91

diff -r dd62c4e924ba -r d9deeef2cdd8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Apr 03 10:45:54 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 03 11:11:47 2015 +0530
@@ -1471,6 +1471,8 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
+
         p.idst4x4 = x265_idst4_avx2;
         p.dst4x4 = x265_dst4_avx2;
         p.scale2D_64to32 = x265_scale2D_64to32_avx2;
diff -r dd62c4e924ba -r d9deeef2cdd8 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Apr 03 10:45:54 2015 +0530
+++ b/source/common/x86/const-a.asm	Fri Apr 03 11:11:47 2015 +0530
@@ -61,6 +61,7 @@
 const pw_4,                 times  8 dw 4
 const pw_8,                 times  8 dw 8
 const pw_16,                times 16 dw 16
+const pw_15,                times 16 dw 15
 const pw_32,                times 16 dw 32
 const pw_64,                times  8 dw 64
 const pw_128,               times 16 dw 128
@@ -85,10 +86,11 @@
 const pw_pmpmpmpm,          times  1 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
 const pw_pmmpzzzz,          times  1 dw   1,  -1,  -1,   1,   0,   0,   0,   0
 const multi_2Row,           times  1 dw   1,   2,   3,   4,   1,   2,   3,   4
-const multiL,               times  1 dw   1,   2,   3,   4,   5,   6,   7,   8
 const multiH,               times  1 dw   9,  10,  11,  12,  13,  14,  15,  16
 const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24
 const multiH3,              times  1 dw  25,  26,  27,  28,  29,  30,  31,  32
+const multiL,               times  1 dw   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16 ; x + 1 for x = 0..15 (16 words, one ymm register)
+const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0 ; 15 - x for x = 0..15 (16 words, one ymm register)
 
 
 ;; 32-bit constants
diff -r dd62c4e924ba -r d9deeef2cdd8 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Apr 03 10:45:54 2015 +0530
+++ b/source/common/x86/intrapred.h	Fri Apr 03 11:11:47 2015 +0530
@@ -43,6 +43,7 @@
 void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 
 #define DECL_ANG(bsize, mode, cpu) \
     void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r dd62c4e924ba -r d9deeef2cdd8 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Apr 03 10:45:54 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Fri Apr 03 11:11:47 2015 +0530
@@ -569,6 +569,7 @@
 cextern pw_4
 cextern pw_8
 cextern pw_16
+cextern pw_15
 cextern pw_32
 cextern pw_257
 cextern pw_1024
@@ -581,6 +582,7 @@
 cextern multiH3
 cextern multi_2Row
 cextern trans8_shuf
+cextern pw_planar16_mul
 
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
@@ -2435,6 +2437,57 @@
     INTRA_PRED_PLANAR16 15
     RET
 
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar16(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_planar16, 3,3,6              ; r0 = dst, r1 = dstStride, r2 = srcPix; vector registers m0-m5 are used
+    vpbroadcastw    m3, [r2 + 17]               ; word at srcPix[17] in every lane; masked to topRight below
+    mova            m5, [pw_00ff]               ; mask for the pand ops below (constant defined outside this hunk)
+    vpbroadcastw    m4, [r2 + 49]               ; word at srcPix[49] in every lane; masked to bottomLeft below
+    mova            m0, [pw_planar16_mul]       ; m0 = 15, 14, ..., 0; NOTE(review): mova on ymm needs a 32-byte-aligned constant - confirm
+    pmovzxbw        m2, [r2 + 1]                ; m2 = above[0..15], bytes zero-extended to words
+    pand            m3, m5                      ; v_topRight
+    pand            m4, m5                      ; v_bottomLeft
+
+    pmullw          m3, [multiL]                ; (x + 1) * topRight
+    pmullw          m1, m2, [pw_15]             ; (blkSize - 1 - y) * above[x], for y = 0
+    paddw           m3, [pw_16]                 ; + 16, the rounding offset for the >> 5 below
+    paddw           m3, m4                      ; + bottomLeft, i.e. (y + 1) * bottomLeft for y = 0
+    paddw           m3, m1                      ; m3 = row-0 sum without the term added per row in the macro
+    psubw           m4, m2                      ; m4 = bottomLeft - above[x], the step added to m3 per row
+    add             r2, 33                      ; r2 -> srcPix[33]; bytes read per row below (left neighbours, presumably)
+
+%macro INTRA_PRED_PLANAR16_AVX2 1               ; %1 = byte offset from r2 of a row pair; writes two 16-pixel rows
+    vpbroadcastw    m1, [r2 + %1]               ; bytes [r2 + %1] and [r2 + %1 + 1] replicated in every word
+    vpsrlw          m2, m1, 8                   ; m2 = second byte per word (row y + 1)
+    pand            m1, m5                      ; m1 = first byte per word (row y)
+
+    pmullw          m1, m0                      ; row y:     pixel * (15 - x)
+    pmullw          m2, m0                      ; row y + 1: pixel * (15 - x)
+    paddw           m1, m3                      ; complete the row y sum
+    paddw           m3, m4                      ; step the accumulator to row y + 1
+    psraw           m1, 5                       ; row y result = sum >> 5
+    paddw           m2, m3                      ; complete the row y + 1 sum
+    psraw           m2, 5                       ; row y + 1 result = sum >> 5
+    paddw           m3, m4                      ; step the accumulator to row y + 2
+    packuswb        m1, m2                      ; words -> bytes with unsigned saturation, per 128-bit lane
+    vpermq          m1, m1, 11011000b           ; qword order 0,2,1,3: low half = row y, high half = row y + 1
+    movu            [r0], xm1                   ; store row y
+    vextracti128    [r0 + r1], m1, 1            ; store row y + 1
+    lea             r0, [r0 + r1 * 2]           ; dst += 2 * dstStride
+%endmacro
+    INTRA_PRED_PLANAR16_AVX2 0                  ; rows 0-1
+    INTRA_PRED_PLANAR16_AVX2 2                  ; rows 2-3
+    INTRA_PRED_PLANAR16_AVX2 4                  ; rows 4-5
+    INTRA_PRED_PLANAR16_AVX2 6                  ; rows 6-7
+    INTRA_PRED_PLANAR16_AVX2 8                  ; rows 8-9
+    INTRA_PRED_PLANAR16_AVX2 10                 ; rows 10-11
+    INTRA_PRED_PLANAR16_AVX2 12                 ; rows 12-13
+    INTRA_PRED_PLANAR16_AVX2 14                 ; rows 14-15
+%undef INTRA_PRED_PLANAR16_AVX2                 ; NOTE(review): %undef targets %define macros; %unmacro may be intended for a %macro - confirm
+    RET
+
 ;---------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------


More information about the x265-devel mailing list