[x265] [PATCH] asm: avx2 code for high_bit_depth intra_pred_planar32x32

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue May 12 08:19:50 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431409427 -19800
#      Tue May 12 11:13:47 2015 +0530
# Node ID 4109cf92731a8a6cfe35019d205476e8719d4c67
# Parent  e112801421b324350f7de388aef19a9f582b4c2b
asm: avx2 code for high_bit_depth intra_pred_planar32x32

AVX2:
intra_planar_32x32      28.16x   1532.45         43154.74

SSE:
intra_planar_32x32      4.42x    10169.50        44932.19

diff -r e112801421b3 -r 4109cf92731a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon May 11 11:07:10 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue May 12 11:13:47 2015 +0530
@@ -1182,6 +1182,7 @@
     if (cpuMask & X265_CPU_AVX2)
     {
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
 
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_avx2;
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
diff -r e112801421b3 -r 4109cf92731a source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Mon May 11 11:07:10 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Tue May 12 11:13:47 2015 +0530
@@ -91,6 +91,7 @@
 cextern pw_8
 cextern pw_15
 cextern pw_16
+cextern pw_31
 cextern pw_32
 cextern pw_1023
 cextern pd_16
@@ -105,6 +106,7 @@
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 cextern pw_planar16_mul
+cextern pw_planar32_mul
 
 ;-----------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
@@ -908,6 +910,69 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------
 INIT_YMM avx2
+cglobal intra_pred_planar32, 3,3,8             ; 32x32 planar intra prediction, 16-bit pixels; 3 args / 3 GPRs / m0-m7 (presumably r0 = dst, r1 = dstStride, r2 = srcPix per the prototype above)
+    movu            m1, [r2 + 2]                ; m1 = above[0..15]  (above[i] sits at byte offset 2 + 2*i, cf. above[32] at +66 below)
+    movu            m4, [r2 + 34]               ; m4 = above[16..31]
+    lea             r2, [r2 + 66]               ; r2 -> above[32]
+    vpbroadcastw    m3, [r2]                    ; topRight   = above[32]
+    pmullw          m0, m3, [multiL]            ; (x + 1) * topRight, columns 0..15  (assumes multiL = 1..16 -- table not shown here, confirm)
+    pmullw          m2, m3, [multiH2]           ; (x + 1) * topRight, columns 16..31 (assumes multiH2 = 17..32 -- table not shown here, confirm)
+    vpbroadcastw    m6, [r2 + 128]              ; bottomLeft = left[32]
+    mova            m5, m6                      ; m5 = bottomLeft (scratch copy)
+    paddw           m5, [pw_32]                 ; m5 = bottomLeft + pw_32 (presumably the rounding term 32 for the final >> 6)
+
+    paddw           m0, m5                      ; low  half: += 1 * bottomLeft + rounding (row 0 weight of bottomLeft is y + 1 = 1)
+    paddw           m2, m5                      ; high half: same
+    mova            m5, m6                      ; m5 = bottomLeft again
+    psubw           m3, m6, m1                  ; m3 = bottomLeft - above[0..15]: per-row delta for m0 (topRight no longer needed in m3)
+    pmullw          m1, [pw_31]                 ; above[0..15] * pw_31 (row 0 vertical weight, presumably 31)
+    paddw           m0, m1                      ; m0 = row-0 accumulator, columns 0..15 (everything except the left[y] term)
+    psubw           m5, m4                      ; m5 = bottomLeft - above[16..31]: per-row delta for m2
+    pmullw          m4, [pw_31]                 ; above[16..31] * pw_31
+    paddw           m2, m4                      ; m2 = row-0 accumulator, columns 16..31
+
+    mova            m6, [pw_planar32_mul]       ; m6 = weights applied to left[y] for columns 0..15  (table not shown; presumably 31..16)
+    mova            m4, [pw_planar16_mul]       ; m4 = weights applied to left[y] for columns 16..31 (table not shown; presumably 15..0)
+    add             r1, r1                      ; r1 *= 2: stride is scaled for 2-byte pixels
+
+%macro PROCESS_AVX2 1                           ; emit one 32-pixel output row; %1 = row index relative to r2
+    vpbroadcastw    m7, [r2 + %1 * 2]           ; m7 = left[row] in every lane
+    pmullw          m1, m7, m6                  ; left[row] * weight, columns 0..15
+    pmullw          m7, m4                      ; left[row] * weight, columns 16..31
+    paddw           m1, m0                      ; add the running accumulator (low half)
+    paddw           m7, m2                      ; add the running accumulator (high half)
+    psrlw           m1, 6                       ; >> 6 normalisation
+    psrlw           m7, 6
+    movu            [r0], m1                    ; store columns 0..15
+    movu            [r0 + mmsize], m7           ; store columns 16..31 (second vector of the row)
+%endmacro
+
+%macro  INCREMENT_AVX2 0                        ; step accumulators and dst to the next row
+    paddw           m2, m5                      ; high half += bottomLeft - above[x]
+    paddw           m0, m3                      ; low  half += bottomLeft - above[x]
+    add             r0, r1                      ; dst += stride (bytes)
+%endmacro
+
+    add             r2, mmsize*2                ; r2: above[32] -> left[0] (left[32] was read at +128 from above[32])
+%assign x 0                                     ; x = index of the current group of 8 rows
+%rep 4                                          ; 4 groups * 8 rows = 32 rows, fully unrolled
+%assign y 0                                     ; y = row within the group
+%rep 8
+    PROCESS_AVX2 y
+%if x + y < 10                                  ; false only for x = 3, y = 7 (the last row): skip the advance nobody consumes
+    INCREMENT_AVX2
+%endif
+%assign y y+1
+%endrep
+lea     r2, [r2 + 16]                           ; advance r2 by 8 left[] pixels for the next group
+%assign x x+1
+%endrep
+    RET
+
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_YMM avx2
 cglobal intra_pred_planar16, 3,3,4
     add             r1d, r1d
     vpbroadcastw    m3, [r2 + 34]


More information about the x265-devel mailing list