[x265] [PATCH] asm: avx2 code for intra_planar_32x32
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Apr 3 10:30:36 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1428041153 -19800
# Fri Apr 03 11:35:53 2015 +0530
# Node ID cef7834897bc0d53981e5dfe8790bc207deb7346
# Parent d9deeef2cdd8f33d543090459f2f8b3e14be296e
asm: avx2 code for intra_planar_32x32
AVX2:
intra_planar_32x32 19.93x 1813.34 36132.20
SSE4:
intra_planar_32x32 12.25x 2951.42 36140.76
diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 03 11:35:53 2015 +0530
@@ -1472,6 +1472,7 @@
if (cpuMask & X265_CPU_AVX2)
{
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
+ p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
p.idst4x4 = x265_idst4_avx2;
p.dst4x4 = x265_dst4_avx2;
diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/const-a.asm Fri Apr 03 11:35:53 2015 +0530
@@ -62,6 +62,7 @@
const pw_8, times 8 dw 8
const pw_16, times 16 dw 16
const pw_15, times 16 dw 15
+const pw_31, times 16 dw 31
const pw_32, times 16 dw 32
const pw_64, times 8 dw 64
const pw_128, times 16 dw 128
@@ -87,10 +88,11 @@
const pw_pmmpzzzz, times 1 dw 1, -1, -1, 1, 0, 0, 0, 0
const multi_2Row, times 1 dw 1, 2, 3, 4, 1, 2, 3, 4
const multiH, times 1 dw 9, 10, 11, 12, 13, 14, 15, 16
-const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24
const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32
const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 ; (x + 1) for x = 16..31, 16 words; NOTE(review): previously 8 words placed before multiH3 - confirm existing users do not depend on the old size/position
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 ; (31 - x) for x = 0..15; pw_planar16_mul supplies the same weight for x = 16..31
;; 32-bit constants
diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/intrapred.h Fri Apr 03 11:35:53 2015 +0530
@@ -44,6 +44,7 @@
void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
void x265_intra_pred_planar16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
+void x265_intra_pred_planar32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
#define DECL_ANG(bsize, mode, cpu) \
void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r d9deeef2cdd8 -r cef7834897bc source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Fri Apr 03 11:11:47 2015 +0530
+++ b/source/common/x86/intrapred8.asm Fri Apr 03 11:35:53 2015 +0530
@@ -570,6 +570,7 @@
cextern pw_8
cextern pw_16
cextern pw_15
+cextern pw_31
cextern pw_32
cextern pw_257
cextern pw_1024
@@ -583,6 +584,7 @@
cextern multi_2Row
cextern trans8_shuf
cextern pw_planar16_mul
+cextern pw_planar32_mul
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
@@ -2595,6 +2597,91 @@
jnz .loop
RET
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+%if ARCH_X86_64 == 1 ; m8-m10 are used below; vector registers 8 and up exist only in 64-bit mode
+INIT_YMM avx2 ; x86inc: mN names the 256-bit register ymmN
+cglobal intra_pred_planar32, 3,4,11 ; r0 = dst, r1 = dstStride, r2 = srcPix; 11 vector regs (m0-m10). NOTE(review): 4 GPRs are declared but only r0-r2 are referenced
+ mova m6, [pw_00ff] ; m6 = per-word mask, presumably 0x00ff (pw_00ff is defined outside this patch)
+ vpbroadcastw m3, [r2 + 33] ; topRight = above[32] (word load: the high byte is the following pixel, masked off below)
+ vpbroadcastw m2, [r2 + 97] ; bottomLeft = left[32] (word load: high byte masked off below)
+ pand m3, m6 ; m3 = topRight in every word
+ pand m2, m6 ; m2 = bottomLeft in every word
+
+ pmullw m0, m3, [multiL] ; (x + 1) * topRight for x = 0..15 (multiL = 1..16)
+ pmullw m3, [multiH2] ; (x + 1) * topRight for x = 16..31 (multiH2 = 17..32)
+
+ paddw m0, m2 ; + bottomLeft: the (y + 1) * bottomLeft term for y = 0
+ paddw m3, m2
+ paddw m0, [pw_32] ; + 32: rounding offset for the final >> 6
+ paddw m3, [pw_32]
+
+ pmovzxbw m4, [r2 + 1] ; m4 = above[0..15] zero-extended to words
+ pmovzxbw m1, [r2 + 17] ; m1 = above[16..31] zero-extended to words
+ pmullw m5, m4, [pw_31] ; 31 * above[0..15]: the (31 - y) * above term for y = 0
+ paddw m0, m5 ; m0 = row-0 accumulator for columns 0..15 (everything except the left[y] term)
+ psubw m5, m2, m4 ; bottomLeft - above[0..15]: amount the accumulator changes per row
+ psubw m2, m1 ; m2 = bottomLeft - above[16..31]: per-row step for columns 16..31
+ pmullw m1, [pw_31] ; 31 * above[16..31]
+ paddw m3, m1 ; m3 = row-0 accumulator for columns 16..31
+ mova m1, m5 ; m1 = per-row step for columns 0..15
+
+ add r2, 65 ; (2 * blkSize + 1) -> r2 now points at left[0]
+ mova m9, [pw_planar32_mul] ; m9 = (31 - x) weights for x = 0..15
+ mova m10, [pw_planar16_mul] ; m10 = (31 - x) weights for x = 16..31 (values 15..0)
+
+%macro INTRA_PRED_PLANAR32_AVX2 0 ; emits two output rows; temporaries m4/m5/m7/m8; advances r0, r2 and the accumulators m0/m3
+ vpbroadcastw m4, [r2] ; each word = left[y] (low byte) | left[y + 1] (high byte)
+ vpsrlw m7, m4, 8 ; m7 = left[y + 1] in every word
+ pand m4, m6 ; m4 = left[y] in every word
+
+ pmullw m5, m4, m9 ; (31 - x) * left[y], x = 0..15
+ pmullw m4, m4, m10 ; (31 - x) * left[y], x = 16..31
+ paddw m5, m0 ; + accumulator for row y, columns 0..15
+ paddw m4, m3 ; + accumulator for row y, columns 16..31
+ paddw m0, m1 ; step accumulators to row y + 1
+ paddw m3, m2
+ psraw m5, 6 ; >> 6 (rounding offset already included)
+ psraw m4, 6
+ packuswb m5, m4 ; words -> bytes; packs within each 128-bit lane, so the qwords come out as cols 0-7, 16-23, 8-15, 24-31
+ pmullw m8, m7, m9 ; second row: (31 - x) * left[y + 1], x = 0..15
+ pmullw m7, m7, m10 ; second row: (31 - x) * left[y + 1], x = 16..31
+ vpermq m5, m5, 11011000b ; qword order 0,2,1,3: restore columns 0..31 in memory order
+ paddw m8, m0 ; + accumulator for row y + 1
+ paddw m7, m3
+ paddw m0, m1 ; step accumulators to row y + 2 (first row of the next expansion)
+ paddw m3, m2
+ psraw m8, 6
+ psraw m7, 6
+ packuswb m8, m7 ; same lane-wise pack as for the first row
+ add r2, 2 ; consumed two left pixels
+ vpermq m8, m8, 11011000b ; restore column order for the second row
+
+ movu [r0], m5 ; store row y (32 bytes, unaligned store)
+ movu [r0 + r1], m8 ; store row y + 1
+ lea r0, [r0 + r1 * 2] ; dst += 2 * dstStride
+%endmacro
+ INTRA_PRED_PLANAR32_AVX2 ; 16 expansions x 2 rows each = 32 rows
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+ INTRA_PRED_PLANAR32_AVX2
+%undef INTRA_PRED_PLANAR32_AVX2 ; NOTE(review): in NASM %undef removes single-line (%define) macros; a %macro is removed with %unmacro - confirm this line has the intended effect with the project's assembler
+ RET
+%endif ;; ARCH_X86_64 == 1
+
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
More information about the x265-devel
mailing list