[x265] [PATCH update] asm: AVX2 version of luma_pp[4x4], improve 320c -> 188c
Min Chen
chenm003 at 163.com
Tue Oct 28 01:02:07 CET 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1414454471 25200
# Node ID e0eca5d9c0aeba6b06efa29702f03d8a603a0e7f
# Parent 453d131f974b335278af8e98a332357d4f45c9fd
asm: AVX2 version of luma_pp[4x4], improve 320c -> 188c
diff -r 453d131f974b -r e0eca5d9c0ae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Oct 27 12:20:10 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon Oct 27 17:01:11 2014 -0700
@@ -1798,6 +1798,7 @@
p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
#endif
+ p.luma_hpp[BLOCK_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r 453d131f974b -r e0eca5d9c0ae source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Mon Oct 27 12:20:10 2014 -0500
+++ b/source/common/x86/dct8.asm Mon Oct 27 17:01:11 2014 -0700
@@ -245,7 +245,7 @@
avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
-idct4_shuf1: times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
diff -r 453d131f974b -r e0eca5d9c0ae source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Oct 27 12:20:10 2014 -0500
+++ b/source/common/x86/ipfilter8.asm Mon Oct 27 17:01:11 2014 -0700
@@ -31,6 +31,7 @@
db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
+ALIGN 32
tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
@@ -128,6 +129,8 @@
SECTION .text
+cextern idct4_shuf1
+cextern pw_1
cextern pw_512
cextern pw_2000
@@ -794,6 +797,64 @@
RET
%endmacro
+
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_4x4, 4,6,6 ; presumably (src, srcStride, dst, dstStride, coeffIdx) like the other horiz_pp variants - TODO confirm
+ mov r4d, r4m ; r4d = 5th argument, used below as the index into tab_LumaCoeff
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeff] ; PIC build: take the table base into a register before indexing it
+ vpbroadcastq m0, [r5 + r4 * 8] ; replicate the 8-byte coefficient entry r4 into every qword of m0
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] ; replicate the 8-byte coefficient entry r4 into every qword of m0
+%endif
+
+ mova m1, [tab_Lm] ; aligned 32-byte load: low lane = windows 0-7 / 1-8, high lane = windows 2-9 / 3-10
+ vpbroadcastd m2, [pw_1]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ sub r0, 3 ; start reading 3 bytes before the pointer that was passed in
+ ; Row 0-1
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1 ; gather one 8-byte window per output: A,B in the low lane, C,D in the high lane
+ pmaddubsw m3, m0 ; multiply bytes by coefficients, add adjacent pairs -> 4 word partial sums per window
+ pmaddwd m3, m2 ; multiply by m2 and add adjacent word pairs -> 2 dword partial sums per window
+ vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
+
+ ; Row 2-3
+ lea r0, [r0 + r1 * 2] ; r0 += 2 * r1, i.e. move two rows further on
+ vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
+
+ packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
+ pmulhrsw m3, [pw_512] ; rounding high multiply; equals (x + 32) >> 6 if pw_512 holds words of 512 - TODO confirm
+ vextracti128 xm4, m3, 1 ; xm4 = high lane (the C,D results), so both halves can be packed together
+ packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
+ pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]
+
+ lea r0, [r3 * 3] ; r0 is reused: 3 * r3, the offset of the fourth output row
+ movd [r2], xm3 ; dword 0 holds row 0
+ pextrd [r2+r3], xm3, 2 ; dword 2 holds row 1
+ pextrd [r2+r3*2], xm3, 1 ; dword 1 holds row 2
+ pextrd [r2+r0], xm3, 3 ; dword 3 holds row 3
+ RET
+
+
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
@@ -802,7 +863,6 @@
IPFILTER_LUMA 12, 16, pp
IPFILTER_LUMA 4, 16, pp
-
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
diff -r 453d131f974b -r e0eca5d9c0ae source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Mon Oct 27 12:20:10 2014 -0500
+++ b/source/common/x86/ipfilter8.h Mon Oct 27 17:01:11 2014 -0700
@@ -614,6 +614,7 @@
LUMA_FILTERS(_sse4);
LUMA_SP_FILTERS(_sse4);
LUMA_SS_FILTERS(_sse2);
+LUMA_FILTERS(_avx2);
void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
More information about the x265-devel
mailing list