[x264-devel] x86: AVX2 mbtree_propagate_list
Henrik Gramner
git at videolan.org
Tue Sep 20 20:57:52 CEST 2016
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Sep 7 19:27:31 2016 +0200| [0ce77f9eb71051c9a6121ec12c2abaac99ee628a] | committer: Anton Mitrofanov
x86: AVX2 mbtree_propagate_list
SIMD part is around 25% faster than AVX on Haswell, around 7%
faster when including the runtime of the scalar C wrapper.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0ce77f9eb71051c9a6121ec12c2abaac99ee628a
---
common/x86/const-a.asm | 3 +++
common/x86/mc-a2.asm | 65 ++++++++++++++++++++++++++++++++++++++++++++++-
common/x86/mc-c.c | 2 ++
common/x86/trellis-64.asm | 6 ++---
4 files changed, 72 insertions(+), 4 deletions(-)
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index dba36d2..ea61c81 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -38,6 +38,8 @@ const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
const pd_1, times 8 dd 1
+const pd_0123, dd 0,1,2,3
+const pd_4567, dd 4,5,6,7
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
@@ -63,6 +65,7 @@ const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
+const pd_8, times 4 dd 8
const pd_32, times 4 dd 32
const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index f5c3418..b2b5641 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -94,6 +94,8 @@ cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
cextern pw_0to15
+cextern pd_8
+cextern pd_0123
cextern pd_ffff
%macro LOAD_ADD 4
@@ -2178,7 +2180,7 @@ MBTREE_AVX
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
-; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
+; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
; int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
@@ -2268,6 +2270,67 @@ MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST
+INIT_YMM avx2 ; AVX2 version of mbtree_propagate_list_internal; x86inc: m# = 256-bit ymm, xm# = its low 128 bits
+cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8 ; r0=mvs r1=propagate_amount r2=lowres_costs r3=output; on UNIX64 r4=bipred_weight r5=mb_y arrive in registers too; 8 vector regs
+ mova xm4, [pw_0xc000] ; per-word mask used below to test lists_used == 3 (presumably the top two bits of each lowres_cost, going by the constant's name)
+%if UNIX64
+ shl r4d, 9 ; bipred_weight << 9
+ shl r5d, 16 ; mb_y into the high word of a dword
+ movd xm5, r4d
+ movd xm6, r5d
+ vpbroadcastw xm5, xm5 ; bipred_weight << 9 in every word
+ vpbroadcastd m6, xm6 ; {0, mb_y} in every dword
+%else
+ vpbroadcastw xm5, r4m ; broadcast straight from the stack arguments, then shift in the vector domain
+ vpbroadcastd m6, r5m
+ psllw xm5, 9 ; bipred_weight << 9
+ pslld m6, 16 ; mb_y into the high word of every dword
+%endif
+ mov r4d, r6m ; r4 = len
+ lea r1, [r1+r4*2] ; point r1/r2/r0 at the end of their arrays (2-, 2- and 4-byte elements) ...
+ lea r2, [r2+r4*2]
+ lea r0, [r0+r4*4]
+ neg r4 ; ... and index them with a negative offset that counts up towards zero
+ por m6, [pd_0123] ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y (32-byte load: relies on pd_4567 being laid out directly after pd_0123)
+ vbroadcasti128 m7, [pw_31] ; per-word mask, ANDed with {x, y} below (presumably 31 = low five bits, going by the constant's name)
+.loop:
+ mova xm3, [r1+r4*2] ; 8 propagate_amount words
+ pand xm0, xm4, [r2+r4*2] ; keep only the masked bits of 8 lowres_costs words
+ pmulhrsw xm1, xm3, xm5 ; bipred_amount = (propagate_amount * bipred_weight + 32) >> 6
+ pcmpeqw xm0, xm4 ; all-ones in words where every masked bit is set
+ pblendvb xm3, xm3, xm1, xm0 ; (lists_used == 3) ? bipred_amount : propagate_amount
+ vpermq m3, m3, q1100 ; low lane = amounts 0-3 twice, high lane = amounts 4-7 twice
+
+ movu m0, [r0+r4*4] ; {x, y} for 8 motion vectors (unaligned load)
+ vbroadcasti128 m1, [pd_8] ; 8 in every dword, i.e. added to the low (x) word only
+ psraw m2, m0, 5
+ paddw m2, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
+ paddw m6, m1 ; i_mb_x += 8
+ mova [r3], m2 ; first 32 bytes of this group's output: 8 {mbx, mby} pairs
+
+ mova m1, [pw_32]
+ pand m0, m7 ; {x, y} masked with pw_31
+ psubw m1, m0 ; pw_32 - masked {x, y}
+ packuswb m1, m0 ; {32-x, 32-y} {x, y} {32-x, 32-y} {x, y}
+ psrlw m0, m1, 3 ; each word has an x term in its low byte and a y term in its high byte; >> 3 moves the y term up to bit 5
+ pand m1, [pw_00ff] ; 32-x x 32-x x
+ pandn m0, m7, m0 ; (32-y y 32-y y) << 5
+ pshufd m2, m1, q1032 ; swap the 64-bit halves of each lane: x 32-x x 32-x
+ pmullw m1, m0 ; idx0 idx3 idx0 idx3
+ pmullw m2, m0 ; idx1 idx2 idx1 idx2
+
+ pmulhrsw m0, m1, m3 ; (idx0 idx3 idx0 idx3) * propagate_amount + 512 >> 10
+ pmulhrsw m2, m3 ; (idx1 idx2 idx1 idx2) * propagate_amount + 512 >> 10
+ psignw m0, m1 ; correct potential overflow in the idx0 input to pmulhrsw
+ punpcklwd m1, m0, m2 ; idx01weight
+ punpckhwd m2, m0 ; idx23weight
+ mova [r3+32], m1 ; second and third 32-byte rows of this group's output
+ mova [r3+64], m2
+ add r3, 3*mmsize ; 96 bytes of output per 8 motion vectors
+ add r4, 8
+ jl .loop ; continue while the negative index has not reached zero; NOTE(review): always works in groups of 8, so buffers are presumably padded when len % 8 != 0 - confirm against the C wrapper
+ RET
+
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 21acdeb..3bff408 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -532,6 +532,7 @@ do\
PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
+PROPAGATE_LIST(avx2)
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
@@ -843,6 +844,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
}
diff --git a/common/x86/trellis-64.asm b/common/x86/trellis-64.asm
index bb1282d..0c25914 100644
--- a/common/x86/trellis-64.asm
+++ b/common/x86/trellis-64.asm
@@ -53,14 +53,14 @@
SECTION_RODATA
-pd_8: times 4 dd 8
pd_m16: times 4 dd -16
-pd_0123: dd 0, 1, 2, 3
-pd_4567: dd 4, 5, 6, 7
sq_1: dq 1, 0
pq_128: times 2 dq 128
pq_ffffffff: times 2 dq 0xffffffff
+cextern pd_8
+cextern pd_0123
+cextern pd_4567
cextern cabac_entropy
cextern cabac_transition
cextern cabac_size_unary
More information about the x264-devel
mailing list