[x265] [PATCH 4 of 4] asm: improve AVX2 sad_x4[32xN] with a new, faster algorithm
Min Chen
chenm003 at 163.com
Tue Jun 23 02:54:57 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1435019994 25200
# Node ID e3c31f11936b5e915ec773200e4c1b1d8db2730f
# Parent 965b507acd52ad89160723253b770ef0036c71a5
asm: improve AVX2 sad_x4[32xN] with a new, faster algorithm
Old:
                 speedup   avx2 cycles    C cycles
sad_x4[32x32]     41.91x       2379.45    99724.21
sad_x4[32x16]     35.79x       1395.48    49947.39
sad_x4[32x24]     39.03x       1890.22    73777.83
sad_x4[32x64]     39.64x       4997.68   198107.81
New:
                 speedup   avx2 cycles    C cycles
sad_x4[32x32]     60.80x       1672.85   101713.55
sad_x4[32x16]     50.97x        989.42    50435.25
sad_x4[32x24]     55.34x       1416.17    78370.77
sad_x4[32x64]     70.01x       2830.01   198127.63
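
The rewritten macros drop the old vbroadcasti128/vinserti128 shuffling (which paired two reference blocks per ymm register) and instead load each 32-byte reference row whole, run psadbw of all four references against the same fenc row, and use packusdw to interleave two references per accumulator; SAD_X4_32P_END_AVX2 folds the per-reference totals out once at the end. The C intrinsics below are a minimal sketch of that scheme, not part of the patch: the function name, the plain row loop, the height parameter, and the assumptions that FENC_STRIDE is 64 and the fenc rows are 32-byte aligned are illustrative only.

#include <immintrin.h>
#include <stdint.h>

/* Illustrative only: the real primitive is the hand-written asm in this patch. */
static void sad_x4_32xN_sketch(const uint8_t *fenc, const uint8_t *p0,
                               const uint8_t *p1, const uint8_t *p2,
                               const uint8_t *p3, intptr_t stride,
                               int32_t scores[4], int height)
{
    __m256i acc01 = _mm256_setzero_si256();   /* interleaved ref0/ref1 sums (m0) */
    __m256i acc23 = _mm256_setzero_si256();   /* interleaved ref2/ref3 sums (m1) */

    for (int y = 0; y < height; y++)
    {
        /* mova m4/m6: the fenc row is assumed 32-byte aligned */
        __m256i enc = _mm256_load_si256((const __m256i *)fenc);

        /* psadbw: four partial sums per register, one per 8-byte group */
        __m256i s0 = _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)p0), enc);
        __m256i s1 = _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)p1), enc);
        __m256i s2 = _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)p2), enc);
        __m256i s3 = _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)p3), enc);

        /* packusdw: each partial sum covers 8 pixels, so it is at most
         * 8*255 = 2040 and the unsigned word saturation never clips;
         * this packs two references into one accumulator register */
        acc01 = _mm256_add_epi32(acc01, _mm256_packus_epi32(s0, s1));
        acc23 = _mm256_add_epi32(acc23, _mm256_packus_epi32(s2, s3));

        fenc += 64;                           /* FENC_STRIDE, assumed 64 */
        p0 += stride; p1 += stride; p2 += stride; p3 += stride;
    }

    /* SAD_X4_32P_END_AVX2: fold the high lane onto the low one, then phaddd
     * collapses the two remaining partial sums per reference */
    __m128i r01 = _mm_add_epi32(_mm256_castsi256_si128(acc01),
                                _mm256_extracti128_si256(acc01, 1));
    __m128i r23 = _mm_add_epi32(_mm256_castsi256_si128(acc23),
                                _mm256_extracti128_si256(acc23, 1));
    /* the asm uses an aligned store (mova [r0], xm0) */
    _mm_storeu_si128((__m128i *)scores, _mm_hadd_epi32(r01, r23));
}

Counting from the diff, the saving is mostly in memory traffic: the old macros needed 20 half-width loads/broadcasts/inserts per pair of rows, while the new ones need only 10 full-width loads (one aligned fenc load plus four unaligned reference loads per row).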
---
source/common/x86/sad-a.asm | 140 +++++++++++++++++++++----------------------
1 files changed, 68 insertions(+), 72 deletions(-)
diff -r 965b507acd52 -r e3c31f11936b source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Jun 22 17:39:51 2015 -0700
+++ b/source/common/x86/sad-a.asm Mon Jun 22 17:39:54 2015 -0700
@@ -2785,81 +2785,61 @@
%endmacro
%macro SAD_X4_START_2x32P_AVX2 0
- vbroadcasti128 m4, [r0]
- vbroadcasti128 m5, [r0+FENC_STRIDE]
- movu xm0, [r1]
- movu xm1, [r2]
- movu xm2, [r1+r5]
- movu xm3, [r2+r5]
- vinserti128 m0, m0, [r3], 1
- vinserti128 m1, m1, [r4], 1
- vinserti128 m2, m2, [r3+r5], 1
- vinserti128 m3, m3, [r4+r5], 1
- psadbw m0, m4
- psadbw m1, m4
- psadbw m2, m5
- psadbw m3, m5
- paddw m0, m2
- paddw m1, m3
-
- vbroadcasti128 m6, [r0+16]
- vbroadcasti128 m7, [r0+FENC_STRIDE+16]
- movu xm2, [r1+16]
- movu xm3, [r2+16]
- movu xm4, [r1+r5+16]
- movu xm5, [r2+r5+16]
- vinserti128 m2, m2, [r3+16], 1
- vinserti128 m3, m3, [r4+16], 1
- vinserti128 m4, m4, [r3+r5+16], 1
- vinserti128 m5, m5, [r4+r5+16], 1
- psadbw m2, m6
- psadbw m3, m6
- psadbw m4, m7
- psadbw m5, m7
- paddd m0, m2
- paddd m1, m3
- paddd m0, m4
- paddd m1, m5
+ mova m4, [r0]
+ movu m0, [r1]
+ movu m2, [r2]
+ movu m1, [r3]
+ movu m3, [r4]
+ psadbw m0, m4
+ psadbw m2, m4
+ psadbw m1, m4
+ psadbw m3, m4
+ packusdw m0, m2
+ packusdw m1, m3
+
+ mova m6, [r0+FENC_STRIDE]
+ movu m2, [r1+r5]
+ movu m4, [r2+r5]
+ movu m3, [r3+r5]
+ movu m5, [r4+r5]
+ psadbw m2, m6
+ psadbw m4, m6
+ psadbw m3, m6
+ psadbw m5, m6
+ packusdw m2, m4
+ packusdw m3, m5
+ paddd m0, m2
+ paddd m1, m3
%endmacro
%macro SAD_X4_2x32P_AVX2 4
- vbroadcasti128 m6, [r0+%1]
- vbroadcasti128 m7, [r0+%3]
- movu xm2, [r1+%2]
- movu xm3, [r2+%2]
- movu xm4, [r1+%4]
- movu xm5, [r2+%4]
- vinserti128 m2, m2, [r3+%2], 1
- vinserti128 m3, m3, [r4+%2], 1
- vinserti128 m4, m4, [r3+%4], 1
- vinserti128 m5, m5, [r4+%4], 1
- psadbw m2, m6
- psadbw m3, m6
- psadbw m4, m7
- psadbw m5, m7
- paddd m0, m2
- paddd m1, m3
- paddd m0, m4
- paddd m1, m5
-
- vbroadcasti128 m6, [r0+%1+16]
- vbroadcasti128 m7, [r0+%3+16]
- movu xm2, [r1+%2+16]
- movu xm3, [r2+%2+16]
- movu xm4, [r1+%4+16]
- movu xm5, [r2+%4+16]
- vinserti128 m2, m2, [r3+%2+16], 1
- vinserti128 m3, m3, [r4+%2+16], 1
- vinserti128 m4, m4, [r3+%4+16], 1
- vinserti128 m5, m5, [r4+%4+16], 1
- psadbw m2, m6
- psadbw m3, m6
- psadbw m4, m7
- psadbw m5, m7
- paddd m0, m2
- paddd m1, m3
- paddd m0, m4
- paddd m1, m5
+ mova m6, [r0+%1]
+ movu m2, [r1+%2]
+ movu m4, [r2+%2]
+ movu m3, [r3+%2]
+ movu m5, [r4+%2]
+ psadbw m2, m6
+ psadbw m4, m6
+ psadbw m3, m6
+ psadbw m5, m6
+ packusdw m2, m4
+ packusdw m3, m5
+ paddd m0, m2
+ paddd m1, m3
+
+ mova m6, [r0+%3]
+ movu m2, [r1+%4]
+ movu m4, [r2+%4]
+ movu m3, [r3+%4]
+ movu m5, [r4+%4]
+ psadbw m2, m6
+ psadbw m4, m6
+ psadbw m3, m6
+ psadbw m5, m6
+ packusdw m2, m4
+ packusdw m3, m5
+ paddd m0, m2
+ paddd m1, m3
%endmacro
%macro SAD_X4_4x32P_AVX2 2
@@ -2905,6 +2885,17 @@
RET
%endmacro
+%macro SAD_X4_32P_END_AVX2 0
+ mov r0, r6mp
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ paddd xm0, xm2
+ paddd xm1, xm3
+ phaddd xm0, xm1
+ mova [r0], xm0
+ RET
+%endmacro
+
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
@@ -3417,7 +3408,12 @@
SAD_X%1_4x%2P_AVX2 x, %3/4
%assign x x+1
%endrep
+
+ %if (%1==4) && (%2==32)
+ SAD_X%1_32P_END_AVX2
+ %else
SAD_X%1_END_AVX2
+ %endif
%endmacro
INIT_YMM avx2