[x265] [PATCH 3 of 4] asm: AVX2 of SAD_x4[32xN]
Min Chen
chenm003 at 163.com
Tue Jun 23 02:54:56 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1435019991 25200
# Node ID 965b507acd52ad89160723253b770ef0036c71a5
# Parent 64167d1ad6d81c6c2d2ab762592d131860825fe9
asm: AVX2 of SAD_x4[32xN]
AVX (speedup over C / optimized cycles / C cycles):
sad_x4[32x32]  36.69x   2843.87  104330.24
sad_x4[32x16]  35.67x   1547.93   55217.42
sad_x4[32x24]  34.01x   2161.25   73503.10
sad_x4[32x64]  38.73x   5122.28  198363.05
AVX2 (speedup over C / optimized cycles / C cycles):
sad_x4[32x32]  41.91x   2379.45   99724.21
sad_x4[32x16]  35.79x   1395.48   49947.39
sad_x4[32x24]  39.03x   1890.22   73777.83
sad_x4[32x64]  39.64x   4997.68  198107.81
---
source/common/x86/asm-primitives.cpp | 5 ++
source/common/x86/sad-a.asm | 103 ++++++++++++++++++++++++++++++++++
2 files changed, 108 insertions(+), 0 deletions(-)
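Note for reviewers: sad_x4 scores one encoder block against four motion-candidate reference blocks in a single call and returns the four sums through a result array; the argument order is that of x265's sad_x4_t (fenc, four references, one shared reference stride, res). Below is a minimal scalar sketch of the 32xN case, with an illustrative function name and an explicit height parameter instead of the per-partition instantiation the real C reference uses:

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of sad_x4 for a 32xN partition: the encoder block uses the
 * fixed FENC_STRIDE (64 in x265), the four reference candidates share one
 * stride, and the four SADs are returned through res[0..3]. */
static void sad_x4_32xN_c(const uint8_t *fenc, const uint8_t *fref0,
                          const uint8_t *fref1, const uint8_t *fref2,
                          const uint8_t *fref3, intptr_t frefstride,
                          int height, int32_t *res)
{
    const uint8_t *fref[4] = { fref0, fref1, fref2, fref3 };
    res[0] = res[1] = res[2] = res[3] = 0;
    for (int y = 0; y < height; y++)
    {
        for (int i = 0; i < 4; i++)
            for (int x = 0; x < 32; x++)
                res[i] += abs(fenc[x] - fref[i][x]);
        fenc += 64;                        /* FENC_STRIDE */
        fref[0] += frefstride; fref[1] += frefstride;
        fref[2] += frefstride; fref[3] += frefstride;
    }
}

The assembly below computes the same four sums while sharing each fenc load across all four candidates.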
diff -r 64167d1ad6d8 -r 965b507acd52 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jun 22 17:39:48 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Mon Jun 22 17:39:51 2015 -0700
@@ -2747,6 +2747,11 @@
p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx2);
p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx2);
p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx2);
+ p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx2);
+ p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx2);
+ p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx2);
+ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx2);
+ p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx2);
p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
diff -r 64167d1ad6d8 -r 965b507acd52 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Jun 22 17:39:48 2015 -0700
+++ b/source/common/x86/sad-a.asm Mon Jun 22 17:39:51 2015 -0700
@@ -2784,6 +2784,103 @@
%endif
%endmacro
+%macro SAD_X4_START_2x32P_AVX2 0
+ vbroadcasti128 m4, [r0]
+ vbroadcasti128 m5, [r0+FENC_STRIDE]
+ movu xm0, [r1]
+ movu xm1, [r2]
+ movu xm2, [r1+r5]
+ movu xm3, [r2+r5]
+ vinserti128 m0, m0, [r3], 1
+ vinserti128 m1, m1, [r4], 1
+ vinserti128 m2, m2, [r3+r5], 1
+ vinserti128 m3, m3, [r4+r5], 1
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m5
+ psadbw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+
+ vbroadcasti128 m6, [r0+16]
+ vbroadcasti128 m7, [r0+FENC_STRIDE+16]
+ movu xm2, [r1+16]
+ movu xm3, [r2+16]
+ movu xm4, [r1+r5+16]
+ movu xm5, [r2+r5+16]
+ vinserti128 m2, m2, [r3+16], 1
+ vinserti128 m3, m3, [r4+16], 1
+ vinserti128 m4, m4, [r3+r5+16], 1
+ vinserti128 m5, m5, [r4+r5+16], 1
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m4
+ paddd m1, m5
+%endmacro
+
+%macro SAD_X4_2x32P_AVX2 4
+ vbroadcasti128 m6, [r0+%1]
+ vbroadcasti128 m7, [r0+%3]
+ movu xm2, [r1+%2]
+ movu xm3, [r2+%2]
+ movu xm4, [r1+%4]
+ movu xm5, [r2+%4]
+ vinserti128 m2, m2, [r3+%2], 1
+ vinserti128 m3, m3, [r4+%2], 1
+ vinserti128 m4, m4, [r3+%4], 1
+ vinserti128 m5, m5, [r4+%4], 1
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m4
+ paddd m1, m5
+
+ vbroadcasti128 m6, [r0+%1+16]
+ vbroadcasti128 m7, [r0+%3+16]
+ movu xm2, [r1+%2+16]
+ movu xm3, [r2+%2+16]
+ movu xm4, [r1+%4+16]
+ movu xm5, [r2+%4+16]
+ vinserti128 m2, m2, [r3+%2+16], 1
+ vinserti128 m3, m3, [r4+%2+16], 1
+ vinserti128 m4, m4, [r3+%4+16], 1
+ vinserti128 m5, m5, [r4+%4+16], 1
+ psadbw m2, m6
+ psadbw m3, m6
+ psadbw m4, m7
+ psadbw m5, m7
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m4
+ paddd m1, m5
+%endmacro
+
+%macro SAD_X4_4x32P_AVX2 2
+%if %1==0
+ lea r6, [r5*3]
+ SAD_X4_START_2x32P_AVX2
+%else
+ SAD_X4_2x32P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+ SAD_X4_2x32P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
+%endif
+%endmacro
+
%macro SAD_X3_END_AVX2 0
movifnidn r5, r5mp
packssdw m0, m1 ; 0 0 1 1 0 0 1 1
@@ -3333,6 +3430,12 @@
SAD_X_AVX2 4, 16, 12, 8
SAD_X_AVX2 4, 16, 8, 8
+SAD_X_AVX2 4, 32, 8, 8
+SAD_X_AVX2 4, 32, 16, 8
+SAD_X_AVX2 4, 32, 24, 8
+SAD_X_AVX2 4, 32, 32, 8
+SAD_X_AVX2 4, 32, 64, 8
+
;=============================================================================
; SAD cacheline split
;=============================================================================
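Note on the new macros: each fenc row is broadcast into both 128-bit lanes (vbroadcasti128) while vinserti128 packs two reference rows into one ymm register, so m0 accumulates fref0 (low lane) and fref2 (high lane), m1 accumulates fref1 and fref3, and every psadbw scores two candidates at once; each 32-pixel row is handled as two 16-byte halves. SAD_X4_4x32P_AVX2 consumes four rows per invocation, advancing r0 only every other group (8*FENC_STRIDE) and re-basing r1..r4 by 4*r5 between groups, with the per-candidate reduction left to the existing SAD_X4 end code (cf. SAD_X3_END_AVX2 above). The following is a simplified C intrinsics model of that register layout under my reading of the code; the reduction is hand-rolled here and FENC_STRIDE is assumed to be 64, so treat it as a sketch rather than a drop-in equivalent:

#include <immintrin.h>
#include <stdint.h>

/* Simplified intrinsics model of the SAD_X4_*_32P_AVX2 register layout:
 * acc02 accumulates fref0 (low 128-bit lane) and fref2 (high lane),
 * acc13 accumulates fref1 and fref3.  Each 32-pixel row is processed as
 * two 16-byte halves, mirroring the [..] / [..+16] pairs in the assembly. */
static void sad_x4_32xN_avx2_model(const uint8_t *fenc,
                                   const uint8_t *fref0, const uint8_t *fref1,
                                   const uint8_t *fref2, const uint8_t *fref3,
                                   intptr_t frefstride, int height, int32_t *res)
{
    __m256i acc02 = _mm256_setzero_si256();
    __m256i acc13 = _mm256_setzero_si256();
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < 32; x += 16)
        {
            /* vbroadcasti128: the same 16 fenc bytes in both lanes */
            __m256i e = _mm256_broadcastsi128_si256(
                            _mm_loadu_si128((const __m128i *)(fenc + x)));
            /* movu xm / vinserti128: two reference rows per ymm */
            __m256i r02 = _mm256_inserti128_si256(
                _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(fref0 + x))),
                _mm_loadu_si128((const __m128i *)(fref2 + x)), 1);
            __m256i r13 = _mm256_inserti128_si256(
                _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(fref1 + x))),
                _mm_loadu_si128((const __m128i *)(fref3 + x)), 1);
            /* psadbw + paddd: each instruction scores two candidates */
            acc02 = _mm256_add_epi32(acc02, _mm256_sad_epu8(e, r02));
            acc13 = _mm256_add_epi32(acc13, _mm256_sad_epu8(e, r13));
        }
        fenc  += 64;          /* FENC_STRIDE */
        fref0 += frefstride; fref1 += frefstride;
        fref2 += frefstride; fref3 += frefstride;
    }
    /* fold the two 64-bit partials of each lane into one SAD per candidate */
    int32_t t[8];
    _mm256_storeu_si256((__m256i *)t, acc02);
    res[0] = t[0] + t[2];     /* fref0 */
    res[2] = t[4] + t[6];     /* fref2 */
    _mm256_storeu_si256((__m256i *)t, acc13);
    res[1] = t[0] + t[2];     /* fref1 */
    res[3] = t[4] + t[6];     /* fref3 */
}

The 32-bit accumulation (the paddd chain after the first paddw pair) is what keeps the largest case in range: each 64-bit accumulator slot collects 16 pixels per row because both 16-byte halves land in the same register, so for 32x64 it can reach 16 * 64 * 255 = 261120, which overflows 16 bits.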