[x265] [PATCH] asm: addAvg avx2 code for luma width >= 8
dnyaneshwar at multicorewareinc.com
Thu Mar 19 06:54:47 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1426744326 -19800
# Thu Mar 19 11:22:06 2015 +0530
# Node ID 2807d9a5a494de78340ab6d09867205b6676330b
# Parent cbfa66e0b50cc2393ccbcf6471406504c6c06011
asm: addAvg avx2 code for luma width >= 8
AVX2 performance improvement over SSE4 asm (in CPU cycles):
                    AVX2       SSE4
addAvg[ 8x8]      207.85     257.95
addAvg[ 8x4]      160.07     166.73
addAvg[16x16]     517.91     704.17
addAvg[ 16x8]     265.46     366.31
addAvg[ 8x16]     426.98     510.07
addAvg[ 16x4]     145.12     213.41
addAvg[16x12]     358.53     545.57
addAvg[12x16]     768.90     808.42
addAvg[32x32]    1386.84    2566.90
addAvg[32x16]     692.19    1088.32
addAvg[16x32]     847.97    1355.29
addAvg[ 32x8]     397.61     650.59
addAvg[32x24]    1245.11    1860.74
addAvg[ 8x32]     941.76     885.06
addAvg[24x32]    1745.70    2055.57
addAvg[64x64]    5541.37    9395.08
addAvg[64x32]    2566.79    4392.35
addAvg[32x64]    3033.06    4320.25
addAvg[64x16]    1493.78    2148.42
addAvg[64x48]    4478.40    7165.04
addAvg[16x64]    1681.48    2201.52
addAvg[48x64]    4322.58    6869.51
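
For context, a minimal C sketch (not part of the patch) of what the 8-bit addAvg primitive computes, following the addAvg_c reference in pixel.cpp; bx/by are plain parameters here where the real reference is templated on block size, and clip_u8 is an illustrative helper. In the new kernels below, paddw forms src0 + src1, pmulhrsw against pw_256 performs the rounded shift by 7, paddw against pw_128 adds the remaining 16384 >> 7 of the offset, and packuswb does the final clip to [0, 255].

/* Not part of the patch: minimal C sketch of the 8-bit addAvg operation,
 * assuming IF_INTERNAL_PREC == 14 and IF_INTERNAL_OFFS == (1 << 13) as in
 * 8-bit x265 builds; clip_u8 and the bx/by parameters are illustrative. */
#include <stdint.h>

static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void addAvg_ref(const int16_t* src0, const int16_t* src1, uint8_t* dst,
                       intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                       int bx, int by)
{
    const int shiftNum = 7;       /* IF_INTERNAL_PREC + 1 - 8 */
    const int offset   = 16448;   /* (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS */

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            dst[x] = clip_u8((src0[x] + src1[x] + offset) >> shiftNum);

        src0 += src0Stride;
        src1 += src1Stride;
        dst  += dstStride;
    }
}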
diff -r cbfa66e0b50c -r 2807d9a5a494 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Mar 18 18:16:51 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 19 11:22:06 2015 +0530
@@ -1417,6 +1417,35 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.pu[LUMA_8x4].addAvg = x265_addAvg_8x4_avx2;
+ p.pu[LUMA_8x8].addAvg = x265_addAvg_8x8_avx2;
+ p.pu[LUMA_8x16].addAvg = x265_addAvg_8x16_avx2;
+ p.pu[LUMA_8x32].addAvg = x265_addAvg_8x32_avx2;
+
+ p.pu[LUMA_12x16].addAvg = x265_addAvg_12x16_avx2;
+
+ p.pu[LUMA_16x4].addAvg = x265_addAvg_16x4_avx2;
+ p.pu[LUMA_16x8].addAvg = x265_addAvg_16x8_avx2;
+ p.pu[LUMA_16x12].addAvg = x265_addAvg_16x12_avx2;
+ p.pu[LUMA_16x16].addAvg = x265_addAvg_16x16_avx2;
+ p.pu[LUMA_16x32].addAvg = x265_addAvg_16x32_avx2;
+ p.pu[LUMA_16x64].addAvg = x265_addAvg_16x64_avx2;
+
+ p.pu[LUMA_24x32].addAvg = x265_addAvg_24x32_avx2;
+
+ p.pu[LUMA_32x8].addAvg = x265_addAvg_32x8_avx2;
+ p.pu[LUMA_32x16].addAvg = x265_addAvg_32x16_avx2;
+ p.pu[LUMA_32x24].addAvg = x265_addAvg_32x24_avx2;
+ p.pu[LUMA_32x32].addAvg = x265_addAvg_32x32_avx2;
+ p.pu[LUMA_32x64].addAvg = x265_addAvg_32x64_avx2;
+
+ p.pu[LUMA_48x64].addAvg = x265_addAvg_48x64_avx2;
+
+ p.pu[LUMA_64x16].addAvg = x265_addAvg_64x16_avx2;
+ p.pu[LUMA_64x32].addAvg = x265_addAvg_64x32_avx2;
+ p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2;
+ p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2;
+
p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
diff -r cbfa66e0b50c -r 2807d9a5a494 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Wed Mar 18 18:16:51 2015 -0500
+++ b/source/common/x86/mc-a.asm Thu Mar 19 11:22:06 2015 +0530
@@ -1759,7 +1759,492 @@
ADDAVG_W16_H4 24
;-----------------------------------------------------------------------------
-
+; addAvg avx2 code start
+;-----------------------------------------------------------------------------
+
+%macro ADDAVG_W8_H4_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_8x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+ mov r6d, %1/4
+
+.loop:
+ movu xm0, [r0]
+ vinserti128 m0, m0, [r0 + r3], 1
+
+ movu xm2, [r1]
+ vinserti128 m2, m2, [r1 + r4], 1
+
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movq [r2], xm0
+ movq [r2 + r5], xm1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu xm0, [r0]
+ vinserti128 m0, m0, [r0 + r3], 1
+
+ movu xm2, [r1]
+ vinserti128 m2, m2, [r1 + r4], 1
+
+ paddw m0, m2
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movq [r2], xm0
+ movq [r2 + r5], xm1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W8_H4_AVX2 4
+ADDAVG_W8_H4_AVX2 8
+ADDAVG_W8_H4_AVX2 16
+ADDAVG_W8_H4_AVX2 32
+
+%macro ADDAVG_W12_H4_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_12x%1, 6,7,7, pSrc0, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+ mov r6d, %1/4
+
+.loop:
+ movu xm0, [r0]
+ movu xm1, [r1]
+ movq xm2, [r0 + 16]
+ movq xm3, [r1 + 16]
+ vinserti128 m0, m0, xm2, 1
+ vinserti128 m1, m1, xm3, 1
+
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu xm1, [r0 + r3]
+ movu xm2, [r1 + r4]
+ movq xm3, [r0 + r3 + 16]
+ movq xm6, [r1 + r4 + 16]
+ vinserti128 m1, m1, xm3, 1
+ vinserti128 m2, m2, xm6, 1
+
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [r2], xm0
+ movd [r2 + 8], xm1
+ vpshufd m1, m1, 2
+ movhps [r2 + r5], xm0
+ movd [r2 + r5 + 8], xm1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu xm0, [r0]
+ movu xm1, [r1]
+ movq xm2, [r0 + 16]
+ movq xm3, [r1 + 16]
+ vinserti128 m0, m0, xm2, 1
+ vinserti128 m1, m1, xm3, 1
+
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu xm1, [r0 + r3]
+ movu xm2, [r1 + r4]
+ movq xm3, [r0 + r3 + 16]
+ movq xm6, [r1 + r4 + 16]
+ vinserti128 m1, m1, xm3, 1
+ vinserti128 m2, m2, xm6, 1
+
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [r2], xm0
+ movd [r2 + 8], xm1
+ vpshufd m1, m1, 2
+ movhps [r2 + r5], xm0
+ movd [r2 + r5 + 8], xm1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W12_H4_AVX2 16
+
+%macro ADDAVG_W16_H4_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_16x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+ mov r6d, %1/4
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + r3]
+ movu m2, [r1 + r4]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ vextracti128 [r2], m0, 0
+ vextracti128 [r2 + r5], m0, 1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + r3]
+ movu m2, [r1 + r4]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ vextracti128 [r2], m0, 0
+ vextracti128 [r2 + r5], m0, 1
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W16_H4_AVX2 4
+ADDAVG_W16_H4_AVX2 8
+ADDAVG_W16_H4_AVX2 12
+ADDAVG_W16_H4_AVX2 16
+ADDAVG_W16_H4_AVX2 32
+ADDAVG_W16_H4_AVX2 64
+
+%macro ADDAVG_W24_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_24x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+ mov r6d, %1/2
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu xm1, [r0 + 32]
+ movu xm2, [r1 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 10001101b
+ vextracti128 [r2], m0, 1
+ movq [r2 + 16], xm0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu xm1, [r0 + r3 + 32]
+ movu xm2, [r1 + r4 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 10001101b
+ vextracti128 [r2 + r5], m0, 1
+ movq [r2 + r5 + 16], xm0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W24_H2_AVX2 32
+
+%macro ADDAVG_W32_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_32x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+ mov r6d, %1/2
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + r3 + 32]
+ movu m2, [r1 + r4 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2 + r5], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W32_H2_AVX2 8
+ADDAVG_W32_H2_AVX2 16
+ADDAVG_W32_H2_AVX2 24
+ADDAVG_W32_H2_AVX2 32
+ADDAVG_W32_H2_AVX2 64
+
+%macro ADDAVG_W64_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+ mov r6d, %1/2
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2], m0
+
+ movu m0, [r0 + 64]
+ movu m1, [r1 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 96]
+ movu m2, [r1 + 96]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2 + 32], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + r3 + 32]
+ movu m2, [r1 + r4 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + r3 + 64]
+ movu m1, [r1 + r4 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + r3 + 96]
+ movu m2, [r1 + r4 + 96]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2 + r5 + 32], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W64_H2_AVX2 16
+ADDAVG_W64_H2_AVX2 32
+ADDAVG_W64_H2_AVX2 48
+ADDAVG_W64_H2_AVX2 64
+
+%macro ADDAVG_W48_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1Stride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
+ add r3, r3
+ add r4, r4
+ mov r6d, %1/2
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2], m0
+
+ movu m0, [r0 + 64]
+ movu m1, [r1 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ vpermq m0, m0, 11011000b
+ vextracti128 [r2 + 32], m0, 0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + r3 + 32]
+ movu m2, [r1 + r4 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + r3 + 64]
+ movu m1, [r1 + r4 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ vpermq m0, m0, 11011000b
+ vextracti128 [r2 + r5 + 32], m0, 0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W48_H2_AVX2 64
+
+;-----------------------------------------------------------------------------
+; addAvg avx2 code end
+;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W24_H2 2
diff -r cbfa66e0b50c -r 2807d9a5a494 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Mar 18 18:16:51 2015 -0500
+++ b/source/common/x86/pixel.h Thu Mar 19 11:22:06 2015 +0530
@@ -180,7 +180,8 @@
int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
#define ADDAVG(func) \
- void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
+ void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+ void x265_ ## func ## _avx2(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
ADDAVG(addAvg_2x4)
ADDAVG(addAvg_2x8)
ADDAVG(addAvg_4x2);
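
Reviewer note (not part of the diff): for 8-bit depth, the pmulhrsw/pw_256 followed by paddw/pw_128 sequence used above matches the reference's single (sum + 16448) >> 7 for every signed 16-bit value of src0[x] + src1[x]. A throwaway check of that identity, assuming arithmetic right shift on negative values (as the C reference also relies on); pmulhrsw_lane is an illustrative helper, not an x265 function:

#include <assert.h>
#include <stdint.h>

/* Emulates PMULHRSW on a single signed 16-bit lane. */
static int16_t pmulhrsw_lane(int16_t a, int16_t b)
{
    return (int16_t)((((int32_t)a * b >> 14) + 1) >> 1);
}

int main(void)
{
    for (int sum = -32768; sum <= 32767; sum++)               /* sum = src0[x] + src1[x] */
    {
        int asmval = pmulhrsw_lane((int16_t)sum, 256) + 128;  /* pw_256, then pw_128 */
        int refval = (sum + 16448) >> 7;                      /* reference value before the clip */
        assert(asmval == refval);
    }
    return 0;
}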