[x265] [PATCH] asm: addAvg avx2 code for high_bit_depth sizes >= 8, improved by ~45% over previous code
dnyaneshwar at multicorewareinc.com
Thu May 14 07:45:34 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431582271 -19800
# Thu May 14 11:14:31 2015 +0530
# Node ID 722ec5cd93ab072a182f0d94ad53ce4e8ad34f94
# Parent 479087422e29a672d6e9bc8d0cd2a65649d71fe2
asm: addAvg avx2 code for high_bit_depth sizes >= 8, improved by ~45% over previous code
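
For reference, here is a minimal scalar sketch of what each kernel below computes, assuming the usual x265 addAvg semantics for 10-bit builds (the int16 sources are interpolation intermediates biased by IF_INTERNAL_OFFS = 8192, with IF_INTERNAL_PREC = 14). The function name and loop shape here are illustrative, not the tree's C primitive:

    // hedged reference sketch: scalar equivalent of the 10-bit addAvg kernels below
    #include <cstdint>
    #include <algorithm>

    static void addAvg_ref_10bit(const int16_t* src0, const int16_t* src1, uint16_t* dst,
                                 intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                                 int width, int height)
    {
        const int shift  = 5;              // IF_INTERNAL_PREC + 1 - bitDepth = 14 + 1 - 10
        const int offset = 16 + 2 * 8192;  // rounding term + 2 * IF_INTERNAL_OFFS = 16400

        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + offset) >> shift;
                dst[x] = (uint16_t)std::min(std::max(v, 0), 1023);  // clamp to the 10-bit range
            }
            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }

The AVX2 kernels map this onto: paddw (src0 + src1), pmulhrsw with pw_1024 (a rounding shift right by 5, since ((a * 1024) + 16384) >> 15 == (a + 16) >> 5), paddw with pw_512 (the remaining 16384 >> 5 of the offset), then pmaxsw/pminsw against zero and pw_1023 for the clamp. This is also why pw_1023 in const-a.asm is widened from 8 to 16 words, so it can be loaded as a full 32-byte ymm constant.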
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Thu May 14 11:14:31 2015 +0530
@@ -1181,6 +1181,29 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.pu[LUMA_8x4].addAvg = x265_addAvg_8x4_avx2;
+ p.pu[LUMA_8x8].addAvg = x265_addAvg_8x8_avx2;
+ p.pu[LUMA_8x16].addAvg = x265_addAvg_8x16_avx2;
+ p.pu[LUMA_8x32].addAvg = x265_addAvg_8x32_avx2;
+ p.pu[LUMA_12x16].addAvg = x265_addAvg_12x16_avx2;
+ p.pu[LUMA_16x4].addAvg = x265_addAvg_16x4_avx2;
+ p.pu[LUMA_16x8].addAvg = x265_addAvg_16x8_avx2;
+ p.pu[LUMA_16x12].addAvg = x265_addAvg_16x12_avx2;
+ p.pu[LUMA_16x16].addAvg = x265_addAvg_16x16_avx2;
+ p.pu[LUMA_16x32].addAvg = x265_addAvg_16x32_avx2;
+ p.pu[LUMA_16x64].addAvg = x265_addAvg_16x64_avx2;
+ p.pu[LUMA_24x32].addAvg = x265_addAvg_24x32_avx2;
+ p.pu[LUMA_32x8].addAvg = x265_addAvg_32x8_avx2;
+ p.pu[LUMA_32x16].addAvg = x265_addAvg_32x16_avx2;
+ p.pu[LUMA_32x24].addAvg = x265_addAvg_32x24_avx2;
+ p.pu[LUMA_32x32].addAvg = x265_addAvg_32x32_avx2;
+ p.pu[LUMA_32x64].addAvg = x265_addAvg_32x64_avx2;
+ p.pu[LUMA_48x64].addAvg = x265_addAvg_48x64_avx2;
+ p.pu[LUMA_64x16].addAvg = x265_addAvg_64x16_avx2;
+ p.pu[LUMA_64x32].addAvg = x265_addAvg_64x32_avx2;
+ p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2;
+ p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2;
+
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/const-a.asm Thu May 14 11:14:31 2015 +0530
@@ -75,7 +75,7 @@
const pw_256, times 16 dw 256
const pw_257, times 16 dw 257
const pw_512, times 16 dw 512
-const pw_1023, times 8 dw 1023
+const pw_1023, times 16 dw 1023
const pw_1024, times 16 dw 1024
const pw_4096, times 16 dw 4096
const pw_00ff, times 16 dw 0x00ff
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/mc-a.asm Thu May 14 11:14:31 2015 +0530
@@ -1017,6 +1017,454 @@
ADDAVG_W64_H1 32
ADDAVG_W64_H1 48
ADDAVG_W64_H1 64
+
+;------------------------------------------------------------------------------
+; avx2 asm for addAvg high_bit_depth
+;------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro ADDAVG_W8_H4_AVX2 1
+cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m1, m1
+ add r3d, r3d
+ add r4d, r4d
+ add r5d, r5d
+ mov r6d, %1/4
+
+.loop:
+ movu m0, [r0]
+ vinserti128 m0, m0, [r0 + r3], 1
+ movu m2, [r1]
+ vinserti128 m2, m2, [r1 + r4], 1
+
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ vextracti128 xm2, m0, 1
+ movu [r2], xm0
+ movu [r2 + r5], xm2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu m0, [r0]
+ vinserti128 m0, m0, [r0 + r3], 1
+ movu m2, [r1]
+ vinserti128 m2, m2, [r1 + r4], 1
+
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ vextracti128 xm2, m0, 1
+ movu [r2], xm0
+ movu [r2 + r5], xm2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W8_H4_AVX2 4
+ADDAVG_W8_H4_AVX2 8
+ADDAVG_W8_H4_AVX2 12
+ADDAVG_W8_H4_AVX2 16
+ADDAVG_W8_H4_AVX2 32
+ADDAVG_W8_H4_AVX2 64
+
+cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m1, m1
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ mov r6d, 4
+
+.loop:
+%rep 2
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ vextracti128 xm2, m0, 1
+ movu [r2], xm0
+ movq [r2 + 16], xm2
+
+ movu m0, [r0 + r3]
+ movu m2, [r1 + r4]
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ vextracti128 xm2, m0, 1
+ movu [r2 + r5], xm0
+ movq [r2 + r5 + 16], xm2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ dec r6d
+ jnz .loop
+ RET
+
+%macro ADDAVG_W16_H4_AVX2 1
+cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ mov r6d, %1/4
+
+.loop:
+%rep 2
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W16_H4_AVX2 4
+ADDAVG_W16_H4_AVX2 8
+ADDAVG_W16_H4_AVX2 12
+ADDAVG_W16_H4_AVX2 16
+ADDAVG_W16_H4_AVX2 24
+ADDAVG_W16_H4_AVX2 32
+ADDAVG_W16_H4_AVX2 64
+
+cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m1, m1
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ mov r6d, 16
+
+.loop:
+ movu m0, [r0]
+ movu m2, [r1]
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu xm0, [r0 + 32]
+ movu xm2, [r1 + 32]
+ paddw xm0, xm2
+ pmulhrsw xm0, xm3
+ paddw xm0, xm4
+ pmaxsw xm0, xm1
+ pminsw xm0, xm5
+ movu [r2 + 32], xm0
+
+ movu m0, [r0 + r3]
+ movu m2, [r1 + r4]
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu xm2, [r0 + r3 + 32]
+ movu xm0, [r1 + r4 + 32]
+ paddw xm2, xm0
+ pmulhrsw xm2, xm3
+ paddw xm2, xm4
+ pmaxsw xm2, xm1
+ pminsw xm2, xm5
+ movu [r2 + r5 + 32], xm2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+
+%macro ADDAVG_W32_H2_AVX2 1
+cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ mov r6d, %1/2
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m0, [r0 + 32]
+ movu m1, [r1 + 32]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 32], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + r3 + 32]
+ movu m1, [r1 + r4 + 32]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5 + 32], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W32_H2_AVX2 8
+ADDAVG_W32_H2_AVX2 16
+ADDAVG_W32_H2_AVX2 24
+ADDAVG_W32_H2_AVX2 32
+ADDAVG_W32_H2_AVX2 48
+ADDAVG_W32_H2_AVX2 64
+
+cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+
+ mov r6d, 32
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m0, [r0 + 32]
+ movu m1, [r1 + 32]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 32], m0
+
+ movu m0, [r0 + 64]
+ movu m1, [r1 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 64], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + r3 + 32]
+ movu m1, [r1 + r4 + 32]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5 + 32], m0
+
+ movu m0, [r0 + r3 + 64]
+ movu m1, [r1 + r4 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5 + 64], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+
+%macro ADDAVG_W64_H1_AVX2 1
+cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m2, m2
+ add r3d, r3d
+ add r4d, r4d
+ add r5d, r5d
+
+ mov r6d, %1/2
+
+.loop:
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m0, [r0 + 32]
+ movu m1, [r1 + 32]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 32], m0
+
+ movu m0, [r0 + 64]
+ movu m1, [r1 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 64], m0
+
+ movu m0, [r0 + 96]
+ movu m1, [r1 + 96]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 96], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + r3 + 32]
+ movu m1, [r1 + r4 + 32]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5 + 32], m0
+
+ movu m0, [r0 + r3 + 64]
+ movu m1, [r1 + r4 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5 + 64], m0
+
+ movu m0, [r0 + r3 + 96]
+ movu m1, [r1 + r4 + 96]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5 + 96], m0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W64_H1_AVX2 16
+ADDAVG_W64_H1_AVX2 32
+ADDAVG_W64_H1_AVX2 48
+ADDAVG_W64_H1_AVX2 64
;-----------------------------------------------------------------------------
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------