[x265] [PATCH] asm: addAvg high_bit_depth avx2 for chroma sizes with width >= 8, reusing code from luma
dnyaneshwar at multicorewareinc.com
Thu May 14 14:03:06 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431603734 -19800
# Thu May 14 17:12:14 2015 +0530
# Node ID c237e9a1fd58fb95d6cab19385fb971989907739
# Parent cd88d1705bcb952e2dd2e0f753a510a86909560f
asm: addAvg high_bit_depth avx2 for chroma sizes with width >= 8, reusing code from luma
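
For context, the scalar operation these kernels vectorize is sketched
below. This is a minimal reference, not x265 source, assuming a 10-bit
HIGH_BIT_DEPTH build (IF_INTERNAL_PREC = 14, IF_INTERNAL_OFFS = 8192);
the addAvg_ref name and the flattened width/height parameters are
illustrative only:

    #include <stdint.h>
    #include <algorithm>

    typedef uint16_t pixel; // HIGH_BIT_DEPTH builds store pixels as 16 bits

    // shift  = IF_INTERNAL_PREC + 1 - bitDepth = 14 + 1 - 10 = 5
    // offset = (1 << (shift - 1)) + 2 * IF_INTERNAL_OFFS = 16 + 16384 = 16400
    static void addAvg_ref(const int16_t* src0, const int16_t* src1, pixel* dst,
                           intptr_t src0Stride, intptr_t src1Stride,
                           intptr_t dstStride, int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + 16400) >> 5;
                dst[x] = (pixel)std::min(std::max(v, 0), 1023); // clamp to 10 bits
            }
            src0 += src0Stride; // strides are in pixel units; the asm doubles
            src1 += src1Stride; // them once to index 16-bit data in bytes
            dst  += dstStride;
        }
    }
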
diff -r cd88d1705bcb -r c237e9a1fd58 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu May 14 15:39:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu May 14 17:12:14 2015 +0530
@@ -1204,6 +1204,23 @@
p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2;
p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = x265_addAvg_8x2_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = x265_addAvg_8x4_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = x265_addAvg_8x6_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = x265_addAvg_8x8_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = x265_addAvg_8x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = x265_addAvg_8x32_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = x265_addAvg_12x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = x265_addAvg_16x4_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = x265_addAvg_16x8_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = x265_addAvg_16x12_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = x265_addAvg_16x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = x265_addAvg_16x32_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = x265_addAvg_32x8_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = x265_addAvg_32x16_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = x265_addAvg_32x24_avx2;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = x265_addAvg_32x32_avx2;
+
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
diff -r cd88d1705bcb -r c237e9a1fd58 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Thu May 14 15:39:53 2015 +0530
+++ b/source/common/x86/mc-a.asm Thu May 14 17:12:14 2015 +0530
@@ -1022,6 +1022,88 @@
; avx2 asm for addAvg high_bit_depth
;------------------------------------------------------------------------------
INIT_YMM avx2
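+; pmulhrsw against pw_1024 computes (src0 + src1 + 16) >> 5 with rounding,
+; paddw with pw_512 adds the remaining offset 16384 >> 5 = 512, and
+; pmaxsw/pminsw clamp the result to the 10-bit range [0, 1023]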
+cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ movu xm0, [r0]
+ vinserti128 m0, m0, [r0 + r3 * 2], 1
+ movu xm1, [r1]
+ vinserti128 m1, m1, [r1 + r4 * 2], 1
+
+ paddw m0, m1
+ pxor m1, m1
+ pmulhrsw m0, [pw_1024]
+ paddw m0, [pw_512]
+ pmaxsw m0, m1
+ pminsw m0, [pw_1023]
+ vextracti128 xm1, m0, 1
+ movu [r2], xm0
+ movu [r2 + r5 * 2], xm1
+ RET
+
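+; 8x6: the strides are doubled once up front (pixel to byte units), then
+; three unrolled two-row passes reuse the zero in m1 and constants in m3-m5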
+cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
+ mova m5, [pw_1023]
+ mova m3, [pw_1024]
+ pxor m1, m1
+ add r3d, r3d
+ add r4d, r4d
+ add r5d, r5d
+
+ movu xm0, [r0]
+ vinserti128 m0, m0, [r0 + r3], 1
+ movu xm2, [r1]
+ vinserti128 m2, m2, [r1 + r4], 1
+
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ vextracti128 xm2, m0, 1
+ movu [r2], xm0
+ movu [r2 + r5], xm2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu xm0, [r0]
+ vinserti128 m0, m0, [r0 + r3], 1
+ movu xm2, [r1]
+ vinserti128 m2, m2, [r1 + r4], 1
+
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ vextracti128 xm2, m0, 1
+ movu [r2], xm0
+ movu [r2 + r5], xm2
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ movu xm0, [r0]
+ vinserti128 m0, m0, [r0 + r3], 1
+ movu xm2, [r1]
+ vinserti128 m2, m2, [r1 + r4], 1
+
+ paddw m0, m2
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m1
+ pminsw m0, m5
+ vextracti128 xm2, m0, 1
+ movu [r2], xm0
+ movu [r2 + r5], xm2
+ RET
+
%macro ADDAVG_W8_H4_AVX2 1
cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
mova m4, [pw_512]