[x265] [PATCH 112 of 307] x86: Aligned routine implementation for addavg primitive
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:50 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1506512312 -19800
# Wed Sep 27 17:08:32 2017 +0530
# Node ID 762682acf5c25bdecbfec2d0f4f32da7dea3a9e2
# Parent b31fc8889e0f8a433be25fb6267552f7d03efeaf
x86: Aligned routine implementation for addavg primitive
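
For context: addAvg is the bi-directional prediction average - it sums two 16-bit
intermediate prediction blocks, applies a rounding offset, shifts back to pixel
precision and clips to the valid pixel range. The addAvg_aligned entry points added
by this patch share that prototype but are registered separately so that call sites
with suitably aligned buffers can use the wider AVX2/AVX-512 aligned paths. A
minimal, self-contained C++ sketch of the operation (8-bit case; the constant names
and the way 'shift'/'offset' are derived here are illustrative, not taken from the
codebase):

    #include <algorithm>
    #include <cstdint>

    // Simplified model of what the addAvg primitive computes for 8-bit pixels:
    // average two 16-bit prediction buffers, round, shift back to pixel
    // precision and clip to [0, 255]. x265 derives 'shift' and 'offset' from
    // its internal interpolation precision and the configured bit depth.
    static void addAvg_sketch(const int16_t* src0, const int16_t* src1, uint8_t* dst,
                              intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                              int width, int height, int shift, int offset)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + offset) >> shift;
                dst[x] = (uint8_t)std::min(255, std::max(0, v));
            }
            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }
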
diff -r b31fc8889e0f -r 762682acf5c2 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/pixel.cpp Wed Sep 27 17:08:32 2017 +0530
@@ -987,6 +987,7 @@
#define LUMA_PU(W, H) \
p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].addAvg_aligned = addAvg<W, H>; \
p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
@@ -1103,6 +1104,7 @@
#define CHROMA_PU_420(W, H) \
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg_aligned = addAvg<W, H>; \
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
CHROMA_PU_420(2, 2);
@@ -1180,6 +1182,7 @@
#define CHROMA_PU_422(W, H) \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg = addAvg<W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg_aligned = addAvg<W, H>; \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
CHROMA_PU_422(2, 4);
diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.cpp
--- a/source/common/primitives.cpp Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/primitives.cpp Wed Sep 27 17:08:32 2017 +0530
@@ -115,6 +115,7 @@
{
p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg;
+ p.chroma[X265_CSP_I444].pu[i].addAvg_aligned = p.pu[i].addAvg_aligned;
p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
p.chroma[X265_CSP_I444].pu[i].p2s = p.pu[i].convert_p2s;
}
diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.h
--- a/source/common/primitives.h Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/primitives.h Wed Sep 27 17:08:32 2017 +0530
@@ -245,6 +245,7 @@
pixelavg_pp_t pixelavg_pp; // quick bidir using pixels (borrowed from x264)
addAvg_t addAvg; // bidir motion compensation, uses 16bit values
+ addAvg_t addAvg_aligned;
copy_pp_t copy_pp;
filter_p2s_t convert_p2s;
@@ -386,6 +387,7 @@
filter_pp_t filter_hpp;
filter_hps_t filter_hps;
addAvg_t addAvg;
+ addAvg_t addAvg_aligned;
copy_pp_t copy_pp;
filter_p2s_t p2s;
filter_p2s_t p2s_aligned;
diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Sep 27 17:08:32 2017 +0530
@@ -2510,6 +2510,65 @@
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512);
+ p.pu[LUMA_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+ p.pu[LUMA_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+ p.pu[LUMA_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+ p.pu[LUMA_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+ p.pu[LUMA_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+ p.pu[LUMA_16x4].addAvg_aligned = PFX(addAvg_aligned_16x4_avx512);
+ p.pu[LUMA_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512);
+ p.pu[LUMA_16x12].addAvg_aligned = PFX(addAvg_aligned_16x12_avx512);
+ p.pu[LUMA_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512);
+ p.pu[LUMA_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512);
+ p.pu[LUMA_16x64].addAvg_aligned = PFX(addAvg_aligned_16x64_avx512);
+ p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_aligned_48x64_avx512);
+ p.pu[LUMA_24x32].addAvg_aligned = PFX(addAvg_24x32_avx2);
+ p.pu[LUMA_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+ p.pu[LUMA_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+ p.pu[LUMA_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+ p.pu[LUMA_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+ p.pu[LUMA_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+ p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_48x64_avx2);
+ p.pu[LUMA_64x16].addAvg_aligned = PFX(addAvg_aligned_64x16_avx512);
+ p.pu[LUMA_64x32].addAvg_aligned = PFX(addAvg_aligned_64x32_avx512);
+ p.pu[LUMA_64x48].addAvg_aligned = PFX(addAvg_aligned_64x48_avx512);
+ p.pu[LUMA_64x64].addAvg_aligned = PFX(addAvg_aligned_64x64_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg_aligned = PFX(addAvg_8x2_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg_aligned = PFX(addAvg_8x6_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg_aligned = PFX(addAvg_aligned_16x4_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg_aligned = PFX(addAvg_aligned_16x12_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg_aligned = PFX(addAvg_8x12_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg_aligned = PFX(addAvg_8x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg_aligned = PFX(addAvg_24x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg_aligned = PFX(addAvg_12x32_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg_aligned = PFX(addAvg_aligned_16x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg_aligned = PFX(addAvg_aligned_16x24_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg_aligned = PFX(addAvg_aligned_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
@@ -4176,6 +4235,64 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx512);
+ p.pu[LUMA_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+ p.pu[LUMA_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+ p.pu[LUMA_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+ p.pu[LUMA_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+ p.pu[LUMA_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+ p.pu[LUMA_16x4].addAvg_aligned = PFX(addAvg_16x4_avx2);
+ p.pu[LUMA_16x8].addAvg_aligned = PFX(addAvg_16x8_avx2);
+ p.pu[LUMA_16x12].addAvg_aligned = PFX(addAvg_16x12_avx2);
+ p.pu[LUMA_16x16].addAvg_aligned = PFX(addAvg_16x16_avx2);
+ p.pu[LUMA_16x32].addAvg_aligned = PFX(addAvg_16x32_avx2);
+ p.pu[LUMA_16x64].addAvg_aligned = PFX(addAvg_16x64_avx2);
+ p.pu[LUMA_24x32].addAvg_aligned = PFX(addAvg_24x32_avx2);
+ p.pu[LUMA_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+ p.pu[LUMA_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+ p.pu[LUMA_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+ p.pu[LUMA_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+ p.pu[LUMA_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+ p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_48x64_avx2);
+ p.pu[LUMA_64x16].addAvg_aligned = PFX(addAvg_aligned_64x16_avx512);
+ p.pu[LUMA_64x32].addAvg_aligned = PFX(addAvg_aligned_64x32_avx512);
+ p.pu[LUMA_64x48].addAvg_aligned = PFX(addAvg_aligned_64x48_avx512);
+ p.pu[LUMA_64x64].addAvg_aligned = PFX(addAvg_aligned_64x64_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg_aligned = PFX(addAvg_8x2_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg_aligned = PFX(addAvg_8x6_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg_aligned = PFX(addAvg_16x4_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg_aligned = PFX(addAvg_16x8_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg_aligned = PFX(addAvg_16x12_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg_aligned = PFX(addAvg_16x16_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg_aligned = PFX(addAvg_16x32_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg_aligned = PFX(addAvg_8x12_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg_aligned = PFX(addAvg_8x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg_aligned = PFX(addAvg_12x32_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg_aligned = PFX(addAvg_16x8_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg_aligned = PFX(addAvg_16x16_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg_aligned = PFX(addAvg_16x24_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg_aligned = PFX(addAvg_16x32_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg_aligned = PFX(addAvg_16x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg_aligned = PFX(addAvg_24x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg_aligned = PFX(addAvg_aligned_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
+
p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/x86/mc-a.asm Wed Sep 27 17:08:32 2017 +0530
@@ -2002,6 +2002,352 @@
%endrep
PROCESS_ADDAVG_48x4_HBD_AVX512
RET
+
+%macro PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512 0
+ movu ym0, [r0]
+ vinserti32x8 m0, [r0 + r3], 1
+ movu ym1, [r1]
+ vinserti32x8 m1, [r1 + r4], 1
+
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+
+ movu [r2], ym0
+ vextracti32x8 [r2 + r5], m0, 1
+
+ movu ym0, [r0 + 2 * r3]
+ vinserti32x8 m0, [r0 + r6], 1
+ movu ym1, [r1 + 2 * r4]
+ vinserti32x8 m1, [r1 + r7], 1
+
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+
+ movu [r2 + 2 * r5], ym0
+ vextracti32x8 [r2 + r8], m0, 1
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512 0
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + 2 * r3]
+ movu m1, [r1 + 2 * r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 2 * r5], m0
+
+ movu m0, [r0 + r6]
+ movu m1, [r1 + r7]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r8], m0
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512 0
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu m0, [r0 + mmsize]
+ movu m1, [r1 + mmsize]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + mmsize], m0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + r3 + mmsize]
+ movu m1, [r1 + r4 + mmsize]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5 + mmsize], m0
+
+ movu m0, [r0 + 2 * r3]
+ movu m1, [r1 + 2 * r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 2 * r5], m0
+
+ movu m0, [r0 + 2 * r3 + mmsize]
+ movu m1, [r1 + 2 * r4 + mmsize]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 2 * r5 + mmsize], m0
+
+ movu m0, [r0 + r6]
+ movu m1, [r1 + r7]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r8], m0
+
+ movu m0, [r0 + r6 + mmsize]
+ movu m1, [r1 + r7 + mmsize]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r8 + mmsize], m0
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512 0
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2], m0
+
+ movu ym0, [r0 + mmsize]
+ movu ym1, [r1 + mmsize]
+ paddw ym0, ym1
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + mmsize], ym0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu ym0, [r0 + r3 + mmsize]
+ movu ym1, [r1 + r4 + mmsize]
+ paddw ym0, ym1
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + r5 + mmsize], ym0
+
+ movu m0, [r0 + 2 * r3]
+ movu m1, [r1 + 2 * r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 2 * r5], m0
+
+ movu ym0, [r0 + 2 * r3 + mmsize]
+ movu ym1, [r1 + 2 * r4 + mmsize]
+ paddw ym0, ym1
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + 2 * r5 + mmsize], ym0
+
+ movu m0, [r0 + r6]
+ movu m1, [r1 + r7]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r8], m0
+
+ movu ym0, [r0 + r6 + mmsize]
+ movu ym1, [r1 + r7 + mmsize]
+ paddw ym0, ym1
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + r8 + mmsize], ym0
+%endmacro
+;-----------------------------------------------------------------------------
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal addAvg_aligned_16x4, 6,9,6
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ lea r6, [3 * r3]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+ PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
+ RET
+
+%macro ADDAVG_ALIGNED_W16_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_16x%1, 6,9,6
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ lea r6, [3 * r3]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+
+%rep %1/4 - 1
+ PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
+ lea r2, [r2 + 4 * r5]
+ lea r0, [r0 + 4 * r3]
+ lea r1, [r1 + 4 * r4]
+%endrep
+ PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
+ RET
+%endmacro
+
+ADDAVG_ALIGNED_W16_HBD_AVX512 8
+ADDAVG_ALIGNED_W16_HBD_AVX512 12
+ADDAVG_ALIGNED_W16_HBD_AVX512 16
+ADDAVG_ALIGNED_W16_HBD_AVX512 24
+ADDAVG_ALIGNED_W16_HBD_AVX512 32
+ADDAVG_ALIGNED_W16_HBD_AVX512 64
+
+%macro ADDAVG_ALIGNED_W32_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_32x%1, 6,9,6
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ lea r6, [3 * r3]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+
+%rep %1/4 - 1
+ PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
+ lea r2, [r2 + 4 * r5]
+ lea r0, [r0 + 4 * r3]
+ lea r1, [r1 + 4 * r4]
+%endrep
+ PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
+ RET
+%endmacro
+
+ADDAVG_ALIGNED_W32_HBD_AVX512 8
+ADDAVG_ALIGNED_W32_HBD_AVX512 16
+ADDAVG_ALIGNED_W32_HBD_AVX512 24
+ADDAVG_ALIGNED_W32_HBD_AVX512 32
+ADDAVG_ALIGNED_W32_HBD_AVX512 48
+ADDAVG_ALIGNED_W32_HBD_AVX512 64
+
+%macro ADDAVG_ALIGNED_W64_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_64x%1, 6,9,6
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ lea r6, [3 * r3]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+
+%rep %1/4 - 1
+ PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
+ lea r2, [r2 + 4 * r5]
+ lea r0, [r0 + 4 * r3]
+ lea r1, [r1 + 4 * r4]
+%endrep
+ PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
+ RET
+%endmacro
+
+ADDAVG_ALIGNED_W64_HBD_AVX512 16
+ADDAVG_ALIGNED_W64_HBD_AVX512 32
+ADDAVG_ALIGNED_W64_HBD_AVX512 48
+ADDAVG_ALIGNED_W64_HBD_AVX512 64
+
+INIT_ZMM avx512
+cglobal addAvg_aligned_48x64, 6,9,6
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ lea r6, [3 * r3]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+
+%rep 15
+ PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
+ lea r2, [r2 + 4 * r5]
+ lea r0, [r0 + 4 * r3]
+ lea r1, [r1 + 4 * r4]
+%endrep
+ PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
+ RET
;-----------------------------------------------------------------------------
;addAvg avx512 high bit depth code end
;-----------------------------------------------------------------------------
@@ -3424,6 +3770,112 @@
ADDAVG_W32_AVX512 32
ADDAVG_W32_AVX512 48
ADDAVG_W32_AVX512 64
+
+%macro PROCESS_ADDAVG_ALIGNED_64x2_AVX512 0
+ mova m0, [r0]
+ mova m1, [r1]
+ mova m2, [r0 + mmsize]
+ mova m3, [r1 + mmsize]
+
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m0, m2
+ vpermq m0, m6, m0
+ mova [r2], m0
+
+ mova m0, [r0 + r3]
+ mova m1, [r1 + r4]
+ mova m2, [r0 + r3 + mmsize]
+ mova m3, [r1 + r4 + mmsize]
+
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m0, m2
+ vpermq m0, m6, m0
+ mova [r2 + r5], m0
+%endmacro
+
+%macro PROCESS_ADDAVG_ALIGNED_32x2_AVX512 0
+ mova m0, [r0]
+ mova m1, [r1]
+ mova m2, [r0 + r3]
+ mova m3, [r1 + r4]
+
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+ paddw m2, m3
+ pmulhrsw m2, m4
+ paddw m2, m5
+
+ packuswb m0, m2
+ vpermq m0, m6, m0
+ mova [r2], ym0
+ vextracti32x8 [r2 + r5], m0, 1
+%endmacro
+;--------------------------------------------------------------------------------------------------------------------
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;--------------------------------------------------------------------------------------------------------------------
+%macro ADDAVG_ALIGNED_W64_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_64x%1, 6,6,7
+ vbroadcasti32x8 m4, [pw_256]
+ vbroadcasti32x8 m5, [pw_128]
+ mova m6, [shuf_avx512]
+
+ add r3, r3
+ add r4, r4
+
+%rep %1/2 - 1
+ PROCESS_ADDAVG_ALIGNED_64x2_AVX512
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ PROCESS_ADDAVG_ALIGNED_64x2_AVX512
+ RET
+%endmacro
+
+ADDAVG_ALIGNED_W64_AVX512 16
+ADDAVG_ALIGNED_W64_AVX512 32
+ADDAVG_ALIGNED_W64_AVX512 48
+ADDAVG_ALIGNED_W64_AVX512 64
+
+%macro ADDAVG_ALIGNED_W32_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_aligned_32x%1, 6,6,7
+ vbroadcasti32x8 m4, [pw_256]
+ vbroadcasti32x8 m5, [pw_128]
+ mova m6, [shuf_avx512]
+ add r3, r3
+ add r4, r4
+
+%rep %1/2 - 1
+ PROCESS_ADDAVG_ALIGNED_32x2_AVX512
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+%endrep
+ PROCESS_ADDAVG_ALIGNED_32x2_AVX512
+ RET
+%endmacro
+
+ADDAVG_ALIGNED_W32_AVX512 8
+ADDAVG_ALIGNED_W32_AVX512 16
+ADDAVG_ALIGNED_W32_AVX512 24
+ADDAVG_ALIGNED_W32_AVX512 32
+ADDAVG_ALIGNED_W32_AVX512 48
+ADDAVG_ALIGNED_W32_AVX512 64
;-----------------------------------------------------------------------------
; addAvg avx512 code end
;-----------------------------------------------------------------------------
diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Sep 25 13:11:24 2017 +0530
+++ b/source/common/x86/pixel.h Wed Sep 27 17:08:32 2017 +0530
@@ -50,6 +50,7 @@
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+ FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
diff -r b31fc8889e0f -r 762682acf5c2 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Sep 25 13:11:24 2017 +0530
+++ b/source/test/pixelharness.cpp Wed Sep 27 17:08:32 2017 +0530
@@ -873,8 +873,8 @@
bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
{
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
int j = 0;
@@ -898,6 +898,32 @@
return true;
}
+bool PixelHarness::check_addAvg_aligned(addAvg_t ref, addAvg_t opt)
+{
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
+
+ int j = 0;
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+ intptr_t stride = STRIDE;
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int index1 = rand() % TEST_CASES;
+ int index2 = rand() % TEST_CASES;
+ ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, stride, stride, stride);
+ checked(opt, short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, stride, stride, stride);
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ reportfail();
+ j += INCR * 2;
+ }
+
+ return true;
+}
bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
{
ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
@@ -2140,6 +2166,15 @@
}
}
+ if (opt.pu[part].addAvg_aligned)
+ {
+ if (!check_addAvg_aligned(ref.pu[part].addAvg_aligned, opt.pu[part].addAvg_aligned))
+ {
+ printf("addAvg_aligned[%s] failed\n", lumaPartStr[part]);
+ return false;
+ }
+ }
+
if (part < NUM_CU_SIZES)
{
if (opt.cu[part].sse_pp)
@@ -2224,6 +2259,14 @@
return false;
}
}
+ if (opt.chroma[i].pu[part].addAvg_aligned)
+ {
+ if (!check_addAvg_aligned(ref.chroma[i].pu[part].addAvg_aligned, opt.chroma[i].pu[part].addAvg_aligned))
+ {
+ printf("chroma_addAvg_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ return false;
+ }
+ }
if (opt.chroma[i].pu[part].satd)
{
if (!check_pixelcmp(ref.chroma[i].pu[part].satd, opt.chroma[i].pu[part].satd))
@@ -2869,6 +2912,11 @@
HEADER("addAvg[%s]", lumaPartStr[part]);
REPORT_SPEEDUP(opt.pu[part].addAvg, ref.pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
+ if (opt.pu[part].addAvg_aligned)
+ {
+ HEADER("addAvg_aligned[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.pu[part].addAvg_aligned, ref.pu[part].addAvg_aligned, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
+ }
if (part < NUM_CU_SIZES)
{
@@ -2922,6 +2970,11 @@
HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg, ref.chroma[i].pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
+ if (opt.chroma[i].pu[part].addAvg_aligned)
+ {
+ HEADER("[%s] addAvg_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg_aligned, ref.chroma[i].pu[part].addAvg_aligned, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
+ }
if (opt.chroma[i].pu[part].satd)
{
HEADER("[%s] satd[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
diff -r b31fc8889e0f -r 762682acf5c2 source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Sep 25 13:11:24 2017 +0530
+++ b/source/test/pixelharness.h Wed Sep 27 17:08:32 2017 +0530
@@ -44,30 +44,30 @@
enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
- ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
- pixel pbuf2[BUFFSIZE];
- pixel pbuf3[BUFFSIZE];
- pixel pbuf4[BUFFSIZE];
- int ibuf1[BUFFSIZE];
- int8_t psbuf1[BUFFSIZE];
- int8_t psbuf2[BUFFSIZE];
- int8_t psbuf3[BUFFSIZE];
- int8_t psbuf4[BUFFSIZE];
- int8_t psbuf5[BUFFSIZE];
+ ALIGN_VAR_64(pixel, pbuf1[BUFFSIZE]);
+ ALIGN_VAR_64(pixel, pbuf2[BUFFSIZE]);
+ ALIGN_VAR_64(pixel, pbuf3[BUFFSIZE]);
+ ALIGN_VAR_64(pixel, pbuf4[BUFFSIZE]);
+ ALIGN_VAR_64(int, ibuf1[BUFFSIZE]);
+ ALIGN_VAR_64(int8_t, psbuf1[BUFFSIZE]);
+ ALIGN_VAR_64(int8_t, psbuf2[BUFFSIZE]);
+ ALIGN_VAR_64(int8_t, psbuf3[BUFFSIZE]);
+ ALIGN_VAR_64(int8_t, psbuf4[BUFFSIZE]);
+ ALIGN_VAR_64(int8_t, psbuf5[BUFFSIZE]);
- int16_t sbuf1[BUFFSIZE];
- int16_t sbuf2[BUFFSIZE];
- int16_t sbuf3[BUFFSIZE];
+ ALIGN_VAR_64(int16_t, sbuf1[BUFFSIZE]);
+ ALIGN_VAR_64(int16_t, sbuf2[BUFFSIZE]);
+ ALIGN_VAR_64(int16_t, sbuf3[BUFFSIZE]);
- pixel pixel_test_buff[TEST_CASES][BUFFSIZE];
- int16_t short_test_buff[TEST_CASES][BUFFSIZE];
- int16_t short_test_buff1[TEST_CASES][BUFFSIZE];
- int16_t short_test_buff2[TEST_CASES][BUFFSIZE];
- int int_test_buff[TEST_CASES][BUFFSIZE];
- uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
- uint8_t uchar_test_buff[TEST_CASES][BUFFSIZE];
- double double_test_buff[TEST_CASES][BUFFSIZE];
- int16_t residual_test_buff[TEST_CASES][BUFFSIZE];
+ ALIGN_VAR_64(pixel, pixel_test_buff[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(int16_t, short_test_buff1[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(int16_t, short_test_buff2[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(int, int_test_buff[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(uint16_t, ushort_test_buff[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(uint8_t, uchar_test_buff[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(double, double_test_buff[TEST_CASES][BUFFSIZE]);
+ ALIGN_VAR_64(int16_t, residual_test_buff[TEST_CASES][BUFFSIZE]);
bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
@@ -99,6 +99,7 @@
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
bool check_addAvg(addAvg_t, addAvg_t);
+ bool check_addAvg_aligned(addAvg_t, addAvg_t);
bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
bool check_saoCuOrgE2_t(saoCuOrgE2_t ref[], saoCuOrgE2_t opt[]);
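
A note on the pixelharness changes above: declaring the test buffers with
ALIGN_VAR_64 guarantees the 64-byte alignment the aligned AVX-512 kernels expect,
so check_addAvg_aligned can validate them against the C reference. Callers are
expected to route a block through addAvg_aligned only when its pointers and strides
meet that requirement; a hypothetical dispatch helper (not part of this patch,
8-bit pixel type assumed) might look like:

    #include <cstdint>

    typedef void (*addAvg_t)(const int16_t*, const int16_t*, uint8_t*,
                             intptr_t, intptr_t, intptr_t);

    // Returns true when the pointer and the stride (in bytes) are both
    // multiples of 64, i.e. every row starts on a 64-byte boundary.
    static inline bool rows64ByteAligned(const void* p, intptr_t strideBytes)
    {
        return ((uintptr_t)p % 64 == 0) && (strideBytes % 64 == 0);
    }

    // Illustrative only: pick the aligned kernel when all three buffers
    // qualify, otherwise fall back to the unaligned entry point.
    static void callAddAvg(addAvg_t unaligned, addAvg_t aligned,
                           const int16_t* s0, const int16_t* s1, uint8_t* d,
                           intptr_t s0Stride, intptr_t s1Stride, intptr_t dStride)
    {
        // Source strides are in int16_t elements, destination stride in pixels.
        bool ok = rows64ByteAligned(s0, s0Stride * (intptr_t)sizeof(int16_t))
               && rows64ByteAligned(s1, s1Stride * (intptr_t)sizeof(int16_t))
               && rows64ByteAligned(d, dStride * (intptr_t)sizeof(uint8_t));
        (ok ? aligned : unaligned)(s0, s1, d, s0Stride, s1Stride, dStride);
    }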