[x265] [PATCH 270 of 307] x86: Aligned routine implementation for pixel_avg_aligned primitive for high bit depth
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:28 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1513164083 -19800
# Wed Dec 13 16:51:23 2017 +0530
# Node ID 265fd2e1e49587837ebed4e7efcc38a0f6e79346
# Parent a9be28cde01fd379dff1aec4bfcf809c7c96f9d2
x86: Aligned routine implementation for pixel_avg_aligned primitive for high bit depth
1. Aligned code implementation
- pixel_avg_aligned_64xN
- pixel_avg_aligned_32xN
- pixel_avg_aligned_48xN
2. Testbench implementation
3. Linking with encoder (see the illustrative dispatch sketch below)
4. cpy1Dto2D_shl cleanup
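For orientation before the diff: on the encoder side, pixelavg_pp becomes a two-entry function-pointer table indexed by an alignment predicate, and the call sites in lowres.h, analysis.cpp and search.cpp pick an entry with a boolean stride check. The sketch below is illustrative only and is not part of the patch; PU and selectAndAverage are made-up names, NONALIGNED == 0 / ALIGNED == 1 is inferred from the boolean indexing (which only works with those values), and frame buffers are assumed to be 64-byte aligned at allocation, which is why only the strides are tested.

    // Illustrative sketch, not x265 source: how a call site selects the
    // aligned or the non-aligned pixel_avg kernel.
    #include <cstdint>

    typedef uint16_t pixel; // high bit depth build
    typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride,
                                  const pixel* src0, intptr_t sstride0,
                                  const pixel* src1, intptr_t sstride1, int weight);

    enum { NONALIGNED = 0, ALIGNED = 1, NUM_ALIGNMENT_TYPES = 2 }; // assumed values

    struct PU { pixelavg_pp_t pixelavg_pp[NUM_ALIGNMENT_TYPES]; }; // stand-in for the pu table

    void selectAndAverage(PU& pu, pixel* dst, intptr_t dstStride,
                          const pixel* ref0, const pixel* ref1, intptr_t refStride)
    {
        // false (0) -> NONALIGNED kernel, true (1) -> ALIGNED kernel.
        int idx = (dstStride % 64 == 0) && (refStride % 64 == 0);
        pu.pixelavg_pp[idx](dst, dstStride, ref0, refStride, ref1, refStride, 32);
    }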
diff -r a9be28cde01f -r 265fd2e1e495 source/common/lowres.h
--- a/source/common/lowres.h Tue Dec 12 15:43:30 2017 +0530
+++ b/source/common/lowres.h Wed Dec 13 16:51:23 2017 +0530
@@ -69,7 +69,7 @@
int qmvy = qmv.y + (qmv.y & 1);
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
- primitives.pu[LUMA_8x8].pixelavg_pp(buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
+ primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
return buf;
}
else
@@ -91,7 +91,7 @@
int qmvy = qmv.y + (qmv.y & 1);
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
- primitives.pu[LUMA_8x8].pixelavg_pp(subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
return comp(fenc, FENC_STRIDE, subpelbuf, 8);
}
else
diff -r a9be28cde01f -r 265fd2e1e495 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Dec 12 15:43:30 2017 +0530
+++ b/source/common/pixel.cpp Wed Dec 13 16:51:23 2017 +0530
@@ -991,8 +991,8 @@
p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
- p.pu[LUMA_ ## W ## x ## H].pixelavg_pp = pixelavg_pp<W, H>;
-
+ p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp<W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp<W, H>;
#define LUMA_CU(W, H) \
p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
diff -r a9be28cde01f -r 265fd2e1e495 source/common/primitives.h
--- a/source/common/primitives.h Tue Dec 12 15:43:30 2017 +0530
+++ b/source/common/primitives.h Wed Dec 13 16:51:23 2017 +0530
@@ -249,10 +249,8 @@
filter_sp_t luma_vsp;
filter_ss_t luma_vss;
filter_hv_pp_t luma_hvpp; // combines hps + vsp
-
- pixelavg_pp_t pixelavg_pp; // quick bidir using pixels (borrowed from x264)
+ pixelavg_pp_t pixelavg_pp[NUM_ALIGNMENT_TYPES]; // quick bidir using pixels (borrowed from x264)
addAvg_t addAvg[NUM_ALIGNMENT_TYPES]; // bidir motion compensation, uses 16bit values
-
copy_pp_t copy_pp;
filter_p2s_t convert_p2s[NUM_ALIGNMENT_TYPES];
}
diff -r a9be28cde01f -r 265fd2e1e495 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 12 15:43:30 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 13 16:51:23 2017 +0530
@@ -404,36 +404,58 @@
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_ ## cpu); \
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_ ## cpu); \
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_ ## cpu)
-
#define PIXEL_AVG(cpu) \
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_ ## cpu); \
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_ ## cpu); \
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_ ## cpu); \
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_ ## cpu); \
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_ ## cpu); \
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_ ## cpu); \
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_ ## cpu); \
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_ ## cpu); \
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_ ## cpu); \
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_ ## cpu); \
- p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_ ## cpu); \
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_ ## cpu); \
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_ ## cpu); \
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_ ## cpu); \
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_ ## cpu); \
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_ ## cpu); \
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_ ## cpu); \
- p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_ ## cpu); \
- p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_ ## cpu); \
- p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_ ## cpu); \
- p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_ ## cpu); \
- p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_ ## cpu);
-
+ p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
+ p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
+ p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
+ p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
+ p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
+ p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
+ p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
+ p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
+ p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
+ p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x8_ ## cpu); \
+ p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
+ p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
+ p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
+ p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
+ p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
+ p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x8_ ## cpu); \
+ p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x4_ ## cpu); \
+ p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
+ p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x32_ ## cpu); \
+ p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x16_ ## cpu); \
+ p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x8_ ## cpu); \
+ p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x4_ ## cpu); \
+ p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
+ p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
+ p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
+ p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
+ p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
+ p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
+ p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
+ p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
+ p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
+ p.pu[LUMA_32x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x8_ ## cpu); \
+ p.pu[LUMA_24x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
+ p.pu[LUMA_16x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
+ p.pu[LUMA_16x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
+ p.pu[LUMA_16x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
+ p.pu[LUMA_16x12].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
+ p.pu[LUMA_16x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x8_ ## cpu); \
+ p.pu[LUMA_16x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x4_ ## cpu); \
+ p.pu[LUMA_12x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
+ p.pu[LUMA_8x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x32_ ## cpu); \
+ p.pu[LUMA_8x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x16_ ## cpu); \
+ p.pu[LUMA_8x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x8_ ## cpu); \
+ p.pu[LUMA_8x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x4_ ## cpu);
#define PIXEL_AVG_W4(cpu) \
- p.pu[LUMA_4x4].pixelavg_pp = PFX(pixel_avg_4x4_ ## cpu); \
- p.pu[LUMA_4x8].pixelavg_pp = PFX(pixel_avg_4x8_ ## cpu); \
- p.pu[LUMA_4x16].pixelavg_pp = PFX(pixel_avg_4x16_ ## cpu);
-
+ p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x4_ ## cpu); \
+ p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x8_ ## cpu); \
+ p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x16_ ## cpu); \
+ p.pu[LUMA_4x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x4_ ## cpu); \
+ p.pu[LUMA_4x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x8_ ## cpu); \
+ p.pu[LUMA_4x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x16_ ## cpu);
#define CHROMA_420_FILTERS(cpu) \
ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
@@ -957,30 +979,29 @@
p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
-
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_sse2);
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_sse2);
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_sse2);
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_sse2);
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_sse2);
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_sse2);
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_sse2);
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_sse2);
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_sse2);
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_sse2);
- p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_sse2);
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_sse2);
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_sse2);
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_sse2);
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_sse2);
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_sse2);
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_sse2);
- p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_sse2);
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_sse2);
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_sse2);
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_sse2);
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_sse2);
+ ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_sse2);
+ ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_sse2);
+ ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_sse2);
+ ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_sse2);
+ ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_sse2);
+ ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_sse2);
+ ASSIGN2(p.pu[LUMA_24x32].pixelavg_pp, pixel_avg_24x32_sse2);
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_sse2);
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_sse2);
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_sse2);
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_sse2);
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_sse2);
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_sse2);
+ ASSIGN2(p.pu[LUMA_12x16].pixelavg_pp, pixel_avg_12x16_sse2);
#if X86_64
- p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_sse2);
- p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_sse2);
- p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_sse2);
- p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_sse2);
+ ASSIGN2(p.pu[LUMA_8x32].pixelavg_pp, pixel_avg_8x32_sse2);
+ ASSIGN2(p.pu[LUMA_8x16].pixelavg_pp, pixel_avg_8x16_sse2);
+ ASSIGN2(p.pu[LUMA_8x8].pixelavg_pp, pixel_avg_8x8_sse2);
+ ASSIGN2(p.pu[LUMA_8x4].pixelavg_pp, pixel_avg_8x4_sse2);
#endif
PIXEL_AVG_W4(mmx2);
LUMA_VAR(sse2);
@@ -989,6 +1010,7 @@
ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
@@ -1535,26 +1557,24 @@
p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx2);
p.cu[BLOCK_32x32].intra_pred[33] = PFX(intra_pred_ang32_33_avx2);
p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx2);
-
- p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
- p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_avx2);
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
-
+ ASSIGN2(p.pu[LUMA_12x16].pixelavg_pp, pixel_avg_12x16_avx2);
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx2);
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx2);
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx2);
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx2);
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx2);
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx2);
+ ASSIGN2(p.pu[LUMA_24x32].pixelavg_pp, pixel_avg_24x32_avx2);
+ ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_avx2);
+ ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_avx2);
+ ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_avx2);
+ ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_avx2);
+ ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_avx2);
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx2);
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx2);
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx2);
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx2);
+ ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_avx2);
ASSIGN2(p.pu[LUMA_8x4].addAvg, addAvg_8x4_avx2);
ASSIGN2(p.pu[LUMA_8x8].addAvg, addAvg_8x8_avx2);
ASSIGN2(p.pu[LUMA_8x16].addAvg, addAvg_8x16_avx2);
@@ -1693,6 +1713,7 @@
ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
ASSIGN2(p.cu[BLOCK_32x32].blockfill_s, blockfill_s_32x32_avx2);
ALL_LUMA_TU(count_nonzero, count_nonzero, avx2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, avx2);
ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
@@ -2481,18 +2502,27 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_avx512);
-
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx512);
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx512);
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx512);
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx512);
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx512);
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512);
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512);
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512);
-
+ p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x8_avx512);
+ p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x16_avx512);
+ p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x24_avx512);
+ p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x32_avx512);
+ p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x64_avx512);
+ p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x16_avx512);
+ p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x32_avx512);
+ p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x48_avx512);
+ p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x64_avx512);
+ p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_48x64_avx512);
+
+ p.pu[LUMA_32x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x8_avx512);
+ p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x16_avx512);
+ p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x24_avx512);
+ p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x32_avx512);
+ p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x64_avx512);
+ p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_48x64_avx512);
+ p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x16_avx512);
+ p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x32_avx512);
+ p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x48_avx512);
+ p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x64_avx512);
p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx512);
p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_avx512);
p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_avx512);
@@ -3202,6 +3232,7 @@
ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
@@ -3716,25 +3747,23 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_avx2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx2);
-
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
-
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
-
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx2);
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx2);
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx2);
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx2);
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx2);
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx2);
+
+ ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_avx2);
+ ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_avx2);
+ ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_avx2);
+ ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_avx2);
+ ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_avx2);
+ ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_avx2);
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx2);
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx2);
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx2);
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx2);
p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx2);
p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx2);
p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx2);
@@ -3800,6 +3829,7 @@
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, avx2);
ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2);
@@ -4682,17 +4712,16 @@
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx512);
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512);
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512);
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx512);
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx512);
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx512);
- p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx512);
- p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx512);
- p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512);
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx512);
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx512);
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx512);
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx512);
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx512);
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx512);
+ ASSIGN2(p.pu[LUMA_8x32].pixelavg_pp, pixel_avg_8x32_avx512);
+ ASSIGN2(p.pu[LUMA_8x16].pixelavg_pp, pixel_avg_8x16_avx512);
+ ASSIGN2(p.pu[LUMA_8x8].pixelavg_pp, pixel_avg_8x8_avx512);
//p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512);
-
p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_avx512);
p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_avx512);
p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_avx512);
@@ -4954,12 +4983,10 @@
p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
-
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512);
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512);
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
-
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx512);
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx512);
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx512);
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx512);
//luma hps
p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_avx512);
p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx512);
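Note that everywhere this file collapses paired [NONALIGNED]/[ALIGNED] assignments into ASSIGN2(...), both slots receive the same unaligned-load routine; only the AVX-512 32xN, 48x64 and 64xN sizes get the dedicated pixel_avg_aligned_* kernels registered above. ASSIGN2 is the helper already used here for addAvg and blockfill_s; its body is not shown in this patch, but an expansion consistent with how it is used would be:

    // Assumed shape of ASSIGN2 (not shown in this patch; see asm-primitives.cpp
    // for the real definition). It registers one kernel in both alignment slots.
    #define ASSIGN2(func, fname) \
        (func)[NONALIGNED] = PFX(fname); \
        (func)[ALIGNED] = PFX(fname)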
diff -r a9be28cde01f -r 265fd2e1e495 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Tue Dec 12 15:43:30 2017 +0530
+++ b/source/common/x86/mc-a.asm Wed Dec 13 16:51:23 2017 +0530
@@ -6388,6 +6388,47 @@
movu [r0 + r1 * 2], m0
movu [r0 + r8], m2
%endmacro
+%macro PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512 0
+ mova m0, [r2]
+ mova m1, [r4]
+ mova m2, [r2 + r3]
+ mova m3, [r4 + r5]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0], m0
+ mova [r0 + r1], m2
+
+ mova m0, [r2 + r3 * 2]
+ mova m1, [r4 + r5 * 2]
+ mova m2, [r2 + r6]
+ mova m3, [r4 + r7]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r8], m2
+
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ mova m0, [r2]
+ mova m1, [r4]
+ mova m2, [r2 + r3]
+ mova m3, [r4 + r5]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0], m0
+ mova [r0 + r1], m2
+
+ mova m0, [r2 + r3 * 2]
+ mova m1, [r4 + r5 * 2]
+ mova m2, [r2 + r6]
+ mova m3, [r4 + r7]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r8], m2
+%endmacro
%macro PROCESS_PIXELAVG_64x8_HBD_AVX512 0
movu m0, [r2]
@@ -6466,6 +6507,83 @@
movu [r0 + r1 * 2 + mmsize], m0
movu [r0 + r8 + mmsize], m2
%endmacro
+%macro PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512 0
+ mova m0, [r2]
+ mova m1, [r4]
+ mova m2, [r2 + r3]
+ mova m3, [r4 + r5]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0], m0
+ mova [r0 + r1], m2
+
+ mova m0, [r2 + mmsize]
+ mova m1, [r4 + mmsize]
+ mova m2, [r2 + r3 + mmsize]
+ mova m3, [r4 + r5 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + mmsize], m0
+ mova [r0 + r1 + mmsize], m2
+
+ mova m0, [r2 + r3 * 2]
+ mova m1, [r4 + r5 * 2]
+ mova m2, [r2 + r6]
+ mova m3, [r4 + r7]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r8], m2
+
+ mova m0, [r2 + r3 * 2 + mmsize]
+ mova m1, [r4 + r5 * 2 + mmsize]
+ mova m2, [r2 + r6 + mmsize]
+ mova m3, [r4 + r7 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2 + mmsize], m0
+ mova [r0 + r8 + mmsize], m2
+
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ mova m0, [r2]
+ mova m1, [r4]
+ mova m2, [r2 + r3]
+ mova m3, [r4 + r5]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0], m0
+ mova [r0 + r1], m2
+
+ mova m0, [r2 + mmsize]
+ mova m1, [r4 + mmsize]
+ mova m2, [r2 + r3 + mmsize]
+ mova m3, [r4 + r5 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + mmsize], m0
+ mova [r0 + r1 + mmsize], m2
+
+ mova m0, [r2 + r3 * 2]
+ mova m1, [r4 + r5 * 2]
+ mova m2, [r2 + r6]
+ mova m3, [r4 + r7]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r8], m2
+
+ mova m0, [r2 + r3 * 2 + mmsize]
+ mova m1, [r4 + r5 * 2 + mmsize]
+ mova m2, [r2 + r6 + mmsize]
+ mova m3, [r4 + r7 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2 + mmsize], m0
+ mova [r0 + r8 + mmsize], m2
+%endmacro
%macro PROCESS_PIXELAVG_48x8_HBD_AVX512 0
movu m0, [r2]
@@ -6544,13 +6662,90 @@
movu [r0 + r1 * 2 + mmsize], ym0
movu [r0 + r8 + mmsize], ym2
%endmacro
+%macro PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512 0
+ mova m0, [r2]
+ mova m1, [r4]
+ mova m2, [r2 + r3]
+ mova m3, [r4 + r5]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0], m0
+ mova [r0 + r1], m2
+
+ mova ym0, [r2 + mmsize]
+ mova ym1, [r4 + mmsize]
+ mova ym2, [r2 + r3 + mmsize]
+ mova ym3, [r4 + r5 + mmsize]
+ pavgw ym0, ym1
+ pavgw ym2, ym3
+ mova [r0 + mmsize], ym0
+ mova [r0 + r1 + mmsize], ym2
+
+ mova m0, [r2 + r3 * 2]
+ mova m1, [r4 + r5 * 2]
+ mova m2, [r2 + r6]
+ mova m3, [r4 + r7]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r8], m2
+
+ mova ym0, [r2 + r3 * 2 + mmsize]
+ mova ym1, [r4 + r5 * 2 + mmsize]
+ mova ym2, [r2 + r6 + mmsize]
+ mova ym3, [r4 + r7 + mmsize]
+ pavgw ym0, ym1
+ pavgw ym2, ym3
+ mova [r0 + r1 * 2 + mmsize], ym0
+ mova [r0 + r8 + mmsize], ym2
+
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+
+ mova m0, [r2]
+ mova m1, [r4]
+ mova m2, [r2 + r3]
+ mova m3, [r4 + r5]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0], m0
+ mova [r0 + r1], m2
+
+ mova ym0, [r2 + mmsize]
+ mova ym1, [r4 + mmsize]
+ mova ym2, [r2 + r3 + mmsize]
+ mova ym3, [r4 + r5 + mmsize]
+ pavgw ym0, ym1
+ pavgw ym2, ym3
+ mova [r0 + mmsize], ym0
+ mova [r0 + r1 + mmsize], ym2
+
+ mova m0, [r2 + r3 * 2]
+ mova m1, [r4 + r5 * 2]
+ mova m2, [r2 + r6]
+ mova m3, [r4 + r7]
+ pavgw m0, m1
+ pavgw m2, m3
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r8], m2
+
+ mova ym0, [r2 + r3 * 2 + mmsize]
+ mova ym1, [r4 + r5 * 2 + mmsize]
+ mova ym2, [r2 + r6 + mmsize]
+ mova ym3, [r4 + r7 + mmsize]
+ pavgw ym0, ym1
+ pavgw ym2, ym3
+ mova [r0 + r1 * 2 + mmsize], ym0
+ mova [r0 + r8 + mmsize], ym2
+%endmacro
%macro PIXEL_AVG_HBD_W32 1
INIT_ZMM avx512
cglobal pixel_avg_32x%1, 6,9,4
- add r1d, r1d
- add r3d, r3d
- add r5d, r5d
+ shl r1d, 1
+ shl r3d, 1
+ shl r5d, 1
lea r6, [r3 * 3]
lea r7, [r5 * 3]
lea r8, [r1 * 3]
@@ -6572,13 +6767,40 @@
PIXEL_AVG_HBD_W32 32
PIXEL_AVG_HBD_W32 64
%endif
+%macro PIXEL_AVG_HBD_ALIGNED_W32 1
+INIT_ZMM avx512
+cglobal pixel_avg_aligned_32x%1, 6,9,4
+ shl r1d, 1
+ shl r3d, 1
+ shl r5d, 1
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+%rep %1/8 - 1
+ PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+%endrep
+ PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512
+ RET
+%endmacro
+
+%if ARCH_X86_64
+PIXEL_AVG_HBD_ALIGNED_W32 8
+PIXEL_AVG_HBD_ALIGNED_W32 16
+PIXEL_AVG_HBD_ALIGNED_W32 24
+PIXEL_AVG_HBD_ALIGNED_W32 32
+PIXEL_AVG_HBD_ALIGNED_W32 64
+%endif
%macro PIXEL_AVG_HBD_W64 1
INIT_ZMM avx512
cglobal pixel_avg_64x%1, 6,9,4
- add r1d, r1d
- add r3d, r3d
- add r5d, r5d
+ shl r1d, 1
+ shl r3d, 1
+ shl r5d, 1
lea r6, [r3 * 3]
lea r7, [r5 * 3]
lea r8, [r1 * 3]
@@ -6599,13 +6821,39 @@
PIXEL_AVG_HBD_W64 48
PIXEL_AVG_HBD_W64 64
%endif
+%macro PIXEL_AVG_HBD_ALIGNED_W64 1
+INIT_ZMM avx512
+cglobal pixel_avg_aligned_64x%1, 6,9,4
+ shl r1d, 1
+ shl r3d, 1
+ shl r5d, 1
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+%rep %1/8 - 1
+ PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+%endrep
+ PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512
+ RET
+%endmacro
+
+%if ARCH_X86_64
+PIXEL_AVG_HBD_ALIGNED_W64 16
+PIXEL_AVG_HBD_ALIGNED_W64 32
+PIXEL_AVG_HBD_ALIGNED_W64 48
+PIXEL_AVG_HBD_ALIGNED_W64 64
+%endif
%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_avg_48x64, 6,9,4
- add r1d, r1d
- add r3d, r3d
- add r5d, r5d
+ shl r1d, 1
+ shl r3d, 1
+ shl r5d, 1
lea r6, [r3 * 3]
lea r7, [r5 * 3]
lea r8, [r1 * 3]
@@ -6619,6 +6867,26 @@
PROCESS_PIXELAVG_48x8_HBD_AVX512
RET
%endif
+
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_avg_aligned_48x64, 6,9,4
+ shl r1d, 1
+ shl r3d, 1
+ shl r5d, 1
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+%rep 7
+ PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+%endrep
+ PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512
+ RET
+%endif
;-----------------------------------------------------------------------------
;pixel_avg_pp avx512 high bit depth code end
;-----------------------------------------------------------------------------
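The PROCESS_PIXELAVG_ALIGNED_* macros mirror the existing unaligned macros with movu replaced by mova, so every ZMM load and store asserts 64-byte alignment; that is the contract behind the (stride % 64 == 0) checks at the call sites. Each macro invocation averages eight rows with pavgw, and the %rep loops in the cglobal bodies advance the destination and both source pointers between invocations. As a rough scalar reference (not x265 source; the trailing weight argument, always 32 at these call sites, is not modeled), one call computes:

    // Scalar sketch of the rounded 16-bit average that pavgw performs per lane.
    // Strides are in pixels, as in the assembly above.
    #include <cstdint>

    static void pixel_avg_hbd_c(uint16_t* dst, intptr_t dstStride,
                                const uint16_t* src0, intptr_t srcStride0,
                                const uint16_t* src1, intptr_t srcStride1,
                                int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint16_t)((src0[x] + src1[x] + 1) >> 1); // pavgw rounding
            dst += dstStride;
            src0 += srcStride0;
            src1 += srcStride1;
        }
    }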
diff -r a9be28cde01f -r 265fd2e1e495 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Tue Dec 12 15:43:30 2017 +0530
+++ b/source/common/x86/pixel.h Wed Dec 13 16:51:23 2017 +0530
@@ -44,6 +44,7 @@
FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
+ FUNCDEF_PU(void, pixel_avg_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
diff -r a9be28cde01f -r 265fd2e1e495 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Tue Dec 12 15:43:30 2017 +0530
+++ b/source/encoder/analysis.cpp Wed Dec 13 16:51:23 2017 +0530
@@ -3196,11 +3196,9 @@
pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
-
- primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
+ primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
-
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
diff -r a9be28cde01f -r 265fd2e1e495 source/encoder/search.cpp
--- a/source/encoder/search.cpp Tue Dec 12 15:43:30 2017 +0530
+++ b/source/encoder/search.cpp Wed Dec 13 16:51:23 2017 +0530
@@ -2423,8 +2423,7 @@
/* Generate reference subpels */
predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
-
- primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
+ primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
@@ -2466,11 +2465,9 @@
const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
intptr_t refStride = slice->m_mref[0][0].lumaStride;
-
- primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
+ primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
-
MV mvp0 = bestME[0].mvp;
int mvpIdx0 = bestME[0].mvpIdx;
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
diff -r a9be28cde01f -r 265fd2e1e495 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Tue Dec 12 15:43:30 2017 +0530
+++ b/source/encoder/slicetype.cpp Wed Dec 13 16:51:23 2017 +0530
@@ -2523,19 +2523,16 @@
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
-
ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
- primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
-
/* coloc candidate */
src0 = fref0->lowresPlane[0] + pelOffset;
src1 = fref1->lowresPlane[0] + pelOffset;
- primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
-
bcost += lowresPenalty;
}
else /* P, also consider intra */
diff -r a9be28cde01f -r 265fd2e1e495 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Dec 12 15:43:30 2017 +0530
+++ b/source/test/pixelharness.cpp Wed Dec 13 16:51:23 2017 +0530
@@ -550,14 +550,11 @@
return true;
}
-
bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
{
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
-
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
int j = 0;
-
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
@@ -580,6 +577,35 @@
return true;
}
+bool PixelHarness::check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt)
+{
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
+
+ int j = 0;
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ intptr_t stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int index1 = rand() % TEST_CASES;
+ int index2 = rand() % TEST_CASES;
+ checked(ref, ref_dest, stride, pixel_test_buff[index1] + j,
+ stride, pixel_test_buff[index2] + j, stride, 32);
+ opt(opt_dest, stride, pixel_test_buff[index1] + j,
+ stride, pixel_test_buff[index2] + j, stride, 32);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ reportfail();
+ j += INCR + 32;
+ }
+
+ return true;
+}
bool PixelHarness::check_copy_pp(copy_pp_t ref, copy_pp_t opt)
{
@@ -2263,15 +2289,22 @@
return false;
}
}
-
- if (opt.pu[part].pixelavg_pp)
+ if (opt.pu[part].pixelavg_pp[NONALIGNED])
{
- if (!check_pixelavg_pp(ref.pu[part].pixelavg_pp, opt.pu[part].pixelavg_pp))
+ if (!check_pixelavg_pp(ref.pu[part].pixelavg_pp[NONALIGNED], opt.pu[part].pixelavg_pp[NONALIGNED]))
{
printf("pixelavg_pp[%s]: failed!\n", lumaPartStr[part]);
return false;
}
}
+ if (opt.pu[part].pixelavg_pp[ALIGNED])
+ {
+ if (!check_pixelavg_pp_aligned(ref.pu[part].pixelavg_pp[ALIGNED], opt.pu[part].pixelavg_pp[ALIGNED]))
+ {
+ printf("pixelavg_pp_aligned[%s]: failed!\n", lumaPartStr[part]);
+ return false;
+ }
+ }
if (opt.pu[part].copy_pp)
{
@@ -2632,7 +2665,7 @@
}
if (opt.cu[i].cpy1Dto2D_shl[ALIGNED])
{
- if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[ALIGNED], opt.cu[i].cpy1Dto2D_shl[ALIGNED]))
+ if (!check_cpy1Dto2D_shl_aligned_t(ref.cu[i].cpy1Dto2D_shl[ALIGNED], opt.cu[i].cpy1Dto2D_shl[ALIGNED]))
{
printf("cpy1Dto2D_shl_aligned[%dx%d] failed!\n", 4 << i, 4 << i);
return false;
@@ -3051,13 +3084,17 @@
HEADER("satd[%s]", lumaPartStr[part]);
REPORT_SPEEDUP(opt.pu[part].satd, ref.pu[part].satd, pbuf1, STRIDE, fref, STRIDE);
}
-
- if (opt.pu[part].pixelavg_pp)
+ if (opt.pu[part].pixelavg_pp[NONALIGNED])
{
HEADER("avg_pp[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.pu[part].pixelavg_pp, ref.pu[part].pixelavg_pp, pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
+ REPORT_SPEEDUP(opt.pu[part].pixelavg_pp[NONALIGNED], ref.pu[part].pixelavg_pp[NONALIGNED], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
}
+ if (opt.pu[part].pixelavg_pp[ALIGNED])
+ {
+ HEADER("avg_pp_aligned[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.pu[part].pixelavg_pp[ALIGNED], ref.pu[part].pixelavg_pp[ALIGNED], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
+ }
if (opt.pu[part].sad)
{
HEADER("sad[%s]", lumaPartStr[part]);
diff -r a9be28cde01f -r 265fd2e1e495 source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Dec 12 15:43:30 2017 +0530
+++ b/source/test/pixelharness.h Wed Dec 13 16:51:23 2017 +0530
@@ -79,6 +79,7 @@
bool check_copy_ps(copy_ps_t ref, copy_ps_t opt);
bool check_copy_ss(copy_ss_t ref, copy_ss_t opt);
bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
+ bool check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt);
bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
bool check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt);