[x265] [PATCH 119 of 307] x86: Aligned routine implementation of add_ps primitive
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:57 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1507278656 -19800
# Fri Oct 06 14:00:56 2017 +0530
# Node ID 44433ded38d00c79fa52e69e7c5c5127009f9ede
# Parent ba20a08181382a2fb18a0d1aff7637d66fa41ac7
x86: Aligned routine implementation of add_ps primitive
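For context, add_ps reconstructs a block by adding a 16-bit residual plane to a
predicted pixel plane and clipping the sum to the valid pixel range; the aligned
entry points added here are wired to the same pixel_add_ps_c<W, H> reference as
the existing add_ps. A minimal C sketch of that reference behavior (the clip
helper and function name below are illustrative, not taken from the tree):

    #include <stdint.h>

    typedef uint8_t pixel;   /* 8-bit build; 16-bit builds use uint16_t */

    /* Hypothetical stand-in for the pixel clipping helper. */
    static inline pixel clip_pixel(int v)
    {
        return (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* dst = clip(pred + resi) over a WxH block; strides are in elements,
     * matching the pixel_add_ps_t signature in source/common/x86/pixel.h. */
    static void add_ps_ref(int W, int H,
                           pixel* dst, intptr_t dstride,
                           const pixel* pred, const int16_t* resi,
                           intptr_t sstride0, intptr_t sstride1)
    {
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                dst[x] = clip_pixel(pred[x] + resi[x]);
            dst  += dstride;
            pred += sstride0;
            resi += sstride1;
        }
    }
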
diff -r ba20a0818138 -r 44433ded38d0 source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/pixel.cpp Fri Oct 06 14:00:56 2017 +0530
@@ -996,6 +996,7 @@
#define LUMA_CU(W, H) \
p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
@@ -1169,7 +1170,8 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>;
CHROMA_CU_420(2, 2)
CHROMA_CU_420(4, 4)
@@ -1247,7 +1249,8 @@
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps_aligned = pixel_add_ps_c<W, H>;
CHROMA_CU_422(2, 4)
CHROMA_CU_422(4, 8)
diff -r ba20a0818138 -r 44433ded38d0 source/common/primitives.cpp
--- a/source/common/primitives.cpp Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/primitives.cpp Fri Oct 06 14:00:56 2017 +0530
@@ -126,6 +126,7 @@
p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp;
p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps;
p.chroma[X265_CSP_I444].cu[i].add_ps = p.cu[i].add_ps;
+ p.chroma[X265_CSP_I444].cu[i].add_ps_aligned = p.cu[i].add_ps_aligned;
p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
diff -r ba20a0818138 -r 44433ded38d0 source/common/primitives.h
--- a/source/common/primitives.h Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/primitives.h Fri Oct 06 14:00:56 2017 +0530
@@ -271,6 +271,7 @@
calcresidual_t calcresidual_aligned;
pixel_sub_ps_t sub_ps;
pixel_add_ps_t add_ps;
+ pixel_add_ps_t add_ps_aligned;
blockfill_s_t blockfill_s; // block fill, for DC transforms
blockfill_s_t blockfill_s_aligned; // block fill, for DC transforms
copy_cnt_t copy_cnt; // copy coeff while counting non-zero
@@ -405,6 +406,7 @@
pixel_sse_t sse_pp;
pixel_sub_ps_t sub_ps;
pixel_add_ps_t add_ps;
+ pixel_add_ps_t add_ps_aligned;
copy_ps_t copy_ps;
copy_sp_t copy_sp;
diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Oct 06 14:00:56 2017 +0530
@@ -2202,6 +2202,20 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
+ p.cu[BLOCK_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse2);
+ p.cu[BLOCK_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse2);
+ p.cu[BLOCK_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+ p.cu[BLOCK_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+ p.cu[BLOCK_64x64].add_ps_aligned = PFX(pixel_add_ps_aligned_64x64_avx512);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse2);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse2);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps_aligned = PFX(pixel_add_ps_4x8_sse2);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps_aligned = PFX(pixel_add_ps_8x16_sse2);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps_aligned = PFX(pixel_add_ps_16x32_avx2);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps_aligned = PFX(pixel_add_ps_aligned_32x64_avx512);
+
// 64 X N
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512);
@@ -4306,6 +4320,20 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
+ p.cu[BLOCK_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse4);
+ p.cu[BLOCK_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse4);
+ p.cu[BLOCK_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+ p.cu[BLOCK_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+ p.cu[BLOCK_64x64].add_ps_aligned = PFX(pixel_add_ps_aligned_64x64_avx512);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps_aligned = PFX(pixel_add_ps_4x4_sse4);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps_aligned = PFX(pixel_add_ps_8x8_sse4);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps_aligned = PFX(pixel_add_ps_16x16_avx2);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps_aligned = PFX(pixel_add_ps_aligned_32x32_avx512);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps_aligned = PFX(pixel_add_ps_4x8_sse4);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps_aligned = PFX(pixel_add_ps_8x16_sse4);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps_aligned = PFX(pixel_add_ps_16x32_avx2);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps_aligned = PFX(pixel_add_ps_aligned_32x64_avx512);
+
p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/x86/pixel.h Fri Oct 06 14:00:56 2017 +0530
@@ -45,6 +45,7 @@
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
+ FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
diff -r ba20a0818138 -r 44433ded38d0 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Wed Oct 04 17:02:59 2017 +0530
+++ b/source/common/x86/pixeladd8.asm Fri Oct 06 14:00:56 2017 +0530
@@ -1150,27 +1150,27 @@
;-----------------------------------------------------------------------------
%macro PROCESS_ADD_PS_64x4_AVX512 0
pmovzxbw m0, [r2]
- pmovzxbw m1, [r2 + 32]
+ pmovzxbw m1, [r2 + mmsize/2]
movu m2, [r3]
- movu m3, [r3 + 64]
+ movu m3, [r3 + mmsize]
paddw m0, m2
paddw m1, m3
packuswb m0, m1
vpermq m0, m4, m0
movu [r0], m0
pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 32]
+ pmovzxbw m1, [r2 + r4 + mmsize/2]
movu m2, [r3 + r5]
- movu m3, [r3 + r5 + 64]
+ movu m3, [r3 + r5 + mmsize]
paddw m0, m2
paddw m1, m3
packuswb m0, m1
vpermq m0, m4, m0
movu [r0 + r1], m0
pmovzxbw m0, [r2 + 2 * r4]
- pmovzxbw m1, [r2 + 2 * r4 + 32]
+ pmovzxbw m1, [r2 + 2 * r4 + mmsize/2]
movu m2, [r3 + 2 * r5]
- movu m3, [r3 + 2 * r5 + 64]
+ movu m3, [r3 + 2 * r5 + mmsize]
paddw m0, m2
paddw m1, m3
packuswb m0, m1
@@ -1178,15 +1178,16 @@
movu [r0 + 2 * r1], m0
pmovzxbw m0, [r2 + r7]
- pmovzxbw m1, [r2 + r7 + 32]
+ pmovzxbw m1, [r2 + r7 + mmsize/2]
movu m2, [r3 + r8]
- movu m3, [r3 + r8 + 64]
+ movu m3, [r3 + r8 + mmsize]
paddw m0, m2
paddw m1, m3
packuswb m0, m1
vpermq m0, m4, m0
movu [r0 + r6], m0
%endmacro
+
%macro PROCESS_ADD_PS_64x4_HBD_AVX512 0
movu m0, [r2]
movu m1, [r2 + mmsize]
@@ -1233,6 +1234,92 @@
movu [r0 + r8 + mmsize], m1
%endmacro
+%macro PROCESS_ADD_PS_64x4_ALIGNED_AVX512 0
+ pmovzxbw m0, [r2]
+ pmovzxbw m1, [r2 + mmsize/2]
+ mova m2, [r3]
+ mova m3, [r3 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpermq m0, m4, m0
+ mova [r0], m0
+ pmovzxbw m0, [r2 + r4]
+ pmovzxbw m1, [r2 + r4 + mmsize/2]
+ mova m2, [r3 + r5]
+ mova m3, [r3 + r5 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpermq m0, m4, m0
+ mova [r0 + r1], m0
+ pmovzxbw m0, [r2 + 2 * r4]
+ pmovzxbw m1, [r2 + 2 * r4 + mmsize/2]
+ mova m2, [r3 + 2 * r5]
+ mova m3, [r3 + 2 * r5 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpermq m0, m4, m0
+ mova [r0 + 2 * r1], m0
+
+ pmovzxbw m0, [r2 + r7]
+ pmovzxbw m1, [r2 + r7 + mmsize/2]
+ mova m2, [r3 + r8]
+ mova m3, [r3 + r8 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpermq m0, m4, m0
+ mova [r0 + r6], m0
+%endmacro
+
+%macro PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 0
+ mova m0, [r2]
+ mova m1, [r2 + mmsize]
+ mova m2, [r3]
+ mova m3, [r3 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ mova [r0], m0
+ mova [r0 + mmsize], m1
+
+ mova m0, [r2 + r4]
+ mova m1, [r2 + r4 + mmsize]
+ mova m2, [r3 + r5]
+ mova m3, [r3 + r5 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ mova [r0 + r1], m0
+ mova [r0 + r1 + mmsize], m1
+
+ mova m0, [r2 + r4 * 2]
+ mova m1, [r2 + r4 * 2 + mmsize]
+ mova m2, [r3 + r5 * 2]
+ mova m3, [r3 + r5 * 2 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r1 * 2 + mmsize], m1
+
+ mova m0, [r2 + r6]
+ mova m1, [r2 + r6 + mmsize]
+ mova m2, [r3 + r7]
+ mova m3, [r3 + r7 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ mova [r0 + r8], m0
+ mova [r0 + r8 + mmsize], m1
+%endmacro
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
@@ -1256,6 +1343,25 @@
%endrep
PROCESS_ADD_PS_64x4_HBD_AVX512
RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 6
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ pxor m4, m4
+ add r4d, r4d
+ add r5d, r5d
+ add r1d, r1d
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+%rep 15
+ PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+%endrep
+ PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
+ RET
%endif
%else
%if ARCH_X86_64
@@ -1274,8 +1380,25 @@
%endrep
PROCESS_ADD_PS_64x4_AVX512
RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 4
+ add r5, r5
+ lea r6, [3 * r1]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+ mova m4, [store_shuf1_avx512]
+%rep 15
+ PROCESS_ADD_PS_64x4_ALIGNED_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+%endrep
+ PROCESS_ADD_PS_64x4_ALIGNED_AVX512
+ RET
%endif
%endif
+
%macro PROCESS_ADD_PS_32x4_AVX512 0
pmovzxbw m0, [r2]
movu m1, [r3]
@@ -1298,6 +1421,7 @@
movu [r0 + r1 * 2], ym0
vextracti32x8 [r0 + r8], m0, 1
%endmacro
+
%macro PROCESS_ADD_PS_32x4_HBD_AVX512 0
movu m0, [r2]
movu m1, [r2 + r4]
@@ -1322,6 +1446,53 @@
movu [r0 + r8], m1
%endmacro
+%macro PROCESS_ADD_PS_32x4_ALIGNED_AVX512 0
+ pmovzxbw m0, [r2]
+ mova m1, [r3]
+ pmovzxbw m2, [r2 + r4]
+ mova m3, [r3 + r5]
+ paddw m0, m1
+ paddw m2, m3
+ packuswb m0, m2
+ vpermq m0, m4, m0
+ mova [r0], ym0
+ vextracti32x8 [r0 + r1], m0, 1
+ pmovzxbw m0, [r2 + r4 * 2]
+ mova m1, [r3 + r5 * 2]
+ pmovzxbw m2, [r2 + r6]
+ mova m3, [r3 + r7]
+ paddw m0, m1
+ paddw m2, m3
+ packuswb m0, m2
+ vpermq m0, m4, m0
+ mova [r0 + r1 * 2], ym0
+ vextracti32x8 [r0 + r8], m0, 1
+%endmacro
+
+%macro PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 0
+ mova m0, [r2]
+ mova m1, [r2 + r4]
+ mova m2, [r3]
+ mova m3, [r3 + r5]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ mova [r0], m0
+ mova [r0 + r1], m1
+
+ mova m0, [r2 + r4 * 2]
+ mova m1, [r2 + r6]
+ mova m2, [r3 + r5 * 2]
+ mova m3, [r3 + r7]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ mova [r0 + r1 * 2], m0
+ mova [r0 + r8], m1
+%endmacro
+
;-----------------------------------------------------------------------------
; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
@@ -1345,6 +1516,7 @@
%endrep
PROCESS_ADD_PS_32x4_HBD_AVX512
RET
+
INIT_ZMM avx512
cglobal pixel_add_ps_32x64, 6, 9, 6
vbroadcasti32x8 m5, [pw_pixel_max]
@@ -1363,6 +1535,44 @@
%endrep
PROCESS_ADD_PS_32x4_HBD_AVX512
RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 6
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ pxor m4, m4
+ add r4d, r4d
+ add r5d, r5d
+ add r1d, r1d
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+%rep 7
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+%endrep
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 6
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ pxor m4, m4
+ add r4d, r4d
+ add r5d, r5d
+ add r1d, r1d
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+%rep 15
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+%endrep
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
+ RET
%endif
%else
%if ARCH_X86_64
@@ -1398,6 +1608,39 @@
%endrep
PROCESS_ADD_PS_32x4_AVX512
RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 5
+ add r5, r5
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ mova m4, [store_shuf1_avx512]
+%rep 7
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+%endrep
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 5
+ add r5, r5
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ mova m4, [store_shuf1_avx512]
+
+%rep 15
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+%endrep
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
+ RET
%endif
%endif
;-----------------------------------------------------------------------------
diff -r ba20a0818138 -r 44433ded38d0 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Oct 04 17:02:59 2017 +0530
+++ b/source/test/pixelharness.cpp Fri Oct 06 14:00:56 2017 +0530
@@ -876,6 +876,31 @@
return true;
}
+bool PixelHarness::check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt)
+{
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+ intptr_t stride2 = 64, stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int index1 = rand() % TEST_CASES;
+ int index2 = rand() % TEST_CASES;
+ checked(opt, opt_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
+ ref(ref_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ reportfail();
+ j += 2 * INCR;
+ }
+ return true;
+}
+
bool PixelHarness::check_pixel_var(var_t ref, var_t opt)
{
int j = 0;
@@ -2288,6 +2313,15 @@
}
}
+ if (opt.cu[part].add_ps_aligned)
+ {
+ if (!check_pixel_add_ps_aligned(ref.cu[part].add_ps_aligned, opt.cu[part].add_ps_aligned))
+ {
+ printf("add_ps_aligned[%s] failed\n", lumaPartStr[part]);
+ return false;
+ }
+ }
+
if (opt.cu[part].copy_ss)
{
if (!check_copy_ss(ref.cu[part].copy_ss, opt.cu[part].copy_ss))
@@ -2376,6 +2410,14 @@
return false;
}
}
+ if (opt.chroma[i].cu[part].add_ps_aligned)
+ {
+ if (!check_pixel_add_ps_aligned(ref.chroma[i].cu[part].add_ps_aligned, opt.chroma[i].cu[part].add_ps_aligned))
+ {
+ printf("chroma_add_ps_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ return false;
+ }
+ }
if (opt.chroma[i].cu[part].copy_sp)
{
if (!check_copy_sp(ref.chroma[i].cu[part].copy_sp, opt.chroma[i].cu[part].copy_sp))
@@ -3042,6 +3084,11 @@
HEADER("add_ps[%s]", lumaPartStr[part]);
REPORT_SPEEDUP(opt.cu[part].add_ps, ref.cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
}
+ if (opt.cu[part].add_ps_aligned)
+ {
+ HEADER("add_ps[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.cu[part].add_ps_aligned, ref.cu[part].add_ps_aligned, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
+ }
if (opt.cu[part].copy_ss)
{
HEADER("copy_ss[%s]", lumaPartStr[part]);
@@ -3113,6 +3160,11 @@
HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps, ref.chroma[i].cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
}
+ if (opt.chroma[i].cu[part].add_ps_aligned)
+ {
+ HEADER("[%s] add_ps_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps_aligned, ref.chroma[i].cu[part].add_ps_aligned, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
+ }
if (opt.chroma[i].cu[part].sa8d)
{
HEADER("[%s] sa8d[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
diff -r ba20a0818138 -r 44433ded38d0 source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Oct 04 17:02:59 2017 +0530
+++ b/source/test/pixelharness.h Fri Oct 06 14:00:56 2017 +0530
@@ -81,6 +81,7 @@
bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
+ bool check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt);
bool check_scale1D_pp(scale1D_t ref, scale1D_t opt);
bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
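
Note on when the new kernels apply: the _aligned variants replace the unaligned
movu loads/stores with mova, so they assume the destination, the residual plane
and the strides keep every access 64-byte aligned. The sketch below shows one
way a caller could fall back to the generic kernel otherwise; the helper name
and the selection rule are illustrative only, not the dispatch x265 performs.

    #include <stdint.h>

    typedef uint8_t pixel;
    typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride,
                                   const pixel* b0, const int16_t* b1,
                                   intptr_t sstride0, intptr_t sstride1);

    /* Hypothetical selection: use the aligned kernel only when every base
     * pointer and stride (in bytes) is a multiple of 64. */
    static pixel_add_ps_t pick_add_ps(pixel_add_ps_t add_ps,
                                      pixel_add_ps_t add_ps_aligned,
                                      const pixel* dst, intptr_t dstride,
                                      const pixel* pred, intptr_t sstride0,
                                      const int16_t* resi, intptr_t sstride1)
    {
        int ok = ((uintptr_t)dst  % 64 == 0) && ((uintptr_t)pred % 64 == 0)
              && ((uintptr_t)resi % 64 == 0)
              && ((dstride  * sizeof(pixel))   % 64 == 0)
              && ((sstride0 * sizeof(pixel))   % 64 == 0)
              && ((sstride1 * sizeof(int16_t)) % 64 == 0);
        return (ok && add_ps_aligned) ? add_ps_aligned : add_ps;
    }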