[x265] [PATCH 111 of 307] x86: Aligned routine implementation for low bit depth p2s primitive
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:49 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1506325284 -19800
# Mon Sep 25 13:11:24 2017 +0530
# Node ID b31fc8889e0f8a433be25fb6267552f7d03efeaf
# Parent ffd4c1528b37332493c5fa4677e780dbef121a01
x86: Aligned routine implementation for low bit depth p2s primitive
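The predict.cpp hunks below drop the HIGH_BIT_DEPTH guard, so the 64-byte alignment
dispatch now also runs in 8-bit builds. As an illustrative-only C++ sketch of that
selection logic (isAligned64, srcBufferOffset and dstBufferOffset are hypothetical
stand-ins for the stride and offset sums computed in the diff), assuming an
AVX-512-capable CPU is detected at runtime:

    static inline bool isAligned64(intptr_t v) { return (v % 64) == 0; }

    // Luma path, taken when no fractional interpolation is needed:
    bool useAligned = isAligned64(srcStride) && isAligned64(dstStride)
                   && isAligned64(srcBufferOffset) && isAligned64(dstBufferOffset)
                   && (refPic.m_param->cpuid & X265_CPU_AVX512);
    if (useAligned)
        primitives.pu[partEnum].convert_p2s_aligned(src, srcStride, dst, dstStride);
    else
        primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);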
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/predict.cpp
--- a/source/common/predict.cpp Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/predict.cpp Mon Sep 25 13:11:24 2017 +0530
@@ -284,16 +284,12 @@
if (!(yFrac | xFrac))
{
-#if HIGH_BIT_DEPTH
bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
if (srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck && (refPic.m_param->cpuid & X265_CPU_AVX512))
primitives.pu[partEnum].convert_p2s_aligned(src, srcStride, dst, dstStride);
else
primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
-#else
- primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
-#endif
}
else if (!yFrac)
primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
@@ -386,7 +382,6 @@
if (!(yFrac | xFrac))
{
-#if HIGH_BIT_DEPTH
bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
if (refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC && (refPic.m_param->cpuid & X265_CPU_AVX512))
@@ -399,10 +394,6 @@
primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
}
-#else
- primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
- primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
-#endif
}
else if (!yFrac)
{
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Sep 25 13:11:24 2017 +0530
@@ -4217,6 +4217,106 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512);
+ p.pu[LUMA_4x4].convert_p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+ p.pu[LUMA_4x8].convert_p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+ p.pu[LUMA_4x16].convert_p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+ p.pu[LUMA_8x8].convert_p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.pu[LUMA_8x4].convert_p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.pu[LUMA_8x16].convert_p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.pu[LUMA_8x32].convert_p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.pu[LUMA_12x16].convert_p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+ p.pu[LUMA_16x4].convert_p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+ p.pu[LUMA_16x8].convert_p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.pu[LUMA_16x12].convert_p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+ p.pu[LUMA_16x16].convert_p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.pu[LUMA_16x32].convert_p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.pu[LUMA_16x64].convert_p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+ p.pu[LUMA_24x32].convert_p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+ p.pu[LUMA_64x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+ p.pu[LUMA_64x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+ p.pu[LUMA_64x48].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+ p.pu[LUMA_64x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+ p.pu[LUMA_32x8].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+ p.pu[LUMA_32x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.pu[LUMA_32x24].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+ p.pu[LUMA_32x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+ p.pu[LUMA_32x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+ p.pu[LUMA_48x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_48x64_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s_aligned = PFX(filterPixelToShort_4x2_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s_aligned = PFX(filterPixelToShort_8x2_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s_aligned = PFX(filterPixelToShort_8x6_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s_aligned = PFX(filterPixelToShort_2x4_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s_aligned = PFX(filterPixelToShort_6x8_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s_aligned = PFX(filterPixelToShort_2x16_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s_aligned = PFX(filterPixelToShort_4x32_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s_aligned = PFX(filterPixelToShort_6x16_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s_aligned = PFX(filterPixelToShort_8x12_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s_aligned = PFX(filterPixelToShort_8x64_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s_aligned = PFX(filterPixelToShort_12x32_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s_aligned = PFX(filterPixelToShort_16x24_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s_aligned = PFX(filterPixelToShort_24x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s_aligned = PFX(filterPixelToShort_aligned_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Mon Sep 25 13:11:24 2017 +0530
@@ -1969,6 +1969,10 @@
P2S_H_32xN_avx2 64
P2S_H_32xN_avx2 48
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 32xN avx512 code start
+;-----------------------------------------------------------------------------
+
%macro PROCESS_P2S_32x4_AVX512 0
pmovzxbw m0, [r0]
pmovzxbw m1, [r0 + r1]
@@ -2099,6 +2103,138 @@
PROCESS_P2S_32x4_AVX512
RET
+%macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
+ pmovzxbw m0, [r0]
+ pmovzxbw m1, [r0 + r1]
+ pmovzxbw m2, [r0 + r1 * 2]
+ pmovzxbw m3, [r0 + r5]
+
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ mova [r2], m0
+ mova [r2 + r3], m1
+ mova [r2 + r3 * 2], m2
+ mova [r2 + r6], m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x16, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 3
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x24, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 5
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x32, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 7
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x48, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 11
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x64, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 15
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_32x4_AVX512
+ RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 32xN avx512 code end
+;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
@@ -2414,6 +2550,9 @@
P2S_H_64xN_avx2 32
P2S_H_64xN_avx2 48
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 64xN avx512 code start
+;-----------------------------------------------------------------------------
%macro PROCESS_P2S_64x4_AVX512 0
pmovzxbw m0, [r0]
pmovzxbw m1, [r0 + mmsize/2]
@@ -2452,6 +2591,43 @@
movu [r2 + r6 + mmsize], m3
%endmacro
+%macro PROCESS_P2S_ALIGNED_64x4_AVX512 0
+ pmovzxbw m0, [r0]
+ pmovzxbw m1, [r0 + mmsize/2]
+ pmovzxbw m2, [r0 + r1]
+ pmovzxbw m3, [r0 + r1 + mmsize/2]
+
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ mova [r2], m0
+ mova [r2 + mmsize], m1
+ mova [r2 + r3], m2
+ mova [r2 + r3 + mmsize], m3
+
+ pmovzxbw m0, [r0 + r1 * 2]
+ pmovzxbw m1, [r0 + r1 * 2 + mmsize/2]
+ pmovzxbw m2, [r0 + r5]
+ pmovzxbw m3, [r0 + r5 + mmsize/2]
+
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ mova [r2 + r3 * 2], m0
+ mova [r2 + r3 * 2 + mmsize], m1
+ mova [r2 + r6], m2
+ mova [r2 + r6 + mmsize], m3
+%endmacro
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
@@ -2527,6 +2703,81 @@
PROCESS_P2S_64x4_AVX512
RET
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x64, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 15
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x48, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 11
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x32, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 7
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x16, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+%rep 3
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ PROCESS_P2S_ALIGNED_64x4_AVX512
+ RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 64xN avx512 code end
+;-----------------------------------------------------------------------------
+
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
@@ -2948,6 +3199,9 @@
jnz .loop
RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 48xN avx512 code start
+;-----------------------------------------------------------------------------
%macro PROCESS_P2S_48x8_AVX512 0
pmovzxbw m0, [r0]
pmovzxbw m1, [r0 + r1]
@@ -3021,6 +3275,78 @@
movu [r2 + r6 + 64], ym3
%endmacro
+%macro PROCESS_P2S_ALIGNED_48x8_AVX512 0
+ pmovzxbw m0, [r0]
+ pmovzxbw m1, [r0 + r1]
+ pmovzxbw m2, [r0 + r1 * 2]
+ pmovzxbw m3, [r0 + r5]
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ mova [r2], m0
+ mova [r2 + r3], m1
+ mova [r2 + r3 * 2], m2
+ mova [r2 + r6], m3
+
+ pmovzxbw ym0, [r0 + 32]
+ pmovzxbw ym1, [r0 + r1 + 32]
+ pmovzxbw ym2, [r0 + r1 * 2 + 32]
+ pmovzxbw ym3, [r0 + r5 + 32]
+ psllw ym0, 6
+ psllw ym1, 6
+ psllw ym2, 6
+ psllw ym3, 6
+ psubw ym0, ym4
+ psubw ym1, ym4
+ psubw ym2, ym4
+ psubw ym3, ym4
+ mova [r2 + 64], ym0
+ mova [r2 + r3 + 64], ym1
+ mova [r2 + r3 * 2 + 64], ym2
+ mova [r2 + r6 + 64], ym3
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+
+ pmovzxbw m0, [r0]
+ pmovzxbw m1, [r0 + r1]
+ pmovzxbw m2, [r0 + r1 * 2]
+ pmovzxbw m3, [r0 + r5]
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ mova [r2], m0
+ mova [r2 + r3], m1
+ mova [r2 + r3 * 2], m2
+ mova [r2 + r6], m3
+
+ pmovzxbw ym0, [r0 + 32]
+ pmovzxbw ym1, [r0 + r1 + 32]
+ pmovzxbw ym2, [r0 + r1 * 2 + 32]
+ pmovzxbw ym3, [r0 + r5 + 32]
+ psllw ym0, 6
+ psllw ym1, 6
+ psllw ym2, 6
+ psllw ym3, 6
+ psubw ym0, ym4
+ psubw ym1, ym4
+ psubw ym2, ym4
+ psubw ym3, ym4
+ mova [r2 + 64], ym0
+ mova [r2 + r3 + 64], ym1
+ mova [r2 + r3 * 2 + 64], ym2
+ mova [r2 + r6 + 64], ym3
+%endmacro
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
@@ -3058,6 +3384,43 @@
PROCESS_P2S_48x8_AVX512
RET
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_48x64, 3, 7, 5
+ mov r3d, r3m
+ add r3d, r3d
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
+
+ ; load constant
+ vpbroadcastd m4, [pw_2000]
+
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_ALIGNED_48x8_AVX512
+ RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 48xN avx512 code end
+;-----------------------------------------------------------------------------
+
%macro PROCESS_LUMA_W4_4R 0
movd m0, [r0]
movd m1, [r0 + r1]
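For reference, an illustrative-only scalar C++ sketch (not part of the patch) of what
each PROCESS_P2S_ALIGNED_*_AVX512 macro computes per 8-bit pixel: zero-extend
(pmovzxbw), shift into the 14-bit internal precision (psllw 6), subtract the pw_2000
offset (psubw), and store. The aligned variants differ from the existing p2s kernels
only in using aligned stores (mova) rather than unaligned ones (movu), which is why
the caller first checks 64-byte alignment of strides and buffer offsets. Strides here
are taken in elements, a simplifying assumption:

    #include <stdint.h>

    void convert_p2s_ref(const uint8_t* src, intptr_t srcStride,
                         int16_t* dst, intptr_t dstStride,
                         int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << 6) - 0x2000); // (pel << 6) - 8192
            src += srcStride;
            dst += dstStride;
        }
    }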