[x265] [PATCH 111 of 307] x86: Aligned routine implementation for low bit depth p2s primitive

mythreyi at multicorewareinc.com
Sat Apr 7 04:31:49 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1506325284 -19800
#      Mon Sep 25 13:11:24 2017 +0530
# Node ID b31fc8889e0f8a433be25fb6267552f7d03efeaf
# Parent  ffd4c1528b37332493c5fa4677e780dbef121a01
x86: Aligned routine implementation for low bit depth p2s primitive
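
The existing AVX-512 aligned dispatch for the pixel-to-short (p2s) conversion
was limited to HIGH_BIT_DEPTH builds. This patch removes that guard in
predict.cpp, binds the convert_p2s_aligned/p2s_aligned function pointers for
the 8-bit build (reusing the existing SSE4/SSSE3/AVX2 kernels for the smaller
blocks), and adds aligned AVX-512 kernels for the 32xN, 48x64 and 64xN sizes
in ipfilter8.asm.

For orientation only (this helper is not part of the patch, and its name and
signature are illustrative), a scalar sketch of what the low bit depth p2s
kernels below compute: widen each 8-bit pixel to 16 bits, shift it into the
14-bit internal precision (<< 6) and subtract the 0x2000 bias, matching the
pmovzxbw/psllw/psubw sequence in the assembly. The aligned variants differ
from the unaligned ones only in using aligned stores (mova instead of movu),
which is why the caller first checks 64-byte alignment of the buffers and
strides and AVX-512 support before taking this path:

    #include <cstdint>

    // Sketch of the 8-bit p2s conversion performed per pixel by the kernels
    // registered in this patch (strides are in elements, as in the primitive).
    static void p2s_ref_sketch(const uint8_t* src, intptr_t srcStride,
                               int16_t* dst, intptr_t dstStride,
                               int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << 6) - 8192); // 8192 == pw_2000
            src += srcStride;
            dst += dstStride;
        }
    }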

diff -r ffd4c1528b37 -r b31fc8889e0f source/common/predict.cpp
--- a/source/common/predict.cpp	Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/predict.cpp	Mon Sep 25 13:11:24 2017 +0530
@@ -284,16 +284,12 @@
 
     if (!(yFrac | xFrac))
     {
-#if HIGH_BIT_DEPTH
         bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
         bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
         if (srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck && (refPic.m_param->cpuid & X265_CPU_AVX512))
             primitives.pu[partEnum].convert_p2s_aligned(src, srcStride, dst, dstStride);
         else
             primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
-#else
-        primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
-#endif
     }
     else if (!yFrac)
         primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
@@ -386,7 +382,6 @@
 
     if (!(yFrac | xFrac))
     {
-#if HIGH_BIT_DEPTH
         bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
         bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
         if (refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC  && dstbufferAlignCheckC && (refPic.m_param->cpuid & X265_CPU_AVX512))
@@ -399,10 +394,6 @@
             primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
             primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
         }
-#else
-        primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
-        primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
-#endif
     }
     else if (!yFrac)
     {
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Sep 25 13:11:24 2017 +0530
@@ -4217,6 +4217,106 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512);
 
+        p.pu[LUMA_4x4].convert_p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.pu[LUMA_4x8].convert_p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.pu[LUMA_4x16].convert_p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.pu[LUMA_8x8].convert_p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.pu[LUMA_8x4].convert_p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.pu[LUMA_8x16].convert_p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.pu[LUMA_8x32].convert_p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.pu[LUMA_12x16].convert_p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.pu[LUMA_16x4].convert_p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+        p.pu[LUMA_16x8].convert_p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.pu[LUMA_16x12].convert_p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.pu[LUMA_16x16].convert_p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.pu[LUMA_16x32].convert_p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.pu[LUMA_16x64].convert_p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.pu[LUMA_24x32].convert_p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.pu[LUMA_64x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+        p.pu[LUMA_64x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+        p.pu[LUMA_64x48].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+        p.pu[LUMA_64x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+        p.pu[LUMA_32x8].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.pu[LUMA_32x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.pu[LUMA_32x24].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.pu[LUMA_32x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.pu[LUMA_32x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+        p.pu[LUMA_48x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_48x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s_aligned = PFX(filterPixelToShort_4x2_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s_aligned = PFX(filterPixelToShort_8x2_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s_aligned = PFX(filterPixelToShort_8x6_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s_aligned = PFX(filterPixelToShort_2x4_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s_aligned = PFX(filterPixelToShort_6x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s_aligned = PFX(filterPixelToShort_2x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s_aligned = PFX(filterPixelToShort_4x32_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s_aligned = PFX(filterPixelToShort_6x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s_aligned = PFX(filterPixelToShort_8x12_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s_aligned = PFX(filterPixelToShort_8x64_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s_aligned = PFX(filterPixelToShort_12x32_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s_aligned = PFX(filterPixelToShort_16x24_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s_aligned = PFX(filterPixelToShort_24x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s_aligned = PFX(filterPixelToShort_aligned_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_sse4);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_sse4);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_sse4);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
diff -r ffd4c1528b37 -r b31fc8889e0f source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Sep 21 16:39:45 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Sep 25 13:11:24 2017 +0530
@@ -1969,6 +1969,10 @@
     P2S_H_32xN_avx2 64
     P2S_H_32xN_avx2 48
 
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 32xN avx512 code start
+;-----------------------------------------------------------------------------
+
 %macro PROCESS_P2S_32x4_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + r1]
@@ -2099,6 +2103,138 @@
     PROCESS_P2S_32x4_AVX512
     RET
 
+%macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+
+    mova        [r2],           m0
+    mova        [r2 + r3],      m1
+    mova        [r2 + r3 * 2],  m2
+    mova        [r2 + r6],      m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x16, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 3
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x24, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 5
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x32, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 7
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x48, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 11
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x64, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 15
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_32x4_AVX512
+    RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 32xN avx512 code end
+;-----------------------------------------------------------------------------
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -2414,6 +2550,9 @@
     P2S_H_64xN_avx2 32
     P2S_H_64xN_avx2 48
 
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 64xN avx512 code start
+;-----------------------------------------------------------------------------
 %macro PROCESS_P2S_64x4_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + mmsize/2]
@@ -2452,6 +2591,43 @@
     movu        [r2 + r6 + mmsize], m3
 %endmacro
 
+%macro PROCESS_P2S_ALIGNED_64x4_AVX512 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + mmsize/2]
+    pmovzxbw    m2, [r0 + r1]
+    pmovzxbw    m3, [r0 + r1 + mmsize/2]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2], m0
+    mova        [r2 + mmsize], m1
+    mova        [r2 + r3], m2
+    mova        [r2 + r3 + mmsize], m3
+
+    pmovzxbw    m0, [r0 + r1 * 2]
+    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
+    pmovzxbw    m2, [r0 + r5]
+    pmovzxbw    m3, [r0 + r5 + mmsize/2]
+
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2 + r3 * 2], m0
+    mova        [r2 + r3 * 2 + mmsize], m1
+    mova        [r2 + r6], m2
+    mova        [r2 + r6 + mmsize], m3
+%endmacro
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -2527,6 +2703,81 @@
     PROCESS_P2S_64x4_AVX512
     RET
 
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x64, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 15
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x48, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 11
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x32, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 7
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x16, 3, 7, 5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd      m4, [pw_2000]
+
+%rep 3
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+%endrep
+    PROCESS_P2S_ALIGNED_64x4_AVX512
+    RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 64xN avx512 code end
+;-----------------------------------------------------------------------------
+
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -2948,6 +3199,9 @@
     jnz        .loop
     RET
 
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 48xN avx512 code start
+;-----------------------------------------------------------------------------
 %macro PROCESS_P2S_48x8_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + r1]
@@ -3021,6 +3275,78 @@
     movu        [r2 + r6 + 64],      ym3
 %endmacro
 
+%macro PROCESS_P2S_ALIGNED_48x8_AVX512 0
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2],           m0
+    mova        [r2 + r3],      m1
+    mova        [r2 + r3 * 2],  m2
+    mova        [r2 + r6],      m3
+
+    pmovzxbw    ym0, [r0 + 32]
+    pmovzxbw    ym1, [r0 + r1 + 32]
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw    ym3, [r0 + r5 + 32]
+    psllw       ym0, 6
+    psllw       ym1, 6
+    psllw       ym2, 6
+    psllw       ym3, 6
+    psubw       ym0, ym4
+    psubw       ym1, ym4
+    psubw       ym2, ym4
+    psubw       ym3, ym4
+    mova        [r2 + 64],           ym0
+    mova        [r2 + r3 + 64],      ym1
+    mova        [r2 + r3 * 2 + 64],  ym2
+    mova        [r2 + r6 + 64],      ym3
+
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+
+    pmovzxbw    m0, [r0]
+    pmovzxbw    m1, [r0 + r1]
+    pmovzxbw    m2, [r0 + r1 * 2]
+    pmovzxbw    m3, [r0 + r5]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+    mova        [r2],           m0
+    mova        [r2 + r3],      m1
+    mova        [r2 + r3 * 2],  m2
+    mova        [r2 + r6],      m3
+
+    pmovzxbw    ym0, [r0 + 32]
+    pmovzxbw    ym1, [r0 + r1 + 32]
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
+    pmovzxbw    ym3, [r0 + r5 + 32]
+    psllw       ym0, 6
+    psllw       ym1, 6
+    psllw       ym2, 6
+    psllw       ym3, 6
+    psubw       ym0, ym4
+    psubw       ym1, ym4
+    psubw       ym2, ym4
+    psubw       ym3, ym4
+    mova        [r2 + 64],           ym0
+    mova        [r2 + r3 + 64],      ym1
+    mova        [r2 + r3 * 2 + 64],  ym2
+    mova        [r2 + r6 + 64],      ym3
+%endmacro
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -3058,6 +3384,43 @@
     PROCESS_P2S_48x8_AVX512
     RET
 
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_48x64, 3,7,5
+    mov         r3d, r3m
+    add         r3d, r3d
+    lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]
+
+    ; load constant
+    vpbroadcastd m4, [pw_2000]
+
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_ALIGNED_48x8_AVX512
+    RET
+;-----------------------------------------------------------------------------
+;p2s and p2s_aligned 48xN avx512 code end
+;-----------------------------------------------------------------------------
+
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]
     movd        m1, [r0 + r1]

