[x265] [PATCH 110 of 307] x86: Aligned routine implementation for p2s primitive

mythreyi at multicorewareinc.com
Sat Apr 7 04:31:48 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1505992185 -19800
#      Thu Sep 21 16:39:45 2017 +0530
# Node ID ffd4c1528b37332493c5fa4677e780dbef121a01
# Parent  e1348316cd4431a5d39c8a9457d865f0f9d546cc
x86: Aligned routine implementation for p2s primitive

Makes the default alignment 64 bytes
Routes the high bit depth p2s primitive to the aligned code path when the AVX512 architecture is detected and the source and destination addresses and strides are all multiples of 64
Aligns all ipfilterharness buffers to 64 bytes
Temporarily disables sad_x3_48x64 and sad_x4_48x64
Cleans up the p2s code
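
For reference, p2s ("pixel to short") promotes pixels to the 14-bit
intermediate precision used by the interpolation filters. A minimal C
sketch of what the assembly below computes (an illustration written
against the x265 C reference, not the exact source):

    /* Sketch of the scalar reference for filterPixelToShort; the AVX512
     * macros in ipfilter16.asm/ipfilter8.asm vectorize exactly this loop.
     * 8192 is the pw_2000 constant (1 << 13) subtracted by psubw. */
    template<int W, int H>
    void filterPixelToShort_ref(const pixel* src, intptr_t srcStride,
                                int16_t* dst, intptr_t dstStride)
    {
        const int shift = 14 - X265_DEPTH;    /* the psllw amount */
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                dst[x] = (int16_t)((src[x] << shift) - 8192);
            src += srcStride;
            dst += dstStride;
        }
    }

The aligned variants change only the loads and stores (mova instead of
movu); the arithmetic is identical, which is why ipfilter.cpp points both
p2s and p2s_aligned at the same filterPixelToShort_c template.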

diff -r e1348316cd44 -r ffd4c1528b37 source/common/common.cpp
--- a/source/common/common.cpp	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/common.cpp	Thu Sep 21 16:39:45 2017 +0530
@@ -54,7 +54,7 @@
 #endif
 }
 
-#define X265_ALIGNBYTES 32
+#define X265_ALIGNBYTES 64
 
 #if _WIN32
 #if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)
diff -r e1348316cd44 -r ffd4c1528b37 source/common/common.h
--- a/source/common/common.h	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/common.h	Thu Sep 21 16:39:45 2017 +0530
@@ -75,6 +75,7 @@
 #define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
 #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
+#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))
 #if defined(__MINGW32__)
 #define fseeko fseeko64
 #define ftello ftello64
@@ -85,6 +86,7 @@
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
+#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var
 #define fseeko _fseeki64
 #define ftello _ftelli64
 #endif // if defined(__GNUC__)
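
The new macro is used exactly like the existing ALIGN_VAR_32; an
illustrative declaration (not part of this patch):

    ALIGN_VAR_64(pixel, refBuf[64 * 64]);   /* 64-byte aligned, safe for
                                               aligned AVX512 loads */
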
diff -r e1348316cd44 -r ffd4c1528b37 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/ipfilter.cpp	Thu Sep 21 16:39:45 2017 +0530
@@ -379,7 +379,8 @@
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;\
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s_aligned = filterPixelToShort_c<W, H>;
 
 #define CHROMA_422(W, H) \
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
@@ -388,7 +389,8 @@
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;\
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s_aligned = filterPixelToShort_c<W, H>;
 
 #define CHROMA_444(W, H) \
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
@@ -397,7 +399,8 @@
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
-    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;\
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s_aligned = filterPixelToShort_c<W, H>;
 
 #define LUMA(W, H) \
     p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_c<8, W, H>; \
@@ -407,7 +410,8 @@
     p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_c<8, W, H>;  \
     p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_c<8, W, H>;  \
     p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_c<8, W, H>; \
-    p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;\
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s_aligned = filterPixelToShort_c<W, H>;
 
 void setupFilterPrimitives_c(EncoderPrimitives& p)
 {
diff -r e1348316cd44 -r ffd4c1528b37 source/common/predict.cpp
--- a/source/common/predict.cpp	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/predict.cpp	Thu Sep 21 16:39:45 2017 +0530
@@ -283,7 +283,18 @@
     int yFrac = mv.y & 3;
 
     if (!(yFrac | xFrac))
+    {
+#if HIGH_BIT_DEPTH
+        bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
+        bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
+        if (srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck && (refPic.m_param->cpuid & X265_CPU_AVX512))
+            primitives.pu[partEnum].convert_p2s_aligned(src, srcStride, dst, dstStride);
+        else
+            primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
+#else
         primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
+#endif
+    }
     else if (!yFrac)
         primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
     else if (!xFrac)
@@ -375,8 +386,23 @@
 
     if (!(yFrac | xFrac))
     {
+#if HIGH_BIT_DEPTH
+        bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
+        bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
+        if (refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC  && dstbufferAlignCheckC && (refPic.m_param->cpuid & X265_CPU_AVX512))
+        {
+            primitives.chroma[m_csp].pu[partEnum].p2s_aligned(refCb, refStride, dstCb, dstStride);
+            primitives.chroma[m_csp].pu[partEnum].p2s_aligned(refCr, refStride, dstCr, dstStride);
+        }
+        else
+        {
+            primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
+            primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
+        }
+#else
         primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
         primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
+#endif
     }
     else if (!yFrac)
     {
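
Both hunks above gate the aligned kernel on the same conditions; written
as a hypothetical helper (the patch keeps the checks inline), the test is
roughly:

    /* Assumption: offsets and strides are in elements, matching the
     * inline checks above. The aligned path requires every operand on
     * a 64-multiple boundary plus AVX512 support. */
    static inline bool canUseAlignedP2S(intptr_t srcOffset, intptr_t srcStride,
                                        intptr_t dstOffset, intptr_t dstStride,
                                        uint32_t cpuid)
    {
        return (srcOffset % 64 == 0) && (dstOffset % 64 == 0) &&
               (srcStride % 64 == 0) && (dstStride % 64 == 0) &&
               (cpuid & X265_CPU_AVX512);
    }
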
diff -r e1348316cd44 -r ffd4c1528b37 source/common/primitives.h
--- a/source/common/primitives.h	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/primitives.h	Thu Sep 21 16:39:45 2017 +0530
@@ -248,6 +248,7 @@
 
         copy_pp_t      copy_pp;
         filter_p2s_t   convert_p2s;
+        filter_p2s_t   convert_p2s_aligned;
     }
     pu[NUM_PU_SIZES];
 
@@ -387,6 +388,7 @@
             addAvg_t     addAvg;
             copy_pp_t    copy_pp;
             filter_p2s_t p2s;
+            filter_p2s_t p2s_aligned;
 
         }
         pu[NUM_PU_SIZES];
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 21 16:39:45 2017 +0530
@@ -2239,12 +2239,13 @@
         p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
         p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
         p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
-        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
+        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx512);
         p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
         p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
         p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
         p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
         p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
+
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
@@ -2253,7 +2254,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
-        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
@@ -2263,6 +2264,106 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512);
 
+        p.pu[LUMA_4x4].convert_p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+        p.pu[LUMA_4x8].convert_p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+        p.pu[LUMA_4x16].convert_p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+        p.pu[LUMA_8x8].convert_p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.pu[LUMA_8x4].convert_p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.pu[LUMA_8x16].convert_p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.pu[LUMA_8x32].convert_p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.pu[LUMA_12x16].convert_p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.pu[LUMA_16x4].convert_p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+        p.pu[LUMA_16x8].convert_p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.pu[LUMA_16x12].convert_p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.pu[LUMA_16x16].convert_p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.pu[LUMA_16x32].convert_p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.pu[LUMA_16x64].convert_p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.pu[LUMA_24x32].convert_p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.pu[LUMA_64x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+        p.pu[LUMA_64x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+        p.pu[LUMA_64x48].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+        p.pu[LUMA_64x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+        p.pu[LUMA_32x8].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.pu[LUMA_32x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.pu[LUMA_32x24].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.pu[LUMA_32x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.pu[LUMA_32x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+        p.pu[LUMA_48x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_48x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s_aligned = PFX(filterPixelToShort_4x2_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s_aligned = PFX(filterPixelToShort_8x2_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s_aligned = PFX(filterPixelToShort_8x6_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s_aligned = PFX(filterPixelToShort_2x4_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s_aligned = PFX(filterPixelToShort_6x8_sse4);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s_aligned = PFX(filterPixelToShort_2x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s_aligned = PFX(filterPixelToShort_4x32_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s_aligned = PFX(filterPixelToShort_6x16_sse4);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s_aligned = PFX(filterPixelToShort_8x12_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s_aligned = PFX(filterPixelToShort_8x64_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s_aligned = PFX(filterPixelToShort_12x32_ssse3);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s_aligned = PFX(filterPixelToShort_16x24_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s_aligned = PFX(filterPixelToShort_24x64_avx2);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s_aligned = PFX(filterPixelToShort_aligned_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
 
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
@@ -2329,7 +2430,7 @@
         p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
         p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
         p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
-        p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
+        //p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
         p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
         p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
         p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
@@ -2340,7 +2441,7 @@
         p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
         p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
         p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
-        p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
+        //p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
         p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
         p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
         p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
@@ -4097,6 +4198,7 @@
         p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
         p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
         p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
+
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/ipfilter16.asm	Thu Sep 21 16:39:45 2017 +0530
@@ -322,46 +322,9 @@
     FILTER_VER_LUMA_sse2 ps, 16, 64
 
 ;-----------------------------------------------------------------------------
-;p2s avx512 code start
+;p2s and p2s_aligned avx512 code start
 ;-----------------------------------------------------------------------------
-%macro P2S_64x8_AVX512 0
-    movu       m0, [r0]
-    movu       m1, [r0 + r1]
-    movu       m2, [r0 + r1 * 2]
-    movu       m3, [r0 + r5]
-    psllw      m0, (14 - BIT_DEPTH)
-    psllw      m1, (14 - BIT_DEPTH)
-    psllw      m2, (14 - BIT_DEPTH)
-    psllw      m3, (14 - BIT_DEPTH)
-    psubw      m0, m4
-    psubw      m1, m4
-    psubw      m2, m4
-    psubw      m3, m4
-    movu       [r2], m0
-    movu       [r2 + r3], m1
-    movu       [r2 + r3 * 2], m2
-    movu       [r2 + r4], m3
-
-    movu       m0, [r0 + mmsize]
-    movu       m1, [r0 + r1 + mmsize]
-    movu       m2, [r0 + r1 * 2 + mmsize]
-    movu       m3, [r0 + r5 + mmsize]
-    psllw      m0, (14 - BIT_DEPTH)
-    psllw      m1, (14 - BIT_DEPTH)
-    psllw      m2, (14 - BIT_DEPTH)
-    psllw      m3, (14 - BIT_DEPTH)
-    psubw      m0, m4
-    psubw      m1, m4
-    psubw      m2, m4
-    psubw      m3, m4
-    movu       [r2 + mmsize], m0
-    movu       [r2 + r3 + mmsize], m1
-    movu       [r2 + r3 * 2 + mmsize], m2
-    movu       [r2 + r4 + mmsize], m3
-
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-
+%macro P2S_64x4_AVX512 0
     movu       m0, [r0]
     movu       m1, [r0 + r1]
     movu       m2, [r0 + r1 * 2]
@@ -397,11 +360,11 @@
     movu       [r2 + r4 + mmsize], m3
 %endmacro
 
-%macro P2S_32x8_AVX512 0
-    movu       m0, [r0]
-    movu       m1, [r0 + r1]
-    movu       m2, [r0 + r1 * 2]
-    movu       m3, [r0 + r5]
+%macro P2S_ALIGNED_64x4_AVX512 0
+    mova       m0, [r0]
+    mova       m1, [r0 + r1]
+    mova       m2, [r0 + r1 * 2]
+    mova       m3, [r0 + r5]
     psllw      m0, (14 - BIT_DEPTH)
     psllw      m1, (14 - BIT_DEPTH)
     psllw      m2, (14 - BIT_DEPTH)
@@ -410,14 +373,30 @@
     psubw      m1, m4
     psubw      m2, m4
     psubw      m3, m4
-    movu       [r2], m0
-    movu       [r2 + r3], m1
-    movu       [r2 + r3 * 2], m2
-    movu       [r2 + r4], m3
-
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-
+    mova       [r2], m0
+    mova       [r2 + r3], m1
+    mova       [r2 + r3 * 2], m2
+    mova       [r2 + r4], m3
+
+    mova       m0, [r0 + mmsize]
+    mova       m1, [r0 + r1 + mmsize]
+    mova       m2, [r0 + r1 * 2 + mmsize]
+    mova       m3, [r0 + r5 + mmsize]
+    psllw      m0, (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    mova       [r2 + mmsize], m0
+    mova       [r2 + r3 + mmsize], m1
+    mova       [r2 + r3 * 2 + mmsize], m2
+    mova       [r2 + r4 + mmsize], m3
+%endmacro
+
+%macro P2S_32x4_AVX512 0
     movu       m0, [r0]
     movu       m1, [r0 + r1]
     movu       m2, [r0 + r1 * 2]
@@ -436,11 +415,11 @@
     movu       [r2 + r4], m3
 %endmacro
 
-%macro P2S_48x8_AVX512 0
-    movu       m0, [r0]
-    movu       m1, [r0 + r1]
-    movu       m2, [r0 + r1 * 2]
-    movu       m3, [r0 + r5]
+%macro P2S_ALIGNED_32x4_AVX512 0
+    mova       m0, [r0]
+    mova       m1, [r0 + r1]
+    mova       m2, [r0 + r1 * 2]
+    mova       m3, [r0 + r5]
     psllw      m0, (14 - BIT_DEPTH)
     psllw      m1, (14 - BIT_DEPTH)
     psllw      m2, (14 - BIT_DEPTH)
@@ -449,31 +428,13 @@
     psubw      m1, m4
     psubw      m2, m4
     psubw      m3, m4
-    movu       [r2], m0
-    movu       [r2 + r3], m1
-    movu       [r2 + r3 * 2], m2
-    movu       [r2 + r4], m3
-
-    movu       ym0, [r0 + mmsize]
-    movu       ym1, [r0 + r1 + mmsize]
-    movu       ym2, [r0 + r1 * 2 + mmsize]
-    movu       ym3, [r0 + r5 + mmsize]
-    psllw      ym0, (14 - BIT_DEPTH)
-    psllw      ym1, (14 - BIT_DEPTH)
-    psllw      ym2, (14 - BIT_DEPTH)
-    psllw      ym3, (14 - BIT_DEPTH)
-    psubw      ym0, ym4
-    psubw      ym1, ym4
-    psubw      ym2, ym4
-    psubw      ym3, ym4
-    movu       [r2 + mmsize], ym0
-    movu       [r2 + r3 + mmsize], ym1
-    movu       [r2 + r3 * 2 + mmsize], ym2
-    movu       [r2 + r4 + mmsize], ym3
-
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-
+    mova       [r2], m0
+    mova       [r2 + r3], m1
+    mova       [r2 + r3 * 2], m2
+    mova       [r2 + r4], m3
+%endmacro
+
+%macro P2S_48x4_AVX512 0
     movu       m0, [r0]
     movu       m1, [r0 + r1]
     movu       m2, [r0 + r1 * 2]
@@ -509,6 +470,42 @@
     movu       [r2 + r4 + mmsize], ym3
 %endmacro
 
+%macro P2S_ALIGNED_48x4_AVX512 0
+    mova       m0, [r0]
+    mova       m1, [r0 + r1]
+    mova       m2, [r0 + r1 * 2]
+    mova       m3, [r0 + r5]
+    psllw      m0, (14 - BIT_DEPTH)
+    psllw      m1, (14 - BIT_DEPTH)
+    psllw      m2, (14 - BIT_DEPTH)
+    psllw      m3, (14 - BIT_DEPTH)
+    psubw      m0, m4
+    psubw      m1, m4
+    psubw      m2, m4
+    psubw      m3, m4
+    mova       [r2], m0
+    mova       [r2 + r3], m1
+    mova       [r2 + r3 * 2], m2
+    mova       [r2 + r4], m3
+
+    mova       ym0, [r0 + mmsize]
+    mova       ym1, [r0 + r1 + mmsize]
+    mova       ym2, [r0 + r1 * 2 + mmsize]
+    mova       ym3, [r0 + r5 + mmsize]
+    psllw      ym0, (14 - BIT_DEPTH)
+    psllw      ym1, (14 - BIT_DEPTH)
+    psllw      ym2, (14 - BIT_DEPTH)
+    psllw      ym3, (14 - BIT_DEPTH)
+    psubw      ym0, ym4
+    psubw      ym1, ym4
+    psubw      ym2, ym4
+    psubw      ym3, ym4
+    mova       [r2 + mmsize], ym0
+    mova       [r2 + r3 + mmsize], ym1
+    mova       [r2 + r3 * 2 + mmsize], ym2
+    mova       [r2 + r4 + mmsize], ym3
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -521,12 +518,15 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_64x8_AVX512
+%rep 3
+    P2S_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
+%endrep
+    P2S_64x4_AVX512
     RET
 
+
 INIT_ZMM avx512
 cglobal filterPixelToShort_64x32, 4, 6, 5
     add        r1d, r1d
@@ -536,16 +536,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_64x8_AVX512
+%rep 7
+    P2S_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
+%endrep
+    P2S_64x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -557,22 +553,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_64x8_AVX512
+%rep 11
+    P2S_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
+%endrep
+    P2S_64x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -584,28 +570,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_64x8_AVX512
+%rep 15
+    P2S_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_64x8_AVX512
+%endrep
+    P2S_64x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -617,7 +587,10 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_32x8_AVX512
+    P2S_32x4_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+    P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -629,10 +602,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_32x8_AVX512
+%rep 3
+    P2S_32x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
+%endrep
+    P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -644,13 +619,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_32x8_AVX512
+%rep 5
+    P2S_32x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
+%endrep
+    P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -662,16 +636,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_32x8_AVX512
+%rep 7
+    P2S_32x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
+%endrep
+    P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -683,22 +653,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_32x8_AVX512
+%rep 11
+    P2S_32x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
+%endrep
+    P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -710,28 +670,12 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_32x8_AVX512
+%rep 15
+    P2S_32x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
-    lea        r0, [r0 + r1 * 4]
-    lea        r2, [r2 + r3 * 4]
-    P2S_32x8_AVX512
+%endrep
+    P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -743,31 +687,201 @@
 
     ; load constant
     vbroadcasti32x8    m4, [pw_2000]
-    P2S_48x8_AVX512
+%rep 15
+    P2S_48x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_48x8_AVX512
+%endrep
+    P2S_48x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x16, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 3
+    P2S_ALIGNED_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_48x8_AVX512
+%endrep
+    P2S_ALIGNED_64x4_AVX512
+    RET
+
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x32, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 7
+    P2S_ALIGNED_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_48x8_AVX512
+%endrep
+    P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x48, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 11
+    P2S_ALIGNED_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_48x8_AVX512
+%endrep
+    P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x64, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 15
+    P2S_ALIGNED_64x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_48x8_AVX512
+%endrep
+    P2S_ALIGNED_64x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x8, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+    P2S_ALIGNED_32x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_48x8_AVX512
+    P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x16, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 3
+    P2S_ALIGNED_32x4_AVX512
     lea        r0, [r0 + r1 * 4]
     lea        r2, [r2 + r3 * 4]
-    P2S_48x8_AVX512
+%endrep
+    P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x24, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 5
+    P2S_ALIGNED_32x4_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+%endrep
+    P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x32, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 7
+    P2S_ALIGNED_32x4_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+%endrep
+    P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x48, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 11
+    P2S_ALIGNED_32x4_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+%endrep
+    P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x64, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 15
+    P2S_ALIGNED_32x4_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+%endrep
+    P2S_ALIGNED_32x4_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_48x64, 4, 6, 5
+    add        r1d, r1d
+    add        r3d, r3d
+    lea        r4, [r3 * 3]
+    lea        r5, [r1 * 3]
+
+    ; load constant
+    vbroadcasti32x8    m4, [pw_2000]
+%rep 15
+    P2S_ALIGNED_48x4_AVX512
+    lea        r0, [r0 + r1 * 4]
+    lea        r2, [r2 + r3 * 4]
+%endrep
+    P2S_ALIGNED_48x4_AVX512
     RET
 ;-----------------------------------------------------------------------------------------------------------------------------
-;p2s avx512 code end
+;p2s and p2s_aligned avx512 code end
 ;-----------------------------------------------------------------------------------------------------------------------------
 
 %macro PROCESS_LUMA_VER_W4_4R 0
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu Sep 21 16:39:45 2017 +0530
@@ -1969,29 +1969,7 @@
     P2S_H_32xN_avx2 64
     P2S_H_32xN_avx2 48
 
-%macro PROCESS_P2S_32x8_AVX512 0
-    pmovzxbw    m0, [r0]
-    pmovzxbw    m1, [r0 + r1]
-    pmovzxbw    m2, [r0 + r1 * 2]
-    pmovzxbw    m3, [r0 + r5]
-
-    psllw       m0, 6
-    psllw       m1, 6
-    psllw       m2, 6
-    psllw       m3, 6
-    psubw       m0, m4
-    psubw       m1, m4
-    psubw       m2, m4
-    psubw       m3, m4
-
-    movu        [r2],           m0
-    movu        [r2 + r3],      m1
-    movu        [r2 + r3 * 2],  m2
-    movu        [r2 + r6],      m3
-
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-
+%macro PROCESS_P2S_32x4_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + r1]
     pmovzxbw    m2, [r0 + r1 * 2]
@@ -2025,7 +2003,10 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_32x8_AVX512
+    PROCESS_P2S_32x4_AVX512
+    lea         r0, [r0 + r1 * 4]
+    lea         r2, [r2 + r3 * 4]
+    PROCESS_P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2038,10 +2019,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_32x8_AVX512
+%rep 3
+    PROCESS_P2S_32x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
+%endrep
+    PROCESS_P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2054,13 +2037,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_32x8_AVX512
+%rep 5
+    PROCESS_P2S_32x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
+%endrep
+    PROCESS_P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2073,16 +2055,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_32x8_AVX512
+%rep 7
+    PROCESS_P2S_32x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
+%endrep
+    PROCESS_P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2095,22 +2073,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_32x8_AVX512
+%rep 11
+    PROCESS_P2S_32x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
+%endrep
+    PROCESS_P2S_32x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2123,28 +2091,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_32x8_AVX512
+%rep 15
+    PROCESS_P2S_32x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_32x8_AVX512
+%endrep
+    PROCESS_P2S_32x4_AVX512
     RET
 
 ;-----------------------------------------------------------------------------
@@ -2462,7 +2414,7 @@
     P2S_H_64xN_avx2 32
     P2S_H_64xN_avx2 48
 
-%macro PROCESS_P2S_64x8_AVX512 0
+%macro PROCESS_P2S_64x4_AVX512 0
     pmovzxbw    m0, [r0]
     pmovzxbw    m1, [r0 + mmsize/2]
     pmovzxbw    m2, [r0 + r1]
@@ -2498,45 +2450,6 @@
     movu        [r2 + r3 * 2 + mmsize], m1
     movu        [r2 + r6], m2
     movu        [r2 + r6 + mmsize], m3
-
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-
-    pmovzxbw    m0, [r0]
-    pmovzxbw    m1, [r0 + mmsize/2]
-    pmovzxbw    m2, [r0 + r1]
-    pmovzxbw    m3, [r0 + r1 + mmsize/2]
-
-    psllw       m0, 6
-    psllw       m1, 6
-    psllw       m2, 6
-    psllw       m3, 6
-    psubw       m0, m4
-    psubw       m1, m4
-    psubw       m2, m4
-    psubw       m3, m4
-    movu        [r2], m0
-    movu        [r2 + mmsize], m1
-    movu        [r2 + r3], m2
-    movu        [r2 + r3 + mmsize], m3
-
-    pmovzxbw    m0, [r0 + r1 * 2]
-    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
-    pmovzxbw    m2, [r0 + r5]
-    pmovzxbw    m3, [r0 + r5 + mmsize/2]
-
-    psllw       m0, 6
-    psllw       m1, 6
-    psllw       m2, 6
-    psllw       m3, 6
-    psubw       m0, m4
-    psubw       m1, m4
-    psubw       m2, m4
-    psubw       m3, m4
-    movu        [r2 + r3 * 2], m0
-    movu        [r2 + r3 * 2 + mmsize], m1
-    movu        [r2 + r6], m2
-    movu        [r2 + r6 + mmsize], m3
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -2552,28 +2465,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_64x8_AVX512
+%rep 15
+    PROCESS_P2S_64x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
+%endrep
+    PROCESS_P2S_64x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2586,22 +2483,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_64x8_AVX512
+%rep 11
+    PROCESS_P2S_64x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
+%endrep
+    PROCESS_P2S_64x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2614,16 +2501,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_64x8_AVX512
+%rep 7
+    PROCESS_P2S_64x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
-    lea         r0, [r0 + r1 * 4]
-    lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
+%endrep
+    PROCESS_P2S_64x4_AVX512
     RET
 
 INIT_ZMM avx512
@@ -2636,10 +2519,12 @@
     ; load constant
     vpbroadcastd      m4, [pw_2000]
 
-    PROCESS_P2S_64x8_AVX512
+%rep 3
+    PROCESS_P2S_64x4_AVX512
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
-    PROCESS_P2S_64x8_AVX512
+%endrep
+    PROCESS_P2S_64x4_AVX512
     RET
 
 ;-----------------------------------------------------------------------------
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/ipfilter8.h	Thu Sep 21 16:39:45 2017 +0530
@@ -33,6 +33,7 @@
     FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
     FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
     FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
     FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
     FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
     FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
diff -r e1348316cd44 -r ffd4c1528b37 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/test/ipfilterharness.cpp	Thu Sep 21 16:39:45 2017 +0530
@@ -489,6 +489,26 @@
     return true;
 }
 
+bool IPFilterHarness::check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+    for (int i = 0; i < TEST_CASES; i++)
+    {
+        int index = i % TEST_CASES;
+        intptr_t rand_srcStride[] = { 128, 192, 256, 512 };
+        intptr_t dstStride[] = { 192, 256, 512, 576 };
+        for (int p = 0; p < 4; p++)
+        {
+            ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
+            checked(opt, pixel_test_buff[index] + (64 * i), rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
+            if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
+                return false;
+        }
+        reportfail();
+    }
+
+    return true;
+}
+
 bool IPFilterHarness::check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt)
 {
     for (int i = 0; i < ITERS; i++)
@@ -510,6 +530,29 @@
     return true;
 }
 
+bool IPFilterHarness::check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+    for (int i = 0; i < TEST_CASES; i++)
+    {
+        int index = i % TEST_CASES;
+        intptr_t rand_srcStride[] = { 128, 192, 256, 512};
+        intptr_t dstStride[] = { 192, 256, 512, 576 };
+
+        for (int p = 0; p < 4; p++)
+        {
+            ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
+
+            checked(opt, pixel_test_buff[index], rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
+
+            if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
+                return false;
+        }
+        reportfail();
+    }
+
+    return true;
+}
+
 bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
 
@@ -579,6 +622,14 @@
                 return false;
             }
         }
+        if (opt.pu[value].convert_p2s_aligned)
+        {
+            if (!check_IPFilterLumaP2S_aligned_primitive(ref.pu[value].convert_p2s_aligned, opt.pu[value].convert_p2s_aligned))
+            {
+                printf("convert_p2s_aligned[%s]", lumaPartStr[value]);
+                return false;
+            }
+        }
     }
 
     for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
@@ -633,6 +684,14 @@
                     return false;
                 }
             }
+            if (opt.chroma[csp].pu[value].p2s_aligned)
+            {
+                if (!check_IPFilterChromaP2S_aligned_primitive(ref.chroma[csp].pu[value].p2s_aligned, opt.chroma[csp].pu[value].p2s_aligned))
+                {
+                    printf("chroma_p2s_aligned[%s]", chromaPartStr[csp][value]);
+                    return false;
+                }
+            }
             if (opt.chroma[csp].pu[value].p2s)
             {
                 if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s, opt.chroma[csp].pu[value].p2s))
@@ -649,8 +708,8 @@
 
 void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
-    int16_t srcStride = 96;
-    int16_t dstStride = 96;
+    int16_t srcStride = 192;  /* Multiple of 64 */
+    int16_t dstStride = 192;
     int maxVerticalfilterHalfDistance = 3;
 
     for (int value = 0; value < NUM_PU_SIZES; value++)
@@ -659,62 +718,70 @@
         {
             printf("luma_hpp[%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].luma_hpp, ref.pu[value].luma_hpp,
-                           pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
+                pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
         }
 
         if (opt.pu[value].luma_hps)
         {
             printf("luma_hps[%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].luma_hps, ref.pu[value].luma_hps,
-                           pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                           IPF_vec_output_s, dstStride, 1, 1);
+                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                IPF_vec_output_s, dstStride, 1, 1);
         }
 
         if (opt.pu[value].luma_vpp)
         {
             printf("luma_vpp[%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].luma_vpp, ref.pu[value].luma_vpp,
-                           pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                           IPF_vec_output_p, dstStride, 1);
+                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                IPF_vec_output_p, dstStride, 1);
         }
 
         if (opt.pu[value].luma_vps)
         {
             printf("luma_vps[%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].luma_vps, ref.pu[value].luma_vps,
-                           pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                           IPF_vec_output_s, dstStride, 1);
+                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                IPF_vec_output_s, dstStride, 1);
         }
 
         if (opt.pu[value].luma_vsp)
         {
             printf("luma_vsp[%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].luma_vsp, ref.pu[value].luma_vsp,
-                           short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                           IPF_vec_output_p, dstStride, 1);
+                short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                IPF_vec_output_p, dstStride, 1);
         }
 
         if (opt.pu[value].luma_vss)
         {
             printf("luma_vss[%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].luma_vss, ref.pu[value].luma_vss,
-                           short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                           IPF_vec_output_s, dstStride, 1);
+                short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                IPF_vec_output_s, dstStride, 1);
         }
 
         if (opt.pu[value].luma_hvpp)
         {
             printf("luma_hv [%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp,
-                           pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
+                pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
         }
 
         if (opt.pu[value].convert_p2s)
         {
             printf("convert_p2s[%s]\t", lumaPartStr[value]);
             REPORT_SPEEDUP(opt.pu[value].convert_p2s, ref.pu[value].convert_p2s,
-                               pixel_buff, srcStride,
-                               IPF_vec_output_s, dstStride);
+                pixel_buff, srcStride,
+                IPF_vec_output_s, dstStride);
+        }
+
+        if (opt.pu[value].convert_p2s_aligned)
+        {
+            printf("convert_p2s_aligned[%s]\t", lumaPartStr[value]);
+            REPORT_SPEEDUP(opt.pu[value].convert_p2s_aligned, ref.pu[value].convert_p2s_aligned,
+                pixel_buff, srcStride,
+                IPF_vec_output_s, dstStride);
         }
     }
 
@@ -727,47 +794,53 @@
             {
                 printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hpp, ref.chroma[csp].pu[value].filter_hpp,
-                               pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
+                    pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
             }
             if (opt.chroma[csp].pu[value].filter_hps)
             {
                 printf("chroma_hps[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hps, ref.chroma[csp].pu[value].filter_hps,
-                               pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
+                    pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
             }
             if (opt.chroma[csp].pu[value].filter_vpp)
             {
                 printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vpp, ref.chroma[csp].pu[value].filter_vpp,
-                               pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                               IPF_vec_output_p, dstStride, 1);
+                    pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                    IPF_vec_output_p, dstStride, 1);
             }
             if (opt.chroma[csp].pu[value].filter_vps)
             {
                 printf("chroma_vps[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vps, ref.chroma[csp].pu[value].filter_vps,
-                               pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                               IPF_vec_output_s, dstStride, 1);
+                    pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                    IPF_vec_output_s, dstStride, 1);
             }
             if (opt.chroma[csp].pu[value].filter_vsp)
             {
                 printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vsp, ref.chroma[csp].pu[value].filter_vsp,
-                               short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                               IPF_vec_output_p, dstStride, 1);
+                    short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                    IPF_vec_output_p, dstStride, 1);
             }
             if (opt.chroma[csp].pu[value].filter_vss)
             {
                 printf("chroma_vss[%s]", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vss, ref.chroma[csp].pu[value].filter_vss,
-                               short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                               IPF_vec_output_s, dstStride, 1);
+                    short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+                    IPF_vec_output_s, dstStride, 1);
             }
             if (opt.chroma[csp].pu[value].p2s)
             {
                 printf("chroma_p2s[%s]\t", chromaPartStr[csp][value]);
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s, ref.chroma[csp].pu[value].p2s,
-                               pixel_buff, srcStride, IPF_vec_output_s, dstStride);
+                    pixel_buff, srcStride, IPF_vec_output_s, dstStride);
+            }
+            if (opt.chroma[csp].pu[value].p2s_aligned)
+            {
+                printf("chroma_p2s_aligned[%s]\t", chromaPartStr[csp][value]);
+                REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s_aligned, ref.chroma[csp].pu[value].p2s_aligned,
+                    pixel_buff, srcStride, IPF_vec_output_s, dstStride);
             }
         }
     }
diff -r e1348316cd44 -r ffd4c1528b37 source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h	Fri Sep 01 11:50:03 2017 +0530
+++ b/source/test/ipfilterharness.h	Thu Sep 21 16:39:45 2017 +0530
@@ -40,15 +40,15 @@
     enum { TEST_CASES = 3 };
     enum { SMAX = 1 << 12 };
     enum { SMIN = (unsigned)-1 << 12 };
-    ALIGN_VAR_32(pixel, pixel_buff[TEST_BUF_SIZE]);
-    int16_t short_buff[TEST_BUF_SIZE];
-    int16_t IPF_vec_output_s[TEST_BUF_SIZE];
-    int16_t IPF_C_output_s[TEST_BUF_SIZE];
-    pixel   IPF_vec_output_p[TEST_BUF_SIZE];
-    pixel   IPF_C_output_p[TEST_BUF_SIZE];
+    ALIGN_VAR_64(pixel, pixel_buff[TEST_BUF_SIZE]);
+    ALIGN_VAR_64(int16_t, short_buff[TEST_BUF_SIZE]);
+    ALIGN_VAR_64(int16_t, IPF_vec_output_s[TEST_BUF_SIZE]);
+    ALIGN_VAR_64(int16_t, IPF_C_output_s[TEST_BUF_SIZE]);
+    ALIGN_VAR_64(pixel,   IPF_vec_output_p[TEST_BUF_SIZE]);
+    ALIGN_VAR_64(pixel,   IPF_C_output_p[TEST_BUF_SIZE]);
 
-    pixel   pixel_test_buff[TEST_CASES][TEST_BUF_SIZE];
-    int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
+    ALIGN_VAR_64(pixel,   pixel_test_buff[TEST_CASES][TEST_BUF_SIZE]);
+    ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][TEST_BUF_SIZE]);
 
     bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
     bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt);
@@ -62,7 +62,9 @@
     bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt);
     bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
     bool check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
+    bool check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
     bool check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
+    bool check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
 
 public:
 

