[x265] [PATCH 110 of 307] x86: Aligned routine implementation for p2s primitive
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:48 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1505992185 -19800
# Thu Sep 21 16:39:45 2017 +0530
# Node ID ffd4c1528b37332493c5fa4677e780dbef121a01
# Parent e1348316cd4431a5d39c8a9457d865f0f9d546cc
x86: Aligned routine implementation for p2s primitive
Makes the default alignment 64 bytes.
Links the high-bit-depth p2s code to the aligned code path when the AVX512 architecture is detected and the source and destination strides are multiples of 64.
Aligns all ipfilterharness buffers to 64 bytes.
Temporarily disables sad_x3_48x64 and sad_x4_48x64.
Cleans up the p2s code.
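For orientation, a minimal sketch (not part of the patch) of the dispatch rule the predict.cpp hunk below implements: the aligned kernel is chosen only when both strides are multiples of 64, both buffers start on a 64-byte boundary, and AVX512 is available. The helper name canUseAlignedP2S and the plain pointer-modulo test are illustrative assumptions; the patch itself derives the buffer offsets from the reference picture's CU/PU offset tables.

#include <cstdint>

// Illustrative only: mirrors the predicate guarding convert_p2s_aligned in
// predict.cpp. cpuFlags/avx512Flag stand in for refPic.m_param->cpuid and
// X265_CPU_AVX512.
static inline bool canUseAlignedP2S(const void* src, intptr_t srcStride,
                                    const void* dst, intptr_t dstStride,
                                    uint32_t cpuFlags, uint32_t avx512Flag)
{
    return (reinterpret_cast<uintptr_t>(src) % 64) == 0
        && (reinterpret_cast<uintptr_t>(dst) % 64) == 0
        && (srcStride % 64) == 0
        && (dstStride % 64) == 0
        && (cpuFlags & avx512Flag) != 0;
}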
diff -r e1348316cd44 -r ffd4c1528b37 source/common/common.cpp
--- a/source/common/common.cpp Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/common.cpp Thu Sep 21 16:39:45 2017 +0530
@@ -54,7 +54,7 @@
#endif
}
-#define X265_ALIGNBYTES 32
+#define X265_ALIGNBYTES 64
#if _WIN32
#if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)
diff -r e1348316cd44 -r ffd4c1528b37 source/common/common.h
--- a/source/common/common.h Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/common.h Thu Sep 21 16:39:45 2017 +0530
@@ -75,6 +75,7 @@
#define ALIGN_VAR_8(T, var) T var __attribute__((aligned(8)))
#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
+#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))
#if defined(__MINGW32__)
#define fseeko fseeko64
#define ftello ftello64
@@ -85,6 +86,7 @@
#define ALIGN_VAR_8(T, var) __declspec(align(8)) T var
#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
+#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var
#define fseeko _fseeki64
#define ftello _ftelli64
#endif // if defined(__GNUC__)
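As a stand-alone illustration (assumed usage, not part of the patch), the new macro declares 64-byte-aligned storage exactly like its 8/16/32-byte siblings:

#include <cstdint>

// ALIGN_VAR_64 reproduced from the hunks above so this sketch compiles alone.
#if defined(__GNUC__)
#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))
#else
#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var
#endif

// A 64-byte-aligned intermediate buffer, suitable for the aligned (mova)
// ZMM loads and stores used by the new p2s kernels.
ALIGN_VAR_64(int16_t, interBuf[256]);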
diff -r e1348316cd44 -r ffd4c1528b37 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/ipfilter.cpp Thu Sep 21 16:39:45 2017 +0530
@@ -379,7 +379,8 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;\
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s_aligned = filterPixelToShort_c<W, H>;
#define CHROMA_422(W, H) \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
@@ -388,7 +389,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;\
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s_aligned = filterPixelToShort_c<W, H>;
#define CHROMA_444(W, H) \
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
@@ -397,7 +399,8 @@
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
- p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;\
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s_aligned = filterPixelToShort_c<W, H>;
#define LUMA(W, H) \
p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \
@@ -407,7 +410,8 @@
p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \
p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \
p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; \
- p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;
+ p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;\
+ p.pu[LUMA_ ## W ## x ## H].convert_p2s_aligned = filterPixelToShort_c<W, H>;
void setupFilterPrimitives_c(EncoderPrimitives& p)
{
diff -r e1348316cd44 -r ffd4c1528b37 source/common/predict.cpp
--- a/source/common/predict.cpp Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/predict.cpp Thu Sep 21 16:39:45 2017 +0530
@@ -283,7 +283,18 @@
int yFrac = mv.y & 3;
if (!(yFrac | xFrac))
+ {
+#if HIGH_BIT_DEPTH
+ bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
+ bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
+ if (srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck && (refPic.m_param->cpuid & X265_CPU_AVX512))
+ primitives.pu[partEnum].convert_p2s_aligned(src, srcStride, dst, dstStride);
+ else
+ primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
+#else
primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
+#endif
+ }
else if (!yFrac)
primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
else if (!xFrac)
@@ -375,8 +386,23 @@
if (!(yFrac | xFrac))
{
+#if HIGH_BIT_DEPTH
+ bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
+ bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
+ if (refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC && (refPic.m_param->cpuid & X265_CPU_AVX512))
+ {
+ primitives.chroma[m_csp].pu[partEnum].p2s_aligned(refCb, refStride, dstCb, dstStride);
+ primitives.chroma[m_csp].pu[partEnum].p2s_aligned(refCr, refStride, dstCr, dstStride);
+ }
+ else
+ {
+ primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
+ primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
+ }
+#else
primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
+#endif
}
else if (!yFrac)
{
diff -r e1348316cd44 -r ffd4c1528b37 source/common/primitives.h
--- a/source/common/primitives.h Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/primitives.h Thu Sep 21 16:39:45 2017 +0530
@@ -248,6 +248,7 @@
copy_pp_t copy_pp;
filter_p2s_t convert_p2s;
+ filter_p2s_t convert_p2s_aligned;
}
pu[NUM_PU_SIZES];
@@ -387,6 +388,7 @@
addAvg_t addAvg;
copy_pp_t copy_pp;
filter_p2s_t p2s;
+ filter_p2s_t p2s_aligned;
}
pu[NUM_PU_SIZES];
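Both new entries reuse the existing filter_p2s_t signature (matching the FUNCDEF_CHROMA_PU declaration added to ipfilter8.h further down). A self-contained restatement, with pixel assumed 16-bit as in HIGH_BIT_DEPTH builds:

#include <cstdint>

typedef uint16_t pixel; // uint8_t in 8-bit builds; 16-bit shown for HIGH_BIT_DEPTH
typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride,
                             int16_t* dst, intptr_t dstStride);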
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Sep 21 16:39:45 2017 +0530
@@ -2239,12 +2239,13 @@
p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512);
p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512);
p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512);
- p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
+ p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx512);
p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512);
p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512);
p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
+
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
@@ -2253,7 +2254,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
- p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = PFX(filterPixelToShort_32x32_avx512);
@@ -2263,6 +2264,106 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512);
+ p.pu[LUMA_4x4].convert_p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+ p.pu[LUMA_4x8].convert_p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+ p.pu[LUMA_4x16].convert_p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+ p.pu[LUMA_8x8].convert_p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.pu[LUMA_8x4].convert_p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.pu[LUMA_8x16].convert_p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.pu[LUMA_8x32].convert_p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.pu[LUMA_12x16].convert_p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+ p.pu[LUMA_16x4].convert_p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+ p.pu[LUMA_16x8].convert_p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.pu[LUMA_16x12].convert_p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+ p.pu[LUMA_16x16].convert_p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.pu[LUMA_16x32].convert_p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.pu[LUMA_16x64].convert_p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+ p.pu[LUMA_24x32].convert_p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+ p.pu[LUMA_64x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+ p.pu[LUMA_64x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+ p.pu[LUMA_64x48].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+ p.pu[LUMA_64x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+ p.pu[LUMA_32x8].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+ p.pu[LUMA_32x16].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.pu[LUMA_32x24].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+ p.pu[LUMA_32x32].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+ p.pu[LUMA_32x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+ p.pu[LUMA_48x64].convert_p2s_aligned = PFX(filterPixelToShort_aligned_48x64_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s_aligned = PFX(filterPixelToShort_4x2_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s_aligned = PFX(filterPixelToShort_8x2_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s_aligned = PFX(filterPixelToShort_8x6_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s_aligned = PFX(filterPixelToShort_2x4_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s_aligned = PFX(filterPixelToShort_6x8_sse4);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s_aligned = PFX(filterPixelToShort_2x8_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s_aligned = PFX(filterPixelToShort_2x16_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s_aligned = PFX(filterPixelToShort_4x32_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s_aligned = PFX(filterPixelToShort_6x16_sse4);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s_aligned = PFX(filterPixelToShort_8x12_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s_aligned = PFX(filterPixelToShort_8x64_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s_aligned = PFX(filterPixelToShort_12x32_ssse3);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s_aligned = PFX(filterPixelToShort_16x24_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s_aligned = PFX(filterPixelToShort_24x64_avx2);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s_aligned = PFX(filterPixelToShort_aligned_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].p2s_aligned = PFX(filterPixelToShort_4x4_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].p2s_aligned = PFX(filterPixelToShort_4x8_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].p2s_aligned = PFX(filterPixelToShort_4x16_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].p2s_aligned = PFX(filterPixelToShort_8x8_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].p2s_aligned = PFX(filterPixelToShort_8x4_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].p2s_aligned = PFX(filterPixelToShort_8x16_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].p2s_aligned = PFX(filterPixelToShort_8x32_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].p2s_aligned = PFX(filterPixelToShort_12x16_ssse3);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].p2s_aligned = PFX(filterPixelToShort_16x4_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].p2s_aligned = PFX(filterPixelToShort_16x8_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].p2s_aligned = PFX(filterPixelToShort_16x12_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].p2s_aligned = PFX(filterPixelToShort_16x16_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].p2s_aligned = PFX(filterPixelToShort_16x32_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].p2s_aligned = PFX(filterPixelToShort_16x64_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].p2s_aligned = PFX(filterPixelToShort_24x32_avx2);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s_aligned = PFX(filterPixelToShort_aligned_32x8_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s_aligned = PFX(filterPixelToShort_aligned_32x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s_aligned = PFX(filterPixelToShort_aligned_32x24_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s_aligned = PFX(filterPixelToShort_aligned_32x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s_aligned = PFX(filterPixelToShort_aligned_32x64_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s_aligned = PFX(filterPixelToShort_aligned_64x16_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s_aligned = PFX(filterPixelToShort_aligned_64x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s_aligned = PFX(filterPixelToShort_aligned_64x48_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s_aligned = PFX(filterPixelToShort_aligned_64x64_avx512);
+
p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
@@ -2329,7 +2430,7 @@
p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
- p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
+ //p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
@@ -2340,7 +2441,7 @@
p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
- p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
+ //p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
@@ -4097,6 +4198,7 @@
p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512);
p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512);
p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512);
+
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512);
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/ipfilter16.asm Thu Sep 21 16:39:45 2017 +0530
@@ -322,46 +322,9 @@
FILTER_VER_LUMA_sse2 ps, 16, 64
;-----------------------------------------------------------------------------
-;p2s avx512 code start
+;p2s and p2s_aligned avx512 code start
;-----------------------------------------------------------------------------
-%macro P2S_64x8_AVX512 0
- movu m0, [r0]
- movu m1, [r0 + r1]
- movu m2, [r0 + r1 * 2]
- movu m3, [r0 + r5]
- psllw m0, (14 - BIT_DEPTH)
- psllw m1, (14 - BIT_DEPTH)
- psllw m2, (14 - BIT_DEPTH)
- psllw m3, (14 - BIT_DEPTH)
- psubw m0, m4
- psubw m1, m4
- psubw m2, m4
- psubw m3, m4
- movu [r2], m0
- movu [r2 + r3], m1
- movu [r2 + r3 * 2], m2
- movu [r2 + r4], m3
-
- movu m0, [r0 + mmsize]
- movu m1, [r0 + r1 + mmsize]
- movu m2, [r0 + r1 * 2 + mmsize]
- movu m3, [r0 + r5 + mmsize]
- psllw m0, (14 - BIT_DEPTH)
- psllw m1, (14 - BIT_DEPTH)
- psllw m2, (14 - BIT_DEPTH)
- psllw m3, (14 - BIT_DEPTH)
- psubw m0, m4
- psubw m1, m4
- psubw m2, m4
- psubw m3, m4
- movu [r2 + mmsize], m0
- movu [r2 + r3 + mmsize], m1
- movu [r2 + r3 * 2 + mmsize], m2
- movu [r2 + r4 + mmsize], m3
-
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
-
+%macro P2S_64x4_AVX512 0
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
@@ -397,11 +360,11 @@
movu [r2 + r4 + mmsize], m3
%endmacro
-%macro P2S_32x8_AVX512 0
- movu m0, [r0]
- movu m1, [r0 + r1]
- movu m2, [r0 + r1 * 2]
- movu m3, [r0 + r5]
+%macro P2S_ALIGNED_64x4_AVX512 0
+ mova m0, [r0]
+ mova m1, [r0 + r1]
+ mova m2, [r0 + r1 * 2]
+ mova m3, [r0 + r5]
psllw m0, (14 - BIT_DEPTH)
psllw m1, (14 - BIT_DEPTH)
psllw m2, (14 - BIT_DEPTH)
@@ -410,14 +373,30 @@
psubw m1, m4
psubw m2, m4
psubw m3, m4
- movu [r2], m0
- movu [r2 + r3], m1
- movu [r2 + r3 * 2], m2
- movu [r2 + r4], m3
-
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
-
+ mova [r2], m0
+ mova [r2 + r3], m1
+ mova [r2 + r3 * 2], m2
+ mova [r2 + r4], m3
+
+ mova m0, [r0 + mmsize]
+ mova m1, [r0 + r1 + mmsize]
+ mova m2, [r0 + r1 * 2 + mmsize]
+ mova m3, [r0 + r5 + mmsize]
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ mova [r2 + mmsize], m0
+ mova [r2 + r3 + mmsize], m1
+ mova [r2 + r3 * 2 + mmsize], m2
+ mova [r2 + r4 + mmsize], m3
+%endmacro
+
+%macro P2S_32x4_AVX512 0
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
@@ -436,11 +415,11 @@
movu [r2 + r4], m3
%endmacro
-%macro P2S_48x8_AVX512 0
- movu m0, [r0]
- movu m1, [r0 + r1]
- movu m2, [r0 + r1 * 2]
- movu m3, [r0 + r5]
+%macro P2S_ALIGNED_32x4_AVX512 0
+ mova m0, [r0]
+ mova m1, [r0 + r1]
+ mova m2, [r0 + r1 * 2]
+ mova m3, [r0 + r5]
psllw m0, (14 - BIT_DEPTH)
psllw m1, (14 - BIT_DEPTH)
psllw m2, (14 - BIT_DEPTH)
@@ -449,31 +428,13 @@
psubw m1, m4
psubw m2, m4
psubw m3, m4
- movu [r2], m0
- movu [r2 + r3], m1
- movu [r2 + r3 * 2], m2
- movu [r2 + r4], m3
-
- movu ym0, [r0 + mmsize]
- movu ym1, [r0 + r1 + mmsize]
- movu ym2, [r0 + r1 * 2 + mmsize]
- movu ym3, [r0 + r5 + mmsize]
- psllw ym0, (14 - BIT_DEPTH)
- psllw ym1, (14 - BIT_DEPTH)
- psllw ym2, (14 - BIT_DEPTH)
- psllw ym3, (14 - BIT_DEPTH)
- psubw ym0, ym4
- psubw ym1, ym4
- psubw ym2, ym4
- psubw ym3, ym4
- movu [r2 + mmsize], ym0
- movu [r2 + r3 + mmsize], ym1
- movu [r2 + r3 * 2 + mmsize], ym2
- movu [r2 + r4 + mmsize], ym3
-
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
-
+ mova [r2], m0
+ mova [r2 + r3], m1
+ mova [r2 + r3 * 2], m2
+ mova [r2 + r4], m3
+%endmacro
+
+%macro P2S_48x4_AVX512 0
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
@@ -509,6 +470,42 @@
movu [r2 + r4 + mmsize], ym3
%endmacro
+%macro P2S_ALIGNED_48x4_AVX512 0
+ mova m0, [r0]
+ mova m1, [r0 + r1]
+ mova m2, [r0 + r1 * 2]
+ mova m3, [r0 + r5]
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ mova [r2], m0
+ mova [r2 + r3], m1
+ mova [r2 + r3 * 2], m2
+ mova [r2 + r4], m3
+
+ mova ym0, [r0 + mmsize]
+ mova ym1, [r0 + r1 + mmsize]
+ mova ym2, [r0 + r1 * 2 + mmsize]
+ mova ym3, [r0 + r5 + mmsize]
+ psllw ym0, (14 - BIT_DEPTH)
+ psllw ym1, (14 - BIT_DEPTH)
+ psllw ym2, (14 - BIT_DEPTH)
+ psllw ym3, (14 - BIT_DEPTH)
+ psubw ym0, ym4
+ psubw ym1, ym4
+ psubw ym2, ym4
+ psubw ym3, ym4
+ mova [r2 + mmsize], ym0
+ mova [r2 + r3 + mmsize], ym1
+ mova [r2 + r3 * 2 + mmsize], ym2
+ mova [r2 + r4 + mmsize], ym3
+%endmacro
+
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
@@ -521,12 +518,15 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_64x8_AVX512
+%rep 3
+ P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
+%endrep
+ P2S_64x4_AVX512
RET
+
INIT_ZMM avx512
cglobal filterPixelToShort_64x32, 4, 6, 5
add r1d, r1d
@@ -536,16 +536,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_64x8_AVX512
+%rep 7
+ P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
+%endrep
+ P2S_64x4_AVX512
RET
INIT_ZMM avx512
@@ -557,22 +553,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_64x8_AVX512
+%rep 11
+ P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
+%endrep
+ P2S_64x4_AVX512
RET
INIT_ZMM avx512
@@ -584,28 +570,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_64x8_AVX512
+%rep 15
+ P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_64x8_AVX512
+%endrep
+ P2S_64x4_AVX512
RET
INIT_ZMM avx512
@@ -617,7 +587,10 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_32x8_AVX512
+ P2S_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -629,10 +602,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_32x8_AVX512
+%rep 3
+ P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
+%endrep
+ P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -644,13 +619,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_32x8_AVX512
+%rep 5
+ P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
+%endrep
+ P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -662,16 +636,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_32x8_AVX512
+%rep 7
+ P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
+%endrep
+ P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -683,22 +653,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_32x8_AVX512
+%rep 11
+ P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
+%endrep
+ P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -710,28 +670,12 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_32x8_AVX512
+%rep 15
+ P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- P2S_32x8_AVX512
+%endrep
+ P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -743,31 +687,201 @@
; load constant
vbroadcasti32x8 m4, [pw_2000]
- P2S_48x8_AVX512
+%rep 15
+ P2S_48x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_48x8_AVX512
+%endrep
+ P2S_48x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x16, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 3
+ P2S_ALIGNED_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_48x8_AVX512
+%endrep
+ P2S_ALIGNED_64x4_AVX512
+ RET
+
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x32, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 7
+ P2S_ALIGNED_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_48x8_AVX512
+%endrep
+ P2S_ALIGNED_64x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x48, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 11
+ P2S_ALIGNED_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_48x8_AVX512
+%endrep
+ P2S_ALIGNED_64x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_64x64, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 15
+ P2S_ALIGNED_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_48x8_AVX512
+%endrep
+ P2S_ALIGNED_64x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x8, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+ P2S_ALIGNED_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_48x8_AVX512
+ P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x16, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 3
+ P2S_ALIGNED_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- P2S_48x8_AVX512
+%endrep
+ P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x24, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 5
+ P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x32, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 7
+ P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x48, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 11
+ P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_32x64, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 15
+ P2S_ALIGNED_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ P2S_ALIGNED_32x4_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal filterPixelToShort_aligned_48x64, 4, 6, 5
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [r3 * 3]
+ lea r5, [r1 * 3]
+
+ ; load constant
+ vbroadcasti32x8 m4, [pw_2000]
+%rep 15
+ P2S_ALIGNED_48x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+%endrep
+ P2S_ALIGNED_48x4_AVX512
RET
;-----------------------------------------------------------------------------------------------------------------------------
-;p2s avx512 code end
+;p2s and p2s_aligned avx512 code end
;-----------------------------------------------------------------------------------------------------------------------------
%macro PROCESS_LUMA_VER_W4_4R 0
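The only functional difference between the P2S_* and P2S_ALIGNED_* macros above is the move instruction: movu tolerates any address, while mova assumes (and faults without) 64-byte alignment on ZMM operands. An illustrative C++ intrinsic analog of that split, assuming an AVX512F target; the patch itself uses hand-written assembly:

#include <immintrin.h>
#include <cstdint>

// _mm512_loadu_si512 accepts any address; _mm512_load_si512 requires a
// 64-byte-aligned one, the same contract as movu vs. mova above.
static inline __m512i loadRow(const int16_t* p, bool aligned)
{
    return aligned ? _mm512_load_si512(reinterpret_cast<const void*>(p))
                   : _mm512_loadu_si512(reinterpret_cast<const void*>(p));
}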
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Sep 21 16:39:45 2017 +0530
@@ -1969,29 +1969,7 @@
P2S_H_32xN_avx2 64
P2S_H_32xN_avx2 48
-%macro PROCESS_P2S_32x8_AVX512 0
- pmovzxbw m0, [r0]
- pmovzxbw m1, [r0 + r1]
- pmovzxbw m2, [r0 + r1 * 2]
- pmovzxbw m3, [r0 + r5]
-
- psllw m0, 6
- psllw m1, 6
- psllw m2, 6
- psllw m3, 6
- psubw m0, m4
- psubw m1, m4
- psubw m2, m4
- psubw m3, m4
-
- movu [r2], m0
- movu [r2 + r3], m1
- movu [r2 + r3 * 2], m2
- movu [r2 + r6], m3
-
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
-
+%macro PROCESS_P2S_32x4_AVX512 0
pmovzxbw m0, [r0]
pmovzxbw m1, [r0 + r1]
pmovzxbw m2, [r0 + r1 * 2]
@@ -2025,7 +2003,10 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_32x8_AVX512
+ PROCESS_P2S_32x4_AVX512
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ PROCESS_P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -2038,10 +2019,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_32x8_AVX512
+%rep 3
+ PROCESS_P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
+%endrep
+ PROCESS_P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -2054,13 +2037,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_32x8_AVX512
+%rep 5
+ PROCESS_P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
+%endrep
+ PROCESS_P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -2073,16 +2055,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_32x8_AVX512
+%rep 7
+ PROCESS_P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
+%endrep
+ PROCESS_P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -2095,22 +2073,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_32x8_AVX512
+%rep 11
+ PROCESS_P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
+%endrep
+ PROCESS_P2S_32x4_AVX512
RET
INIT_ZMM avx512
@@ -2123,28 +2091,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_32x8_AVX512
+%rep 15
+ PROCESS_P2S_32x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_32x8_AVX512
+%endrep
+ PROCESS_P2S_32x4_AVX512
RET
;-----------------------------------------------------------------------------
@@ -2462,7 +2414,7 @@
P2S_H_64xN_avx2 32
P2S_H_64xN_avx2 48
-%macro PROCESS_P2S_64x8_AVX512 0
+%macro PROCESS_P2S_64x4_AVX512 0
pmovzxbw m0, [r0]
pmovzxbw m1, [r0 + mmsize/2]
pmovzxbw m2, [r0 + r1]
@@ -2498,45 +2450,6 @@
movu [r2 + r3 * 2 + mmsize], m1
movu [r2 + r6], m2
movu [r2 + r6 + mmsize], m3
-
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
-
- pmovzxbw m0, [r0]
- pmovzxbw m1, [r0 + mmsize/2]
- pmovzxbw m2, [r0 + r1]
- pmovzxbw m3, [r0 + r1 + mmsize/2]
-
- psllw m0, 6
- psllw m1, 6
- psllw m2, 6
- psllw m3, 6
- psubw m0, m4
- psubw m1, m4
- psubw m2, m4
- psubw m3, m4
- movu [r2], m0
- movu [r2 + mmsize], m1
- movu [r2 + r3], m2
- movu [r2 + r3 + mmsize], m3
-
- pmovzxbw m0, [r0 + r1 * 2]
- pmovzxbw m1, [r0 + r1 * 2 + mmsize/2]
- pmovzxbw m2, [r0 + r5]
- pmovzxbw m3, [r0 + r5 + mmsize/2]
-
- psllw m0, 6
- psllw m1, 6
- psllw m2, 6
- psllw m3, 6
- psubw m0, m4
- psubw m1, m4
- psubw m2, m4
- psubw m3, m4
- movu [r2 + r3 * 2], m0
- movu [r2 + r3 * 2 + mmsize], m1
- movu [r2 + r6], m2
- movu [r2 + r6 + mmsize], m3
%endmacro
;-----------------------------------------------------------------------------
@@ -2552,28 +2465,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_64x8_AVX512
+%rep 15
+ PROCESS_P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
+%endrep
+ PROCESS_P2S_64x4_AVX512
RET
INIT_ZMM avx512
@@ -2586,22 +2483,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_64x8_AVX512
+%rep 11
+ PROCESS_P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
+%endrep
+ PROCESS_P2S_64x4_AVX512
RET
INIT_ZMM avx512
@@ -2614,16 +2501,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_64x8_AVX512
+%rep 7
+ PROCESS_P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
+%endrep
+ PROCESS_P2S_64x4_AVX512
RET
INIT_ZMM avx512
@@ -2636,10 +2519,12 @@
; load constant
vpbroadcastd m4, [pw_2000]
- PROCESS_P2S_64x8_AVX512
+%rep 3
+ PROCESS_P2S_64x4_AVX512
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- PROCESS_P2S_64x8_AVX512
+%endrep
+ PROCESS_P2S_64x4_AVX512
RET
;-----------------------------------------------------------------------------
diff -r e1348316cd44 -r ffd4c1528b37 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Fri Sep 01 11:50:03 2017 +0530
+++ b/source/common/x86/ipfilter8.h Thu Sep 21 16:39:45 2017 +0530
@@ -33,6 +33,7 @@
FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
+ FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
diff -r e1348316cd44 -r ffd4c1528b37 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Fri Sep 01 11:50:03 2017 +0530
+++ b/source/test/ipfilterharness.cpp Thu Sep 21 16:39:45 2017 +0530
@@ -489,6 +489,26 @@
return true;
}
+bool IPFilterHarness::check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+ for (int i = 0; i < TEST_CASES; i++)
+ {
+ int index = i % TEST_CASES;
+ intptr_t rand_srcStride[] = { 128, 192, 256, 512 };
+ intptr_t dstStride[] = { 192, 256, 512, 576 };
+ for (int p = 0; p < 4; p++)
+ {
+ ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
+ checked(opt, pixel_test_buff[index], rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
+ return false;
+ }
+ reportfail();
+ }
+
+ return true;
+}
+
bool IPFilterHarness::check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt)
{
for (int i = 0; i < ITERS; i++)
@@ -510,6 +530,29 @@
return true;
}
+bool IPFilterHarness::check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+ for (int i = 0; i < TEST_CASES; i++)
+ {
+ int index = i % TEST_CASES;
+ intptr_t rand_srcStride[] = { 128, 192, 256, 512};
+ intptr_t dstStride[] = { 192, 256, 512, 576 };
+
+ for (int p = 0; p < 4; p++)
+ {
+ ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
+
+ checked(opt, pixel_test_buff[index], rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
+
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
+ return false;
+ }
+ reportfail();
+ }
+
+ return true;
+}
+
bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
@@ -579,6 +622,14 @@
return false;
}
}
+ if (opt.pu[value].convert_p2s_aligned)
+ {
+ if (!check_IPFilterLumaP2S_aligned_primitive(ref.pu[value].convert_p2s_aligned, opt.pu[value].convert_p2s_aligned))
+ {
+ printf("convert_p2s_aligned[%s]", lumaPartStr[value]);
+ return false;
+ }
+ }
}
for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
@@ -633,6 +684,14 @@
return false;
}
}
+ if (opt.chroma[csp].pu[value].p2s_aligned)
+ {
+ if (!check_IPFilterChromaP2S_aligned_primitive(ref.chroma[csp].pu[value].p2s_aligned, opt.chroma[csp].pu[value].p2s_aligned))
+ {
+ printf("chroma_p2s_aligned[%s]", chromaPartStr[csp][value]);
+ return false;
+ }
+ }
if (opt.chroma[csp].pu[value].p2s)
{
if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s, opt.chroma[csp].pu[value].p2s))
@@ -649,8 +708,8 @@
void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
- int16_t srcStride = 96;
- int16_t dstStride = 96;
+ int16_t srcStride = 192; /* Multiple of 64 */
+ int16_t dstStride = 192;
int maxVerticalfilterHalfDistance = 3;
for (int value = 0; value < NUM_PU_SIZES; value++)
@@ -659,62 +718,70 @@
{
printf("luma_hpp[%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].luma_hpp, ref.pu[value].luma_hpp,
- pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
+ pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
}
if (opt.pu[value].luma_hps)
{
printf("luma_hps[%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].luma_hps, ref.pu[value].luma_hps,
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_s, dstStride, 1, 1);
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_s, dstStride, 1, 1);
}
if (opt.pu[value].luma_vpp)
{
printf("luma_vpp[%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].luma_vpp, ref.pu[value].luma_vpp,
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_p, dstStride, 1);
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_p, dstStride, 1);
}
if (opt.pu[value].luma_vps)
{
printf("luma_vps[%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].luma_vps, ref.pu[value].luma_vps,
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_s, dstStride, 1);
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_s, dstStride, 1);
}
if (opt.pu[value].luma_vsp)
{
printf("luma_vsp[%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].luma_vsp, ref.pu[value].luma_vsp,
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_p, dstStride, 1);
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_p, dstStride, 1);
}
if (opt.pu[value].luma_vss)
{
printf("luma_vss[%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].luma_vss, ref.pu[value].luma_vss,
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_s, dstStride, 1);
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_s, dstStride, 1);
}
if (opt.pu[value].luma_hvpp)
{
printf("luma_hv [%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp,
- pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
+ pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
}
if (opt.pu[value].convert_p2s)
{
printf("convert_p2s[%s]\t", lumaPartStr[value]);
REPORT_SPEEDUP(opt.pu[value].convert_p2s, ref.pu[value].convert_p2s,
- pixel_buff, srcStride,
- IPF_vec_output_s, dstStride);
+ pixel_buff, srcStride,
+ IPF_vec_output_s, dstStride);
+ }
+
+ if (opt.pu[value].convert_p2s_aligned)
+ {
+ printf("convert_p2s_aligned[%s]\t", lumaPartStr[value]);
+ REPORT_SPEEDUP(opt.pu[value].convert_p2s_aligned, ref.pu[value].convert_p2s_aligned,
+ pixel_buff, srcStride,
+ IPF_vec_output_s, dstStride);
}
}
@@ -727,47 +794,53 @@
{
printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hpp, ref.chroma[csp].pu[value].filter_hpp,
- pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
+ pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
}
if (opt.chroma[csp].pu[value].filter_hps)
{
printf("chroma_hps[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hps, ref.chroma[csp].pu[value].filter_hps,
- pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
+ pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
}
if (opt.chroma[csp].pu[value].filter_vpp)
{
printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vpp, ref.chroma[csp].pu[value].filter_vpp,
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_p, dstStride, 1);
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_p, dstStride, 1);
}
if (opt.chroma[csp].pu[value].filter_vps)
{
printf("chroma_vps[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vps, ref.chroma[csp].pu[value].filter_vps,
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_s, dstStride, 1);
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_s, dstStride, 1);
}
if (opt.chroma[csp].pu[value].filter_vsp)
{
printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vsp, ref.chroma[csp].pu[value].filter_vsp,
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_p, dstStride, 1);
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_p, dstStride, 1);
}
if (opt.chroma[csp].pu[value].filter_vss)
{
printf("chroma_vss[%s]", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vss, ref.chroma[csp].pu[value].filter_vss,
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_s, dstStride, 1);
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
+ IPF_vec_output_s, dstStride, 1);
}
if (opt.chroma[csp].pu[value].p2s)
{
printf("chroma_p2s[%s]\t", chromaPartStr[csp][value]);
REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s, ref.chroma[csp].pu[value].p2s,
- pixel_buff, srcStride, IPF_vec_output_s, dstStride);
+ pixel_buff, srcStride, IPF_vec_output_s, dstStride);
+ }
+ if (opt.chroma[csp].pu[value].p2s_aligned)
+ {
+ printf("chroma_p2s_aligned[%s]\t", chromaPartStr[csp][value]);
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s_aligned, ref.chroma[csp].pu[value].p2s_aligned,
+ pixel_buff, srcStride, IPF_vec_output_s, dstStride);
}
}
}
diff -r e1348316cd44 -r ffd4c1528b37 source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h Fri Sep 01 11:50:03 2017 +0530
+++ b/source/test/ipfilterharness.h Thu Sep 21 16:39:45 2017 +0530
@@ -40,15 +40,15 @@
enum { TEST_CASES = 3 };
enum { SMAX = 1 << 12 };
enum { SMIN = (unsigned)-1 << 12 };
- ALIGN_VAR_32(pixel, pixel_buff[TEST_BUF_SIZE]);
- int16_t short_buff[TEST_BUF_SIZE];
- int16_t IPF_vec_output_s[TEST_BUF_SIZE];
- int16_t IPF_C_output_s[TEST_BUF_SIZE];
- pixel IPF_vec_output_p[TEST_BUF_SIZE];
- pixel IPF_C_output_p[TEST_BUF_SIZE];
+ ALIGN_VAR_64(pixel, pixel_buff[TEST_BUF_SIZE]);
+ ALIGN_VAR_64(int16_t, short_buff[TEST_BUF_SIZE]);
+ ALIGN_VAR_64(int16_t, IPF_vec_output_s[TEST_BUF_SIZE]);
+ ALIGN_VAR_64(int16_t, IPF_C_output_s[TEST_BUF_SIZE]);
+ ALIGN_VAR_64(pixel, IPF_vec_output_p[TEST_BUF_SIZE]);
+ ALIGN_VAR_64(pixel, IPF_C_output_p[TEST_BUF_SIZE]);
- pixel pixel_test_buff[TEST_CASES][TEST_BUF_SIZE];
- int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
+ ALIGN_VAR_64(pixel, pixel_test_buff[TEST_CASES][TEST_BUF_SIZE]);
+ ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][TEST_BUF_SIZE]);
bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt);
@@ -62,7 +62,9 @@
bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt);
bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
bool check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
+ bool check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
bool check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
+ bool check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
public: