[x265] [PATCH 133 of 307] x86: Fix build errors in 32 bit build
mythreyi at multicorewareinc.com
Sat Apr 7 04:32:11 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar <vignesh at multicorewareinc.com>
# Date 1522965386 25200
# Thu Apr 05 14:56:26 2018 -0700
# Node ID 8173d05abf8dc96f3be6c97016cfb98f85e84a20
# Parent dc2d7a2515fdc434744f97a9dd34edcd670bbffa
x86: Fix build errors in 32 bit build
1. Fixes the 32-bit build CMake error on Linux
2. Enables assembly for 32-bit high bit depth builds
Signed-off-by: Mythreyi P
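
For context before the diff: rather than failing the 32-bit HIGH_BIT_DEPTH build with a hard #error, the patch registers 64-bit-only kernels behind #if X86_64 in asm-primitives.cpp and adds matching %if ARCH_X86_64 guards in the .asm files, so a 32-bit high bit depth build can keep ENABLE_ASSEMBLY=ON and simply fall back to the C primitives where no 32-bit kernel exists. The snippet below is a minimal, self-contained sketch of that guard pattern; the struct and function names are simplified stand-ins for illustration, not x265's real primitives table.

#include <cstdio>

struct Primitives
{
    int (*ssd_s)(const short* buf, int count);
};

// Portable C fallback, valid on any architecture.
static int ssd_s_c(const short* buf, int count)
{
    int sum = 0;
    for (int i = 0; i < count; i++)
        sum += buf[i] * buf[i];
    return sum;
}

#if X86_64
// Stand-in for a 64-bit-only assembly kernel (the real code registers
// PFX(pixel_ssd_s_*_sse2) here); only referenced when targeting x86_64.
static int ssd_s_sse2_x64(const short* buf, int count)
{
    return ssd_s_c(buf, count);
}
#endif

static void setupPrimitives(Primitives& p)
{
    p.ssd_s = ssd_s_c;            // default: C primitive
#if X86_64
    p.ssd_s = ssd_s_sse2_x64;     // 64-bit builds override with the asm kernel
#endif
}

int main()
{
    short block[16] = { 1, 2, 3, 4 };
    Primitives p;
    setupPrimitives(p);
    printf("ssd = %d\n", p.ssd_s(block, 16));
    return 0;
}
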
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 05 14:56:26 2018 -0700
@@ -886,10 +886,6 @@
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
{
-#if !defined(X86_64)
-#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
-#endif
-
#if X86_64
p.scanPosLast = PFX(scanPosLast_x64);
#endif
@@ -950,16 +946,42 @@
CHROMA_422_VERT_FILTERS(_sse2);
CHROMA_444_VERT_FILTERS(sse2);
+#if X86_64
ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse2);
ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse2);
ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
+#endif
p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
- PIXEL_AVG(sse2);
+
+ p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_sse2);
+ p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_sse2);
+ p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_sse2);
+ p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_sse2);
+ p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_sse2);
+ p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_sse2);
+ p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_sse2);
+ p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_sse2);
+ p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_sse2);
+ p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_sse2);
+ p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_sse2);
+ p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_sse2);
+ p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_sse2);
+ p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_sse2);
+ p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_sse2);
+ p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_sse2);
+ p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_sse2);
+ p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_sse2);
+#if X86_64
+ p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_sse2);
+ p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_sse2);
+ p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_sse2);
+ p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_sse2);
+#endif
PIXEL_AVG_W4(mmx2);
LUMA_VAR(sse2);
@@ -970,7 +992,12 @@
ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
- ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+#if X86_64
+ p.cu[BLOCK_4x4].ssd_s = PFX(pixel_ssd_s_4_sse2);
+ p.cu[BLOCK_8x8].ssd_s = PFX(pixel_ssd_s_8_sse2);
+ p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_sse2);
+ p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_sse2);
+#endif
ALL_LUMA_TU_S(calcresidual[ALIGNED], getResidual, sse2);
ALL_LUMA_TU_S(calcresidual[NONALIGNED], getResidual, sse2);
ALL_LUMA_TU_S(transpose, transpose, sse2);
@@ -978,9 +1005,10 @@
p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
+#if X86_64
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
-
+#endif
p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
@@ -1005,7 +1033,9 @@
p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
+#if X86_64
p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
+#endif
p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
@@ -1014,19 +1044,24 @@
p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
+#if X86_64 && X265_DEPTH <= 10
+ p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
-#if X265_DEPTH <= 10
- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
- ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
+
+ p.cu[BLOCK_8x8].sse_ss = PFX(pixel_ssd_ss_8x8_sse2);
+ p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_sse2);
+ p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_sse2);
+ p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_sse2);
#endif
p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
+#if X86_64
p.cu[BLOCK_8x8].idct = PFX(idct8_sse2);
-
+#endif
p.idst4x4 = PFX(idst4_sse2);
p.dst4x4 = PFX(dst4_sse2);
@@ -1050,12 +1085,14 @@
}
if (cpuMask & X265_CPU_SSE3)
{
+#if X86_64
ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+#endif
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -1126,12 +1163,13 @@
}
if (cpuMask & X265_CPU_SSE4)
{
+#if X86_64
p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
-
p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
+#endif
p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4);
@@ -1146,6 +1184,68 @@
CHROMA_422_ADDAVG(sse4);
LUMA_FILTERS(sse4);
+
+#if X86_64
+ p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse4);
+ p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_sse4);
+ p.pu[LUMA_4x16].luma_hpp = PFX(interp_8tap_horiz_pp_4x16_sse4);
+ p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse4);
+ p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_sse4);
+ p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_sse4);
+#endif
+
+ p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_sse4);
+ p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_sse4);
+ p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_sse4);
+ p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_sse4);
+ p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_sse4);
+
+ p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_sse4);
+ p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_sse4);
+ p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_sse4);
+ p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_sse4);
+ p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_sse4);
+ p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_sse4);
+ p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_sse4);
+ p.pu[LUMA_12x16].luma_hpp = PFX(interp_8tap_horiz_pp_12x16_sse4);
+ p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_sse4);
+
+ p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_sse4);
+ p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_sse4);
+ p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_sse4);
+ p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_sse4);
+ p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_sse4);
+ p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_sse4);
+ p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_sse4);
+ p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_sse4);
+
+ p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_sse4);
+ p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_sse4);
+ p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_sse4);
+ p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_sse4);
+ p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_sse4);
+ p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_sse4);
+ p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_sse4);
+ p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_sse4);
+ p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_sse4);
+ p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_sse4);
+ p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_sse4);
+ p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_sse4);
+ p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_sse4);
+ p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_sse4);
+ p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_sse4);
+ p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_sse4);
+ p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_sse4);
+ p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_sse4);
+ p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_sse4);
+ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_sse4);
+ p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_sse4);
+ p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_sse4);
+
+ ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse4); p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_sse4);
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse4); p.pu[LUMA_4x4].luma_vps = PFX(interp_8tap_vert_ps_4x4_sse4);
+ ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, sse4); p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_sse4);
+ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
CHROMA_420_HORIZ_FILTERS(sse4);
CHROMA_420_VERT_FILTERS_SSE4(_sse4);
CHROMA_422_HORIZ_FILTERS(_sse4);
@@ -1185,7 +1285,7 @@
// TODO: check POPCNT flag!
ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
-#if X265_DEPTH <= 10
+#if X86_64 && X265_DEPTH <= 10
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
#endif
@@ -1203,6 +1303,7 @@
p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
#endif
}
+#if X86_64
if (cpuMask & X265_CPU_AVX)
{
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); fails tests
@@ -2535,6 +2636,7 @@
p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
}
+#endif
}
#else // if HIGH_BIT_DEPTH
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/h-ipfilter16.asm
--- a/source/common/x86/h-ipfilter16.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/h-ipfilter16.asm Thu Apr 05 14:56:26 2018 -0700
@@ -285,7 +285,8 @@
;------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
;------------------------------------------------------------------------------------------------------------
- FILTER_HOR_LUMA_sse2 4, 4, pp
+%if ARCH_X86_64
+ FILTER_HOR_LUMA_sse2 4, 4, pp
FILTER_HOR_LUMA_sse2 4, 8, pp
FILTER_HOR_LUMA_sse2 4, 16, pp
FILTER_HOR_LUMA_sse2 8, 4, pp
@@ -339,6 +340,7 @@
FILTER_HOR_LUMA_sse2 64, 32, ps
FILTER_HOR_LUMA_sse2 64, 48, ps
FILTER_HOR_LUMA_sse2 64, 64, ps
+%endif
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/h4-ipfilter16.asm
--- a/source/common/x86/h4-ipfilter16.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/h4-ipfilter16.asm Thu Apr 05 14:56:26 2018 -0700
@@ -377,6 +377,7 @@
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
FILTER_HOR_CHROMA_sse3 2, 4, pp
FILTER_HOR_CHROMA_sse3 2, 8, pp
FILTER_HOR_CHROMA_sse3 2, 16, pp
@@ -462,6 +463,7 @@
FILTER_HOR_CHROMA_sse3 64, 32, ps
FILTER_HOR_CHROMA_sse3 64, 48, ps
FILTER_HOR_CHROMA_sse3 64, 64, ps
+%endif
%macro FILTER_W2_2 1
movu m3, [r0]
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/intrapred16.asm Thu Apr 05 14:56:26 2018 -0700
@@ -196,6 +196,7 @@
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
;-----------------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_XMM sse2
cglobal intra_pred_dc8, 5, 8, 2
movu m0, [r2 + 34]
@@ -275,10 +276,13 @@
mov [r0 + r7], r3w
.end:
RET
+%endif
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;-------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+;This code is meant for 64 bit architecture
INIT_XMM sse2
cglobal intra_pred_dc16, 5, 10, 4
lea r3, [r2 + 66]
@@ -410,6 +414,7 @@
mov [r9 + r1 * 8], r3w
.end:
RET
+%endif
;-------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
@@ -474,6 +479,7 @@
;-------------------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;-------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_YMM avx2
cglobal intra_pred_dc16, 3, 9, 4
mov r3d, r4m
@@ -682,6 +688,7 @@
movu [r0 + r2 * 1 + 0], m0
movu [r0 + r2 * 1 + mmsize], m0
RET
+%endif
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
@@ -1104,6 +1111,7 @@
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_XMM sse2
cglobal intra_pred_planar32, 3,3,16
movd m3, [r2 + 66] ; topRight = above[32]
@@ -1209,7 +1217,7 @@
%endrep
RET
%endif
-
+%endif
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
@@ -2063,6 +2071,7 @@
STORE_4x4
RET
+%if ARCH_X86_64
cglobal intra_pred_ang4_26, 3,3,3
movh m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
add r1d, r1d
@@ -2098,6 +2107,7 @@
mov [r0 + r3], r2w
.quit:
RET
+%endif
cglobal intra_pred_ang4_27, 3,3,5
movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
@@ -11122,6 +11132,7 @@
.end%11:
%endmacro
+%if ARCH_X86_64
;; angle 16, modes 3 and 33
cglobal ang16_mode_3_33
test r6d, r6d
@@ -18220,6 +18231,7 @@
mov rsp, [rsp+4*mmsize]
RET
+%endif
;-------------------------------------------------------------------------------------------------------
; end of avx2 code for intra_pred_ang32 mode 2 to 34
;-------------------------------------------------------------------------------------------------------
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/ipfilter16.asm Thu Apr 05 14:56:26 2018 -0700
@@ -266,6 +266,7 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
FILTER_VER_LUMA_sse2 pp, 4, 4
FILTER_VER_LUMA_sse2 pp, 8, 8
FILTER_VER_LUMA_sse2 pp, 8, 4
@@ -320,6 +321,7 @@
FILTER_VER_LUMA_sse2 ps, 48, 64
FILTER_VER_LUMA_sse2 ps, 64, 16
FILTER_VER_LUMA_sse2 ps, 16, 64
+%endif
;-----------------------------------------------------------------------------
;p2s and p2s_aligned avx512 code start
@@ -5620,6 +5622,7 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
;-------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_4tap_horiz_pp_8x4, 5,8,11
add r1d, r1d
@@ -5645,9 +5648,10 @@
PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
RET
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_8xN 1
INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_8xN 1
cglobal interp_4tap_horiz_pp_8x%1, 5,8,11
add r1d, r1d
add r3d, r3d
@@ -5679,14 +5683,16 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_AVX512_8xN 8
IPFILTER_CHROMA_AVX512_8xN 12
IPFILTER_CHROMA_AVX512_8xN 16
IPFILTER_CHROMA_AVX512_8xN 32
IPFILTER_CHROMA_AVX512_8xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_16xN 1
INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_16xN 1
cglobal interp_4tap_horiz_pp_16x%1, 5,6,11
add r1d, r1d
add r3d, r3d
@@ -5716,6 +5722,7 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_AVX512_16xN 4
IPFILTER_CHROMA_AVX512_16xN 8
IPFILTER_CHROMA_AVX512_16xN 12
@@ -5723,9 +5730,10 @@
IPFILTER_CHROMA_AVX512_16xN 24
IPFILTER_CHROMA_AVX512_16xN 32
IPFILTER_CHROMA_AVX512_16xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_24xN 1
INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_24xN 1
cglobal interp_4tap_horiz_pp_24x%1, 5,8,11
add r1d, r1d
add r3d, r3d
@@ -5757,11 +5765,13 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_AVX512_24xN 32
IPFILTER_CHROMA_AVX512_24xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_32xN 1
INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_32xN 1
cglobal interp_4tap_horiz_pp_32x%1, 5,6,11
add r1d, r1d
add r3d, r3d
@@ -5791,15 +5801,17 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_AVX512_32xN 8
IPFILTER_CHROMA_AVX512_32xN 16
IPFILTER_CHROMA_AVX512_32xN 24
IPFILTER_CHROMA_AVX512_32xN 32
IPFILTER_CHROMA_AVX512_32xN 48
IPFILTER_CHROMA_AVX512_32xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_64xN 1
INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_64xN 1
cglobal interp_4tap_horiz_pp_64x%1, 5,6,11
add r1d, r1d
add r3d, r3d
@@ -5829,11 +5841,14 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_AVX512_64xN 16
IPFILTER_CHROMA_AVX512_64xN 32
IPFILTER_CHROMA_AVX512_64xN 48
IPFILTER_CHROMA_AVX512_64xN 64
-
+%endif
+
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_4tap_horiz_pp_48x64, 5,6,11
add r1d, r1d
@@ -5862,6 +5877,7 @@
%endrep
PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512
RET
+%endif
;-------------------------------------------------------------------------------------------------------------
;ipfilter_chroma_avx512 code end
;-------------------------------------------------------------------------------------------------------------
@@ -6428,8 +6444,8 @@
movu [r2 + r3 + mmsize], m10
%endmacro
+%macro IPFILTER_LUMA_AVX512_16xN 1
INIT_ZMM avx512
-%macro IPFILTER_LUMA_AVX512_16xN 1
cglobal interp_8tap_horiz_pp_16x%1, 5,8,17
add r1d, r1d
add r3d, r3d
@@ -6467,15 +6483,17 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_LUMA_AVX512_16xN 4
IPFILTER_LUMA_AVX512_16xN 8
IPFILTER_LUMA_AVX512_16xN 12
IPFILTER_LUMA_AVX512_16xN 16
IPFILTER_LUMA_AVX512_16xN 32
IPFILTER_LUMA_AVX512_16xN 64
-
+%endif
+
+%macro IPFILTER_LUMA_AVX512_32xN 1
INIT_ZMM avx512
-%macro IPFILTER_LUMA_AVX512_32xN 1
cglobal interp_8tap_horiz_pp_32x%1, 5,6,17
add r1d, r1d
add r3d, r3d
@@ -6511,14 +6529,16 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_LUMA_AVX512_32xN 8
IPFILTER_LUMA_AVX512_32xN 16
IPFILTER_LUMA_AVX512_32xN 24
IPFILTER_LUMA_AVX512_32xN 32
IPFILTER_LUMA_AVX512_32xN 64
-
+%endif
+
+%macro IPFILTER_LUMA_AVX512_64xN 1
INIT_ZMM avx512
-%macro IPFILTER_LUMA_AVX512_64xN 1
cglobal interp_8tap_horiz_pp_64x%1, 5,6,17
add r1d, r1d
add r3d, r3d
@@ -6554,11 +6574,14 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_LUMA_AVX512_64xN 16
IPFILTER_LUMA_AVX512_64xN 32
IPFILTER_LUMA_AVX512_64xN 48
IPFILTER_LUMA_AVX512_64xN 64
-
+%endif
+
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_48x64, 5,8,17
add r1d, r1d
@@ -6595,6 +6618,7 @@
%endrep
PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
RET
+%endif
;-------------------------------------------------------------------------------------------------------------
;ipfilter_luma_avx512 code end
;-------------------------------------------------------------------------------------------------------------
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Apr 05 14:56:26 2018 -0700
@@ -1999,6 +1999,7 @@
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_32x8, 3, 7, 5
mov r3d, r3m
@@ -2104,6 +2105,7 @@
%endrep
PROCESS_P2S_32x4_AVX512
RET
+%endif
%macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
pmovzxbw m0, [r0]
@@ -2129,6 +2131,7 @@
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
mov r3d, r3m
@@ -2234,6 +2237,7 @@
%endrep
PROCESS_P2S_ALIGNED_32x4_AVX512
RET
+%endif
;-----------------------------------------------------------------------------
;p2s and p2s_aligned 32xN avx512 code end
;-----------------------------------------------------------------------------
@@ -2633,6 +2637,7 @@
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_64x64, 3, 7, 5
mov r3d, r3m
@@ -2776,6 +2781,7 @@
%endrep
PROCESS_P2S_ALIGNED_64x4_AVX512
RET
+%endif
;-----------------------------------------------------------------------------
;p2s and p2s_aligned 64xN avx512 code end
;-----------------------------------------------------------------------------
@@ -3352,6 +3358,7 @@
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_48x64, 3,7,5
mov r3d, r3m
@@ -3419,6 +3426,7 @@
lea r2, [r2 + r3 * 4]
PROCESS_P2S_ALIGNED_48x8_AVX512
RET
+%endif
;-----------------------------------------------------------------------------
;p2s and p2s_aligned 48xN avx512 code end
;-----------------------------------------------------------------------------
@@ -10326,10 +10334,12 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_PP_64xN_AVX512 64
IPFILTER_CHROMA_PP_64xN_AVX512 32
IPFILTER_CHROMA_PP_64xN_AVX512 48
IPFILTER_CHROMA_PP_64xN_AVX512 16
+%endif
%macro IPFILTER_CHROMA_PP_32xN_AVX512 1
INIT_ZMM avx512
@@ -10358,12 +10368,14 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_PP_32xN_AVX512 16
IPFILTER_CHROMA_PP_32xN_AVX512 24
IPFILTER_CHROMA_PP_32xN_AVX512 8
IPFILTER_CHROMA_PP_32xN_AVX512 32
IPFILTER_CHROMA_PP_32xN_AVX512 64
IPFILTER_CHROMA_PP_32xN_AVX512 48
+%endif
%macro IPFILTER_CHROMA_PP_16xN_AVX512 1
INIT_ZMM avx512
@@ -10393,6 +10405,7 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_PP_16xN_AVX512 4
IPFILTER_CHROMA_PP_16xN_AVX512 8
IPFILTER_CHROMA_PP_16xN_AVX512 12
@@ -10400,7 +10413,9 @@
IPFILTER_CHROMA_PP_16xN_AVX512 24
IPFILTER_CHROMA_PP_16xN_AVX512 32
IPFILTER_CHROMA_PP_16xN_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_4tap_horiz_pp_48x64, 4,8,9
mov r4d, r4m
@@ -10426,6 +10441,7 @@
%endrep
PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
RET
+%endif
%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
movu ym6, [r0]
@@ -10501,10 +10517,12 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_PS_64xN_AVX512 64
IPFILTER_CHROMA_PS_64xN_AVX512 32
IPFILTER_CHROMA_PS_64xN_AVX512 48
IPFILTER_CHROMA_PS_64xN_AVX512 16
+%endif
%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
movu ym6, [r0]
@@ -10567,12 +10585,14 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_CHROMA_PS_32xN_AVX512 64
IPFILTER_CHROMA_PS_32xN_AVX512 48
IPFILTER_CHROMA_PS_32xN_AVX512 32
IPFILTER_CHROMA_PS_32xN_AVX512 24
IPFILTER_CHROMA_PS_32xN_AVX512 16
IPFILTER_CHROMA_PS_32xN_AVX512 8
+%endif
%macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
movu xm6, [r0]
@@ -11085,10 +11105,12 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_LUMA_64xN_AVX512 16
IPFILTER_LUMA_64xN_AVX512 32
IPFILTER_LUMA_64xN_AVX512 48
IPFILTER_LUMA_64xN_AVX512 64
+%endif
%macro IPFILTER_LUMA_32xN_AVX512 1
INIT_ZMM avx512
@@ -11118,11 +11140,13 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_LUMA_32xN_AVX512 8
IPFILTER_LUMA_32xN_AVX512 16
IPFILTER_LUMA_32xN_AVX512 24
IPFILTER_LUMA_32xN_AVX512 32
IPFILTER_LUMA_32xN_AVX512 64
+%endif
%macro IPFILTER_LUMA_16xN_AVX512 1
INIT_ZMM avx512
@@ -11154,13 +11178,16 @@
RET
%endmacro
+%if ARCH_X86_64
IPFILTER_LUMA_16xN_AVX512 4
IPFILTER_LUMA_16xN_AVX512 8
IPFILTER_LUMA_16xN_AVX512 12
IPFILTER_LUMA_16xN_AVX512 16
IPFILTER_LUMA_16xN_AVX512 32
IPFILTER_LUMA_16xN_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_48x64, 4,8,14
sub r0, 3
@@ -11188,6 +11215,7 @@
%endrep
PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
RET
+%endif
%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX512 0
; register map
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/loopfilter.asm Thu Apr 05 14:56:26 2018 -0700
@@ -58,6 +58,7 @@
;============================================================================================================
INIT_XMM sse4
%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
cglobal saoCuOrgE0, 4,5,9
mov r4d, r4m
movh m6, [r1]
@@ -157,7 +158,7 @@
sub r4d, 16
jnz .loopH
RET
-
+%endif
%else ; HIGH_BIT_DEPTH == 1
cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
@@ -249,6 +250,7 @@
INIT_YMM avx2
%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
cglobal saoCuOrgE0, 4,4,9
vbroadcasti128 m6, [r1]
movzx r1d, byte [r3]
@@ -308,6 +310,7 @@
dec r2d
jnz .loop
RET
+%endif
%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
@@ -1655,6 +1658,7 @@
RET
%endif
+%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal saoCuOrgB0, 5,7,8
@@ -1814,6 +1818,7 @@
.end:
RET
%endif
+%endif
;============================================================================================================
; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/mc-a.asm Thu Apr 05 14:56:26 2018 -0700
@@ -1034,6 +1034,7 @@
;------------------------------------------------------------------------------
; avx2 asm for addAvg high_bit_depth
;------------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_YMM avx2
cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
movu xm0, [r0]
@@ -1111,6 +1112,7 @@
movu [r2], xm0
movu [r2 + r5], xm2
RET
+%endif
%macro ADDAVG_W8_H4_AVX2 1
cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1165,13 +1167,16 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_W8_H4_AVX2 4
ADDAVG_W8_H4_AVX2 8
ADDAVG_W8_H4_AVX2 12
ADDAVG_W8_H4_AVX2 16
ADDAVG_W8_H4_AVX2 32
ADDAVG_W8_H4_AVX2 64
-
+%endif
+
+%if ARCH_X86_64
cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
mova m4, [pw_ %+ ADDAVG_ROUND]
mova m5, [pw_pixel_max]
@@ -1255,6 +1260,7 @@
dec r6d
jnz .loop
RET
+%endif
%macro ADDAVG_W16_H4_AVX2 1
cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1296,6 +1302,7 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_W16_H4_AVX2 4
ADDAVG_W16_H4_AVX2 8
ADDAVG_W16_H4_AVX2 12
@@ -1303,7 +1310,9 @@
ADDAVG_W16_H4_AVX2 24
ADDAVG_W16_H4_AVX2 32
ADDAVG_W16_H4_AVX2 64
-
+%endif
+
+%if ARCH_X86_64
cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
mova m4, [pw_ %+ ADDAVG_ROUND]
mova m5, [pw_pixel_max]
@@ -1415,6 +1424,7 @@
dec r6d
jnz .loop
RET
+%endif
%macro ADDAVG_W32_H2_AVX2 1
cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1474,13 +1484,16 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_W32_H2_AVX2 8
ADDAVG_W32_H2_AVX2 16
ADDAVG_W32_H2_AVX2 24
ADDAVG_W32_H2_AVX2 32
ADDAVG_W32_H2_AVX2 48
ADDAVG_W32_H2_AVX2 64
-
+%endif
+
+%if ARCH_X86_64
cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
mova m4, [pw_ %+ ADDAVG_ROUND]
mova m5, [pw_pixel_max]
@@ -1554,6 +1567,7 @@
dec r6d
jnz .loop
RET
+%endif
%macro ADDAVG_W64_H1_AVX2 1
cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1649,11 +1663,12 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_W64_H1_AVX2 16
ADDAVG_W64_H1_AVX2 32
ADDAVG_W64_H1_AVX2 48
ADDAVG_W64_H1_AVX2 64
-
+%endif
;-----------------------------------------------------------------------------
;addAvg avx512 high bit depth code start
;-----------------------------------------------------------------------------
@@ -1875,6 +1890,7 @@
;-----------------------------------------------------------------------------
;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal addAvg_16x4, 6,9,6
vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
@@ -1889,6 +1905,7 @@
lea r8, [3 * r5]
PROCESS_ADDAVG_16x4_HBD_AVX512
RET
+%endif
%macro ADDAVG_W16_HBD_AVX512 1
INIT_ZMM avx512
@@ -1914,12 +1931,14 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_W16_HBD_AVX512 8
ADDAVG_W16_HBD_AVX512 12
ADDAVG_W16_HBD_AVX512 16
ADDAVG_W16_HBD_AVX512 24
ADDAVG_W16_HBD_AVX512 32
ADDAVG_W16_HBD_AVX512 64
+%endif
%macro ADDAVG_W32_HBD_AVX512 1
INIT_ZMM avx512
@@ -1945,12 +1964,14 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_W32_HBD_AVX512 8
ADDAVG_W32_HBD_AVX512 16
ADDAVG_W32_HBD_AVX512 24
ADDAVG_W32_HBD_AVX512 32
ADDAVG_W32_HBD_AVX512 48
ADDAVG_W32_HBD_AVX512 64
+%endif
%macro ADDAVG_W64_HBD_AVX512 1
INIT_ZMM avx512
@@ -1976,11 +1997,14 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_W64_HBD_AVX512 16
ADDAVG_W64_HBD_AVX512 32
ADDAVG_W64_HBD_AVX512 48
ADDAVG_W64_HBD_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal addAvg_48x64, 6,9,6
vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
@@ -2002,6 +2026,7 @@
%endrep
PROCESS_ADDAVG_48x4_HBD_AVX512
RET
+%endif
%macro PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512 0
movu ym0, [r0]
@@ -2221,6 +2246,7 @@
;-----------------------------------------------------------------------------
;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal addAvg_aligned_16x4, 6,9,6
vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
@@ -2235,6 +2261,7 @@
lea r8, [3 * r5]
PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
RET
+%endif
%macro ADDAVG_ALIGNED_W16_HBD_AVX512 1
INIT_ZMM avx512
@@ -2260,12 +2287,14 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_ALIGNED_W16_HBD_AVX512 8
ADDAVG_ALIGNED_W16_HBD_AVX512 12
ADDAVG_ALIGNED_W16_HBD_AVX512 16
ADDAVG_ALIGNED_W16_HBD_AVX512 24
ADDAVG_ALIGNED_W16_HBD_AVX512 32
ADDAVG_ALIGNED_W16_HBD_AVX512 64
+%endif
%macro ADDAVG_ALIGNED_W32_HBD_AVX512 1
INIT_ZMM avx512
@@ -2291,12 +2320,14 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_ALIGNED_W32_HBD_AVX512 8
ADDAVG_ALIGNED_W32_HBD_AVX512 16
ADDAVG_ALIGNED_W32_HBD_AVX512 24
ADDAVG_ALIGNED_W32_HBD_AVX512 32
ADDAVG_ALIGNED_W32_HBD_AVX512 48
ADDAVG_ALIGNED_W32_HBD_AVX512 64
+%endif
%macro ADDAVG_ALIGNED_W64_HBD_AVX512 1
INIT_ZMM avx512
@@ -2322,11 +2353,14 @@
RET
%endmacro
+%if ARCH_X86_64
ADDAVG_ALIGNED_W64_HBD_AVX512 16
ADDAVG_ALIGNED_W64_HBD_AVX512 32
ADDAVG_ALIGNED_W64_HBD_AVX512 48
ADDAVG_ALIGNED_W64_HBD_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal addAvg_aligned_48x64, 6,9,6
vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
@@ -2348,6 +2382,7 @@
%endrep
PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
RET
+%endif
;-----------------------------------------------------------------------------
;addAvg avx512 high bit depth code end
;-----------------------------------------------------------------------------
@@ -6530,11 +6565,13 @@
RET
%endmacro
+%if ARCH_X86_64
PIXEL_AVG_HBD_W32 8
PIXEL_AVG_HBD_W32 16
PIXEL_AVG_HBD_W32 24
PIXEL_AVG_HBD_W32 32
PIXEL_AVG_HBD_W32 64
+%endif
%macro PIXEL_AVG_HBD_W64 1
INIT_ZMM avx512
@@ -6556,11 +6593,14 @@
RET
%endmacro
+%if ARCH_X86_64
PIXEL_AVG_HBD_W64 16
PIXEL_AVG_HBD_W64 32
PIXEL_AVG_HBD_W64 48
PIXEL_AVG_HBD_W64 64
-
+%endif
+
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_avg_48x64, 6,9,4
add r1d, r1d
@@ -6578,6 +6618,7 @@
%endrep
PROCESS_PIXELAVG_48x8_HBD_AVX512
RET
+%endif
;-----------------------------------------------------------------------------
;pixel_avg_pp avx512 high bit depth code end
;-----------------------------------------------------------------------------
@@ -6709,6 +6750,7 @@
jg .height_loop
RET
+%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_avg2_w20, 6,7
sub r2, r4
@@ -6725,6 +6767,7 @@
sub r5d, 2
jg .height_loop
RET
+%endif
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/pixel-a.asm Thu Apr 05 14:56:26 2018 -0700
@@ -14055,6 +14055,7 @@
;lea %8, [%8+4*r3]
%endmacro
+%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_satd_8x8, 4,4,7
@@ -14620,5 +14621,5 @@
movd eax, xm0
RET
-
+%endif
%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Thu Apr 05 14:56:26 2018 -0700
@@ -7707,8 +7707,13 @@
paddd xm5, xm1
HADDW xm4, xm2
HADDD xm5, xm1
+%if ARCH_X86_64
punpckldq xm4, xm5
movq rax, xm4
+%else
+ movd eax, xm4
+ movd edx, xm5
+%endif
%endmacro
%if HIGH_BIT_DEPTH==0
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/sad16-a.asm Thu Apr 05 14:56:26 2018 -0700
@@ -1292,6 +1292,7 @@
;-----------------------------------------------------------------------------
; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_64x16, 4,6,7
pxor m0, m0
@@ -1399,10 +1400,12 @@
PROCESS_SAD_64x8_AVX512
PROCESS_SAD_AVX512_END
RET
+%endif
;-----------------------------------------------------------------------------
; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_32x8, 4,6,7
pxor m0, m0
@@ -1517,10 +1520,12 @@
PROCESS_SAD_32x8_AVX512
PROCESS_SAD_AVX512_END
RET
+%endif
;-----------------------------------------------------------------------------
; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_48x64, 4, 7, 9
pxor m0, m0
@@ -1622,6 +1627,7 @@
PROCESS_SAD_AVX512_END
RET
+%endif
;=============================================================================
; SAD x3/x4
@@ -2611,7 +2617,7 @@
;------------------------------------------------------------------------------------------------------------------------------------------
; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
;------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_x3_32x8, 6,7,8
pxor m0, m0
@@ -2970,11 +2976,12 @@
PROCESS_SAD_X3_END_AVX512
RET
+%endif
;------------------------------------------------------------------------------------------------------------------------------------------
; void pixel_sad_x3_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
;------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_x3_64x16, 6,7,12
pxor m0, m0
@@ -3214,11 +3221,11 @@
PROCESS_SAD_X3_64x4_AVX512
PROCESS_SAD_X3_END_AVX512
RET
-
+%endif
;------------------------------------------------------------------------------------------------------------------------------------------------------------
; void pixel_sad_x4_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
;------------------------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_x4_32x8, 6,8,10
pxor m0, m0
@@ -3485,10 +3492,11 @@
PROCESS_SAD_X4_32x4_AVX512
PROCESS_SAD_X4_END_AVX512
RET
-
+%endif
;------------------------------------------------------------------------------------------------------------------------------------------------------------
; void pixel_sad_x4_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
;------------------------------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_x4_48x64, 4, 9, 20
pxor m0, m0
@@ -3644,11 +3652,12 @@
PROCESS_SAD_X4_END_AVX512
RET
+%endif
;------------------------------------------------------------------------------------------------------------------------------------------------------------
; void pixel_sad_x4_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
;------------------------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sad_x4_64x16, 6,8,15
pxor m0, m0
@@ -3928,3 +3937,4 @@
PROCESS_SAD_X4_64x4_AVX512
PROCESS_SAD_X4_END_AVX512
RET
+%endif
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/ssd-a.asm Thu Apr 05 14:56:26 2018 -0700
@@ -141,6 +141,8 @@
; Function to find ssd for 32x16 block, sse2, 12 bit depth
; Defined sepeartely to be called from SSD_ONE_32 macro
+%if ARCH_X86_64
+;This code is written for 64 bit architecture
INIT_XMM sse2
cglobal ssd_ss_32x16
pxor m8, m8
@@ -180,8 +182,10 @@
paddq m4, m5
paddq m9, m4
ret
+%endif
%macro SSD_ONE_32 0
+%if ARCH_X86_64
cglobal pixel_ssd_ss_32x64, 4,7,10
add r1d, r1d
add r3d, r3d
@@ -193,7 +197,9 @@
call ssd_ss_32x16
movq rax, m9
RET
+%endif
%endmacro
+
%macro SSD_ONE_SS_32 0
cglobal pixel_ssd_ss_32x32, 4,5,8
add r1d, r1d
@@ -554,6 +560,7 @@
RET
%endmacro
+%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_ssd_16x16, 4,7,3
FIX_STRIDES r1, r3
@@ -696,6 +703,7 @@
paddq xm3, xm4
movq rax, xm3
RET
+%endif
INIT_MMX mmx2
SSD_ONE 4, 4
@@ -726,7 +734,9 @@
%if BIT_DEPTH <= 10
SSD_ONE 32, 64
SSD_ONE 32, 32
+%if ARCH_X86_64
SSD_TWO 64, 64
+%endif
%else
SSD_ONE_32
SSD_ONE_SS_32
@@ -3246,7 +3256,7 @@
movd eax, m0
RET
-
+%if ARCH_X86_64 && BIT_DEPTH >= 10
INIT_XMM sse2
cglobal pixel_ssd_s_32, 2,3,5
add r1, r1
@@ -3287,7 +3297,6 @@
dec r2d
jnz .loop
-%if BIT_DEPTH >= 10
movu m1, m0
pxor m2, m2
punpckldq m0, m2
@@ -3296,13 +3305,56 @@
movhlps m1, m0
paddq m0, m1
movq rax, xm0
-%else
+ RET
+%endif
+
+%if BIT_DEPTH == 8
+INIT_XMM sse2
+cglobal pixel_ssd_s_32, 2,3,5
+ add r1, r1
+
+ mov r2d, 16
+ pxor m0, m0
+.loop:
+ movu m1, [r0 + 0 * mmsize]
+ movu m2, [r0 + 1 * mmsize]
+ movu m3, [r0 + 2 * mmsize]
+ movu m4, [r0 + 3 * mmsize]
+ add r0, r1
+
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1
+
+ movu m1, [r0 + 0 * mmsize]
+ movu m2, [r0 + 1 * mmsize]
+ movu m3, [r0 + 2 * mmsize]
+ movu m4, [r0 + 3 * mmsize]
+ add r0, r1
+
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1
+
+ dec r2d
+ jnz .loop
; calculate sum and return
HADDD m0, m1
movd eax, m0
+ RET
%endif
- RET
-
+
+%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_ssd_s_16, 2,4,5
add r1, r1
@@ -3389,7 +3441,7 @@
movd eax, xm0
%endif
RET
-
+%endif
;-----------------------------------------------------------------------------
; ssd_s avx512 code start
;-----------------------------------------------------------------------------
@@ -3447,6 +3499,7 @@
;-----------------------------------------------------------------------------
; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_ssd_s_32, 2,4,5
add r1, r1
@@ -3495,6 +3548,7 @@
HADDD m0, m1
movd eax, xm0
RET
+%endif
;-----------------------------------------------------------------------------
; ssd_s avx512 code end
;-----------------------------------------------------------------------------
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/v4-ipfilter16.asm
--- a/source/common/x86/v4-ipfilter16.asm Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/v4-ipfilter16.asm Thu Apr 05 14:56:26 2018 -0700
@@ -2931,6 +2931,7 @@
RET
%endmacro
+%if ARCH_X86_64
FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6
FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS
FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP
@@ -2939,6 +2940,7 @@
FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS
FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP
FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6
+%endif
%macro FILTER_VER_CHROMA_AVX2_8x8 3
INIT_YMM avx2