[x265] [PATCH 133 of 307] x86: Fix build errors in 32 bit build

mythreyi at multicorewareinc.com
Sat Apr 7 04:32:11 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1522965386 25200
#      Thu Apr 05 14:56:26 2018 -0700
# Node ID 8173d05abf8dc96f3be6c97016cfb98f85e84a20
# Parent  dc2d7a2515fdc434744f97a9dd34edcd670bbffa
x86: Fix build errors in 32 bit build

1. Fixes a 32-bit build CMake error on the Linux platform
2. Enables assembly for 32-bit high bit depth builds

sign off: Mythreyi P
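
For reference, a 32-bit high bit depth configuration that this patch allows might look like the sketch below. It is only an illustration: the -m32 flags assume a multilib x86 toolchain on Linux and an out-of-tree build directory next to source/, while HIGH_BIT_DEPTH and ENABLE_ASSEMBLY are the existing x265 CMake options (the latter is named in the #error removed below).

    # hypothetical 32-bit high bit depth configure, assuming a multilib gcc/yasm setup
    cmake ../source -DHIGH_BIT_DEPTH=ON -DENABLE_ASSEMBLY=ON \
          -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 \
          -DCMAKE_SHARED_LINKER_FLAGS=-m32 -DCMAKE_EXE_LINKER_FLAGS=-m32
    make

Before this change such a configuration hit the #error in asm-primitives.cpp (or had to be built with ENABLE_ASSEMBLY=OFF); with the X86_64/ARCH_X86_64 guards added here, the 32-bit build simply skips the 64-bit-only primitives.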

diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Apr 05 14:56:26 2018 -0700
@@ -886,10 +886,6 @@
 
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
 {
-#if !defined(X86_64)
-#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
-#endif
-
 #if X86_64
     p.scanPosLast = PFX(scanPosLast_x64);
 #endif
@@ -950,16 +946,42 @@
         CHROMA_422_VERT_FILTERS(_sse2);
         CHROMA_444_VERT_FILTERS(sse2);
 
+#if X86_64
         ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse2);
         ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse2);
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
+#endif
 
         p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
         p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
-        PIXEL_AVG(sse2);
+
+        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_sse2);
+        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_sse2);
+        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_sse2);
+        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_sse2);
+        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_sse2);
+        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_sse2);
+        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_sse2);
+        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_sse2);
+        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_sse2);
+        p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_32x8_sse2);
+        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_sse2);
+        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_sse2);
+        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_sse2);
+        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_sse2);
+        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_sse2);
+        p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_16x8_sse2);
+        p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_16x4_sse2);
+        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_sse2);
+#if X86_64
+        p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_8x32_sse2);
+        p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_sse2);
+        p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_8x8_sse2);
+        p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_8x4_sse2);
+#endif
         PIXEL_AVG_W4(mmx2);
         LUMA_VAR(sse2);
 
@@ -970,7 +992,12 @@
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
-        ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+#if X86_64
+        p.cu[BLOCK_4x4].ssd_s = PFX(pixel_ssd_s_4_sse2);
+        p.cu[BLOCK_8x8].ssd_s = PFX(pixel_ssd_s_8_sse2);
+        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_sse2);
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_sse2);
+#endif
         ALL_LUMA_TU_S(calcresidual[ALIGNED], getResidual, sse2);
         ALL_LUMA_TU_S(calcresidual[NONALIGNED], getResidual, sse2);
         ALL_LUMA_TU_S(transpose, transpose, sse2);
@@ -978,9 +1005,10 @@
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
+#if X86_64
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
-
+#endif
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
         p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
         p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
@@ -1005,7 +1033,9 @@
         p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
         p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
         p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
+#if X86_64
         p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
+#endif
         p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
         p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
         p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
@@ -1014,19 +1044,24 @@
         p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
 
+#if X86_64 && X265_DEPTH <= 10
+        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
+
+        p.cu[BLOCK_8x8].sse_ss = PFX(pixel_ssd_ss_8x8_sse2);
+        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_sse2);
+        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_sse2);
+        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_sse2);
 #endif
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
         p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
+#if X86_64
         p.cu[BLOCK_8x8].idct = PFX(idct8_sse2);
-
+#endif
         p.idst4x4 = PFX(idst4_sse2);
         p.dst4x4 = PFX(dst4_sse2);
 
@@ -1050,12 +1085,14 @@
     }
     if (cpuMask & X265_CPU_SSE3)
     {
+#if X86_64
         ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
         ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
         ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
         ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
         ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
         ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+#endif
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -1126,12 +1163,13 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+#if X86_64
         p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
         p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
         p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
         p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
-
         p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
+#endif
         p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
         p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4);
@@ -1146,6 +1184,68 @@
         CHROMA_422_ADDAVG(sse4);
 
         LUMA_FILTERS(sse4);
+
+#if X86_64
+        p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse4);
+        p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_sse4);
+        p.pu[LUMA_4x16].luma_hpp = PFX(interp_8tap_horiz_pp_4x16_sse4);
+        p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse4);
+        p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_sse4);
+        p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_sse4);
+#endif
+
+        p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_sse4);
+        p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_sse4);
+        p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_sse4);
+        p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_sse4);
+        p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_sse4);
+
+        p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_sse4);
+        p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_sse4);
+        p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_sse4);
+        p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_sse4);
+        p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_sse4);
+        p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_sse4);
+        p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_sse4);
+        p.pu[LUMA_12x16].luma_hpp = PFX(interp_8tap_horiz_pp_12x16_sse4);
+        p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_sse4);
+
+        p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_sse4);
+        p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_sse4);
+        p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_sse4);
+        p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_sse4);
+        p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_sse4);
+        p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_sse4);
+        p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_sse4);
+        p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_sse4);
+
+        p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_sse4);
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_sse4);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_sse4);
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_sse4);
+        p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_sse4);
+        p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_sse4);
+        p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_sse4);
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_sse4);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_sse4);
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_sse4);
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_sse4);
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_sse4);
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_sse4);
+        p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_sse4);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_sse4);
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_sse4);
+        p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_sse4);
+        p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_sse4);
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_sse4);
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_sse4);
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_sse4);
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_sse4);
+
+        ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse4); p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_sse4);
+        ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse4); p.pu[LUMA_4x4].luma_vps = PFX(interp_8tap_vert_ps_4x4_sse4);
+        ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, sse4); p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_sse4);
+        ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
         CHROMA_420_HORIZ_FILTERS(sse4);
         CHROMA_420_VERT_FILTERS_SSE4(_sse4);
         CHROMA_422_HORIZ_FILTERS(_sse4);
@@ -1185,7 +1285,7 @@
 
         // TODO: check POPCNT flag!
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
-#if X265_DEPTH <= 10
+#if X86_64 && X265_DEPTH <= 10
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
 #endif
 
@@ -1203,6 +1303,7 @@
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 #endif
     }
+#if X86_64
     if (cpuMask & X265_CPU_AVX)
     {
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); fails tests
@@ -2535,6 +2636,7 @@
         p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
 
     }
+#endif
 }
 #else // if HIGH_BIT_DEPTH
 
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/h-ipfilter16.asm
--- a/source/common/x86/h-ipfilter16.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/h-ipfilter16.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -285,7 +285,8 @@
 ;------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
 ;------------------------------------------------------------------------------------------------------------
-    FILTER_HOR_LUMA_sse2 4, 4, pp
+%if ARCH_X86_64
+	FILTER_HOR_LUMA_sse2 4, 4, pp
     FILTER_HOR_LUMA_sse2 4, 8, pp
     FILTER_HOR_LUMA_sse2 4, 16, pp
     FILTER_HOR_LUMA_sse2 8, 4, pp
@@ -339,6 +340,7 @@
     FILTER_HOR_LUMA_sse2 64, 32, ps
     FILTER_HOR_LUMA_sse2 64, 48, ps
     FILTER_HOR_LUMA_sse2 64, 64, ps
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/h4-ipfilter16.asm
--- a/source/common/x86/h4-ipfilter16.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/h4-ipfilter16.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -377,6 +377,7 @@
 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 
+%if ARCH_X86_64
 FILTER_HOR_CHROMA_sse3 2, 4, pp
 FILTER_HOR_CHROMA_sse3 2, 8, pp
 FILTER_HOR_CHROMA_sse3 2, 16, pp
@@ -462,6 +463,7 @@
 FILTER_HOR_CHROMA_sse3 64, 32, ps
 FILTER_HOR_CHROMA_sse3 64, 48, ps
 FILTER_HOR_CHROMA_sse3 64, 64, ps
+%endif
 
 %macro FILTER_W2_2 1
     movu        m3,         [r0]
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/intrapred16.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -196,6 +196,7 @@
 ;-----------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
 ;-----------------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_XMM sse2
 cglobal intra_pred_dc8, 5, 8, 2
     movu            m0,            [r2 + 34]
@@ -275,10 +276,13 @@
     mov             [r0 + r7],     r3w
 .end:
     RET
+%endif
 
 ;-------------------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
 ;-------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+;This code is meant for 64 bit architecture
 INIT_XMM sse2
 cglobal intra_pred_dc16, 5, 10, 4
     lea             r3,                  [r2 + 66]
@@ -410,6 +414,7 @@
     mov             [r9 + r1 * 8],       r3w
 .end:
     RET
+%endif
 
 ;-------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
@@ -474,6 +479,7 @@
 ;-------------------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
 ;-------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_YMM avx2
 cglobal intra_pred_dc16, 3, 9, 4
     mov             r3d,                 r4m
@@ -682,6 +688,7 @@
     movu            [r0 + r2 * 1 +  0], m0
     movu            [r0 + r2 * 1 + mmsize], m0
     RET
+%endif
 
 ;---------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
@@ -1104,6 +1111,7 @@
 ;---------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_XMM sse2
 cglobal intra_pred_planar32, 3,3,16
     movd            m3, [r2 + 66]               ; topRight   = above[32]
@@ -1209,7 +1217,7 @@
 %endrep
     RET
 %endif
-
+%endif
 ;---------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------
@@ -2063,6 +2071,7 @@
     STORE_4x4
     RET
 
+%if ARCH_X86_64
 cglobal intra_pred_ang4_26, 3,3,3
     movh        m0,             [r2 + 2] ;[8 7 6 5 4 3 2 1]
     add         r1d,            r1d
@@ -2098,6 +2107,7 @@
     mov         [r0 + r3],      r2w
 .quit:
     RET
+%endif
 
 cglobal intra_pred_ang4_27, 3,3,5
     movu        m0, [r2 + 2]            ;[8 7 6 5 4 3 2 1]
@@ -11122,6 +11132,7 @@
 .end%11:
 %endmacro
 
+%if ARCH_X86_64
 ;; angle 16, modes 3 and 33
 cglobal ang16_mode_3_33
     test            r6d, r6d
@@ -18220,6 +18231,7 @@
 
     mov         rsp,                [rsp+4*mmsize]
     RET
+%endif
 ;-------------------------------------------------------------------------------------------------------
 ; end of avx2 code for intra_pred_ang32 mode 2 to 34
 ;-------------------------------------------------------------------------------------------------------
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/ipfilter16.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -266,6 +266,7 @@
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
     FILTER_VER_LUMA_sse2 pp, 4, 4
     FILTER_VER_LUMA_sse2 pp, 8, 8
     FILTER_VER_LUMA_sse2 pp, 8, 4
@@ -320,6 +321,7 @@
     FILTER_VER_LUMA_sse2 ps, 48, 64
     FILTER_VER_LUMA_sse2 ps, 64, 16
     FILTER_VER_LUMA_sse2 ps, 16, 64
+%endif
 
 ;-----------------------------------------------------------------------------
 ;p2s and p2s_aligned avx512 code start
@@ -5620,6 +5622,7 @@
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
 ;-------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal interp_4tap_horiz_pp_8x4, 5,8,11
     add             r1d, r1d
@@ -5645,9 +5648,10 @@
 
     PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
     RET
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_8xN 1
 INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_8xN 1
 cglobal interp_4tap_horiz_pp_8x%1, 5,8,11
     add             r1d, r1d
     add             r3d, r3d
@@ -5679,14 +5683,16 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_CHROMA_AVX512_8xN 8
 IPFILTER_CHROMA_AVX512_8xN 12
 IPFILTER_CHROMA_AVX512_8xN 16
 IPFILTER_CHROMA_AVX512_8xN 32
 IPFILTER_CHROMA_AVX512_8xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_16xN 1
 INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_16xN 1
 cglobal interp_4tap_horiz_pp_16x%1, 5,6,11
     add             r1d, r1d
     add             r3d, r3d
@@ -5716,6 +5722,7 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_CHROMA_AVX512_16xN 4
 IPFILTER_CHROMA_AVX512_16xN 8
 IPFILTER_CHROMA_AVX512_16xN 12
@@ -5723,9 +5730,10 @@
 IPFILTER_CHROMA_AVX512_16xN 24
 IPFILTER_CHROMA_AVX512_16xN 32
 IPFILTER_CHROMA_AVX512_16xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_24xN 1
 INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_24xN 1
 cglobal interp_4tap_horiz_pp_24x%1, 5,8,11
     add             r1d, r1d
     add             r3d, r3d
@@ -5757,11 +5765,13 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_CHROMA_AVX512_24xN 32
 IPFILTER_CHROMA_AVX512_24xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_32xN 1
 INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_32xN 1
 cglobal interp_4tap_horiz_pp_32x%1, 5,6,11
     add             r1d, r1d
     add             r3d, r3d
@@ -5791,15 +5801,17 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_CHROMA_AVX512_32xN 8
 IPFILTER_CHROMA_AVX512_32xN 16
 IPFILTER_CHROMA_AVX512_32xN 24
 IPFILTER_CHROMA_AVX512_32xN 32
 IPFILTER_CHROMA_AVX512_32xN 48
 IPFILTER_CHROMA_AVX512_32xN 64
-
+%endif
+
+%macro IPFILTER_CHROMA_AVX512_64xN 1
 INIT_ZMM avx512
-%macro IPFILTER_CHROMA_AVX512_64xN 1
 cglobal interp_4tap_horiz_pp_64x%1, 5,6,11
     add             r1d, r1d
     add             r3d, r3d
@@ -5829,11 +5841,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_CHROMA_AVX512_64xN 16
 IPFILTER_CHROMA_AVX512_64xN 32
 IPFILTER_CHROMA_AVX512_64xN 48
 IPFILTER_CHROMA_AVX512_64xN 64
-
+%endif
+
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal interp_4tap_horiz_pp_48x64, 5,6,11
     add             r1d, r1d
@@ -5862,6 +5877,7 @@
 %endrep
     PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512
     RET
+%endif
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
@@ -6428,8 +6444,8 @@
     movu            [r2 + r3 + mmsize],    m10
 %endmacro
 
+%macro IPFILTER_LUMA_AVX512_16xN 1
 INIT_ZMM avx512
-%macro IPFILTER_LUMA_AVX512_16xN 1
 cglobal interp_8tap_horiz_pp_16x%1, 5,8,17
     add              r1d,        r1d
     add              r3d,        r3d
@@ -6467,15 +6483,17 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_LUMA_AVX512_16xN 4
 IPFILTER_LUMA_AVX512_16xN 8
 IPFILTER_LUMA_AVX512_16xN 12
 IPFILTER_LUMA_AVX512_16xN 16
 IPFILTER_LUMA_AVX512_16xN 32
 IPFILTER_LUMA_AVX512_16xN 64
-
+%endif
+
+%macro IPFILTER_LUMA_AVX512_32xN 1
 INIT_ZMM avx512
-%macro IPFILTER_LUMA_AVX512_32xN 1
 cglobal interp_8tap_horiz_pp_32x%1, 5,6,17
     add              r1d,        r1d
     add              r3d,        r3d
@@ -6511,14 +6529,16 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_LUMA_AVX512_32xN 8
 IPFILTER_LUMA_AVX512_32xN 16
 IPFILTER_LUMA_AVX512_32xN 24
 IPFILTER_LUMA_AVX512_32xN 32
 IPFILTER_LUMA_AVX512_32xN 64
-
+%endif
+
+%macro IPFILTER_LUMA_AVX512_64xN 1
 INIT_ZMM avx512
-%macro IPFILTER_LUMA_AVX512_64xN 1
 cglobal interp_8tap_horiz_pp_64x%1, 5,6,17
     add              r1d,        r1d
     add              r3d,        r3d
@@ -6554,11 +6574,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_LUMA_AVX512_64xN 16
 IPFILTER_LUMA_AVX512_64xN 32
 IPFILTER_LUMA_AVX512_64xN 48
 IPFILTER_LUMA_AVX512_64xN 64
-
+%endif
+
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal interp_8tap_horiz_pp_48x64, 5,8,17
     add              r1d,        r1d
@@ -6595,6 +6618,7 @@
 %endrep
     PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
     RET
+%endif
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_luma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -1999,6 +1999,7 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal filterPixelToShort_32x8, 3, 7, 5
     mov         r3d, r3m
@@ -2104,6 +2105,7 @@
 %endrep
     PROCESS_P2S_32x4_AVX512
     RET
+%endif
 
 %macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
     pmovzxbw    m0, [r0]
@@ -2129,6 +2131,7 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
     mov         r3d, r3m
@@ -2234,6 +2237,7 @@
 %endrep
     PROCESS_P2S_ALIGNED_32x4_AVX512
     RET
+%endif
 ;-----------------------------------------------------------------------------
 ;p2s and p2s_aligned 32xN avx512 code end
 ;-----------------------------------------------------------------------------
@@ -2633,6 +2637,7 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal filterPixelToShort_64x64, 3, 7, 5
     mov         r3d, r3m
@@ -2776,6 +2781,7 @@
 %endrep
     PROCESS_P2S_ALIGNED_64x4_AVX512
     RET
+%endif
 ;-----------------------------------------------------------------------------
 ;p2s and p2s_aligned 64xN avx512 code end
 ;-----------------------------------------------------------------------------
@@ -3352,6 +3358,7 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal filterPixelToShort_48x64, 3,7,5
     mov         r3d, r3m
@@ -3419,6 +3426,7 @@
     lea         r2, [r2 + r3 * 4]
     PROCESS_P2S_ALIGNED_48x8_AVX512
     RET
+%endif
 ;-----------------------------------------------------------------------------
 ;p2s and p2s_aligned 48xN avx512 code end
 ;-----------------------------------------------------------------------------
@@ -10326,10 +10334,12 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
     IPFILTER_CHROMA_PP_64xN_AVX512  64
     IPFILTER_CHROMA_PP_64xN_AVX512  32
     IPFILTER_CHROMA_PP_64xN_AVX512  48
     IPFILTER_CHROMA_PP_64xN_AVX512  16
+%endif
 
 %macro IPFILTER_CHROMA_PP_32xN_AVX512 1
 INIT_ZMM avx512
@@ -10358,12 +10368,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
     IPFILTER_CHROMA_PP_32xN_AVX512 16
     IPFILTER_CHROMA_PP_32xN_AVX512 24
     IPFILTER_CHROMA_PP_32xN_AVX512 8
     IPFILTER_CHROMA_PP_32xN_AVX512 32
     IPFILTER_CHROMA_PP_32xN_AVX512 64
     IPFILTER_CHROMA_PP_32xN_AVX512 48
+%endif
 
 %macro IPFILTER_CHROMA_PP_16xN_AVX512 1
 INIT_ZMM avx512
@@ -10393,6 +10405,7 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
     IPFILTER_CHROMA_PP_16xN_AVX512 4
     IPFILTER_CHROMA_PP_16xN_AVX512 8
     IPFILTER_CHROMA_PP_16xN_AVX512 12
@@ -10400,7 +10413,9 @@
     IPFILTER_CHROMA_PP_16xN_AVX512 24
     IPFILTER_CHROMA_PP_16xN_AVX512 32
     IPFILTER_CHROMA_PP_16xN_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal interp_4tap_horiz_pp_48x64, 4,8,9
     mov               r4d,          r4m
@@ -10426,6 +10441,7 @@
 %endrep
     PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
     RET
+%endif
 
 %macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
     movu               ym6,          [r0]
@@ -10501,10 +10517,12 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
     IPFILTER_CHROMA_PS_64xN_AVX512 64
     IPFILTER_CHROMA_PS_64xN_AVX512 32
     IPFILTER_CHROMA_PS_64xN_AVX512 48
     IPFILTER_CHROMA_PS_64xN_AVX512 16
+%endif
 
 %macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
     movu               ym6,          [r0]
@@ -10567,12 +10585,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
     IPFILTER_CHROMA_PS_32xN_AVX512 64
     IPFILTER_CHROMA_PS_32xN_AVX512 48
     IPFILTER_CHROMA_PS_32xN_AVX512 32
     IPFILTER_CHROMA_PS_32xN_AVX512 24
     IPFILTER_CHROMA_PS_32xN_AVX512 16
     IPFILTER_CHROMA_PS_32xN_AVX512 8
+%endif
 
 %macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
     movu               xm6,         [r0]
@@ -11085,10 +11105,12 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_LUMA_64xN_AVX512 16
 IPFILTER_LUMA_64xN_AVX512 32
 IPFILTER_LUMA_64xN_AVX512 48
 IPFILTER_LUMA_64xN_AVX512 64
+%endif
 
 %macro IPFILTER_LUMA_32xN_AVX512 1
 INIT_ZMM avx512
@@ -11118,11 +11140,13 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_LUMA_32xN_AVX512 8
 IPFILTER_LUMA_32xN_AVX512 16
 IPFILTER_LUMA_32xN_AVX512 24
 IPFILTER_LUMA_32xN_AVX512 32
 IPFILTER_LUMA_32xN_AVX512 64
+%endif
 
 %macro IPFILTER_LUMA_16xN_AVX512 1
 INIT_ZMM avx512
@@ -11154,13 +11178,16 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 IPFILTER_LUMA_16xN_AVX512 4
 IPFILTER_LUMA_16xN_AVX512 8
 IPFILTER_LUMA_16xN_AVX512 12
 IPFILTER_LUMA_16xN_AVX512 16
 IPFILTER_LUMA_16xN_AVX512 32
 IPFILTER_LUMA_16xN_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal interp_8tap_horiz_pp_48x64, 4,8,14
     sub               r0,    3
@@ -11188,6 +11215,7 @@
 %endrep
     PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
     RET
+%endif
 
 %macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX512 0
     ; register map
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/loopfilter.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -58,6 +58,7 @@
 ;============================================================================================================
 INIT_XMM sse4
 %if HIGH_BIT_DEPTH
+%if ARCH_X86_64
 cglobal saoCuOrgE0, 4,5,9
     mov         r4d, r4m
     movh        m6,  [r1]
@@ -157,7 +158,7 @@
     sub         r4d, 16
     jnz        .loopH
     RET
-
+%endif
 %else ; HIGH_BIT_DEPTH == 1
 
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
@@ -249,6 +250,7 @@
 
 INIT_YMM avx2
 %if HIGH_BIT_DEPTH
+%if ARCH_X86_64
 cglobal saoCuOrgE0, 4,4,9
     vbroadcasti128  m6, [r1]
     movzx           r1d, byte [r3]
@@ -308,6 +310,7 @@
     dec             r2d
     jnz             .loop
     RET
+%endif
 %else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
 
@@ -1655,6 +1658,7 @@
     RET
 %endif
 
+%if ARCH_X86_64
 INIT_YMM avx2
 %if HIGH_BIT_DEPTH
 cglobal saoCuOrgB0, 5,7,8
@@ -1814,6 +1818,7 @@
 .end:
     RET
 %endif
+%endif
 
 ;============================================================================================================
 ; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/mc-a.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -1034,6 +1034,7 @@
 ;------------------------------------------------------------------------------
 ; avx2 asm for addAvg high_bit_depth
 ;------------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_YMM avx2
 cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     movu        xm0,         [r0]
@@ -1111,6 +1112,7 @@
     movu        [r2],        xm0
     movu        [r2 + r5],   xm2
     RET
+%endif
 
 %macro ADDAVG_W8_H4_AVX2 1
 cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1165,13 +1167,16 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_W8_H4_AVX2 4
 ADDAVG_W8_H4_AVX2 8
 ADDAVG_W8_H4_AVX2 12
 ADDAVG_W8_H4_AVX2 16
 ADDAVG_W8_H4_AVX2 32
 ADDAVG_W8_H4_AVX2 64
-
+%endif
+
+%if ARCH_X86_64
 cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     mova           m4,             [pw_ %+ ADDAVG_ROUND]
     mova           m5,             [pw_pixel_max]
@@ -1255,6 +1260,7 @@
     dec            r6d
     jnz            .loop
     RET
+%endif
 
 %macro ADDAVG_W16_H4_AVX2 1
 cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1296,6 +1302,7 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_W16_H4_AVX2 4
 ADDAVG_W16_H4_AVX2 8
 ADDAVG_W16_H4_AVX2 12
@@ -1303,7 +1310,9 @@
 ADDAVG_W16_H4_AVX2 24
 ADDAVG_W16_H4_AVX2 32
 ADDAVG_W16_H4_AVX2 64
-
+%endif
+
+%if ARCH_X86_64
 cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     mova        m4,              [pw_ %+ ADDAVG_ROUND]
     mova        m5,              [pw_pixel_max]
@@ -1415,6 +1424,7 @@
     dec         r6d
     jnz         .loop
     RET
+%endif
 
 %macro ADDAVG_W32_H2_AVX2 1
 cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1474,13 +1484,16 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_W32_H2_AVX2 8
 ADDAVG_W32_H2_AVX2 16
 ADDAVG_W32_H2_AVX2 24
 ADDAVG_W32_H2_AVX2 32
 ADDAVG_W32_H2_AVX2 48
 ADDAVG_W32_H2_AVX2 64
-
+%endif
+
+%if ARCH_X86_64
 cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     mova        m4,              [pw_ %+ ADDAVG_ROUND]
     mova        m5,              [pw_pixel_max]
@@ -1554,6 +1567,7 @@
     dec         r6d
     jnz        .loop
     RET
+%endif
 
 %macro ADDAVG_W64_H1_AVX2 1
 cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
@@ -1649,11 +1663,12 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_W64_H1_AVX2 16
 ADDAVG_W64_H1_AVX2 32
 ADDAVG_W64_H1_AVX2 48
 ADDAVG_W64_H1_AVX2 64
-
+%endif
 ;-----------------------------------------------------------------------------
 ;addAvg avx512 high bit depth code start
 ;-----------------------------------------------------------------------------
@@ -1875,6 +1890,7 @@
 ;-----------------------------------------------------------------------------
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal addAvg_16x4, 6,9,6
     vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
@@ -1889,6 +1905,7 @@
     lea         r8,        [3 * r5]
     PROCESS_ADDAVG_16x4_HBD_AVX512
     RET
+%endif
 
 %macro ADDAVG_W16_HBD_AVX512 1
 INIT_ZMM avx512
@@ -1914,12 +1931,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_W16_HBD_AVX512 8
 ADDAVG_W16_HBD_AVX512 12
 ADDAVG_W16_HBD_AVX512 16
 ADDAVG_W16_HBD_AVX512 24
 ADDAVG_W16_HBD_AVX512 32
 ADDAVG_W16_HBD_AVX512 64
+%endif
 
 %macro ADDAVG_W32_HBD_AVX512 1
 INIT_ZMM avx512
@@ -1945,12 +1964,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_W32_HBD_AVX512 8
 ADDAVG_W32_HBD_AVX512 16
 ADDAVG_W32_HBD_AVX512 24
 ADDAVG_W32_HBD_AVX512 32
 ADDAVG_W32_HBD_AVX512 48
 ADDAVG_W32_HBD_AVX512 64
+%endif
 
 %macro ADDAVG_W64_HBD_AVX512 1
 INIT_ZMM avx512
@@ -1976,11 +1997,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_W64_HBD_AVX512 16
 ADDAVG_W64_HBD_AVX512 32
 ADDAVG_W64_HBD_AVX512 48
 ADDAVG_W64_HBD_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal addAvg_48x64, 6,9,6
     vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
@@ -2002,6 +2026,7 @@
 %endrep
     PROCESS_ADDAVG_48x4_HBD_AVX512
     RET
+%endif
 
 %macro PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512 0
     movu              ym0,              [r0]
@@ -2221,6 +2246,7 @@
 ;-----------------------------------------------------------------------------
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal addAvg_aligned_16x4, 6,9,6
     vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
@@ -2235,6 +2261,7 @@
     lea         r8,        [3 * r5]
     PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
     RET
+%endif
 
 %macro ADDAVG_ALIGNED_W16_HBD_AVX512 1
 INIT_ZMM avx512
@@ -2260,12 +2287,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_ALIGNED_W16_HBD_AVX512 8
 ADDAVG_ALIGNED_W16_HBD_AVX512 12
 ADDAVG_ALIGNED_W16_HBD_AVX512 16
 ADDAVG_ALIGNED_W16_HBD_AVX512 24
 ADDAVG_ALIGNED_W16_HBD_AVX512 32
 ADDAVG_ALIGNED_W16_HBD_AVX512 64
+%endif
 
 %macro ADDAVG_ALIGNED_W32_HBD_AVX512 1
 INIT_ZMM avx512
@@ -2291,12 +2320,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_ALIGNED_W32_HBD_AVX512 8
 ADDAVG_ALIGNED_W32_HBD_AVX512 16
 ADDAVG_ALIGNED_W32_HBD_AVX512 24
 ADDAVG_ALIGNED_W32_HBD_AVX512 32
 ADDAVG_ALIGNED_W32_HBD_AVX512 48
 ADDAVG_ALIGNED_W32_HBD_AVX512 64
+%endif
 
 %macro ADDAVG_ALIGNED_W64_HBD_AVX512 1
 INIT_ZMM avx512
@@ -2322,11 +2353,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 ADDAVG_ALIGNED_W64_HBD_AVX512 16
 ADDAVG_ALIGNED_W64_HBD_AVX512 32
 ADDAVG_ALIGNED_W64_HBD_AVX512 48
 ADDAVG_ALIGNED_W64_HBD_AVX512 64
-
+%endif
+
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal addAvg_aligned_48x64, 6,9,6
     vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
@@ -2348,6 +2382,7 @@
 %endrep
     PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
     RET
+%endif
 ;-----------------------------------------------------------------------------
 ;addAvg avx512 high bit depth code end
 ;-----------------------------------------------------------------------------
@@ -6530,11 +6565,13 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 PIXEL_AVG_HBD_W32 8
 PIXEL_AVG_HBD_W32 16
 PIXEL_AVG_HBD_W32 24
 PIXEL_AVG_HBD_W32 32
 PIXEL_AVG_HBD_W32 64
+%endif
 
 %macro PIXEL_AVG_HBD_W64 1
 INIT_ZMM avx512
@@ -6556,11 +6593,14 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 PIXEL_AVG_HBD_W64 16
 PIXEL_AVG_HBD_W64 32
 PIXEL_AVG_HBD_W64 48
 PIXEL_AVG_HBD_W64 64
-
+%endif
+
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_avg_48x64, 6,9,4
     add     r1d, r1d
@@ -6578,6 +6618,7 @@
 %endrep
     PROCESS_PIXELAVG_48x8_HBD_AVX512
     RET
+%endif
 ;-----------------------------------------------------------------------------
 ;pixel_avg_pp avx512 high bit depth code end
 ;-----------------------------------------------------------------------------
@@ -6709,6 +6750,7 @@
     jg .height_loop
     RET
 
+%if ARCH_X86_64
 INIT_YMM avx2
 cglobal pixel_avg2_w20, 6,7
     sub    r2, r4
@@ -6725,6 +6767,7 @@
     sub    r5d, 2
     jg     .height_loop
     RET
+%endif
 
 ; Cacheline split code for processors with high latencies for loads
 ; split over cache lines.  See sad-a.asm for a more detailed explanation.
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -14055,6 +14055,7 @@
     ;lea %8, [%8+4*r3]
 %endmacro
 
+%if ARCH_X86_64
 INIT_YMM avx2
 cglobal pixel_satd_8x8, 4,4,7
 
@@ -14620,5 +14621,5 @@
 
     movd eax, xm0
     RET
-
+%endif
 %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -7707,8 +7707,13 @@
     paddd          xm5, xm1
     HADDW          xm4, xm2
     HADDD          xm5, xm1
+%if ARCH_X86_64
     punpckldq      xm4, xm5
     movq           rax, xm4
+%else
+    movd           eax, xm4
+    movd           edx, xm5
+%endif
 %endmacro
 
 %if HIGH_BIT_DEPTH==0
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/sad16-a.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -1292,6 +1292,7 @@
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_64x16, 4,6,7
     pxor    m0, m0
@@ -1399,10 +1400,12 @@
     PROCESS_SAD_64x8_AVX512
     PROCESS_SAD_AVX512_END
     RET
+%endif
 
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_32x8, 4,6,7
     pxor    m0, m0
@@ -1517,10 +1520,12 @@
     PROCESS_SAD_32x8_AVX512
     PROCESS_SAD_AVX512_END
     RET
+%endif
 
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_48x64, 4, 7, 9
     pxor    m0,  m0
@@ -1622,6 +1627,7 @@
 
     PROCESS_SAD_AVX512_END
     RET
+%endif
 
 ;=============================================================================
 ; SAD x3/x4
@@ -2611,7 +2617,7 @@
 ;------------------------------------------------------------------------------------------------------------------------------------------
 ; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
 ;------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_x3_32x8, 6,7,8
     pxor    m0,  m0
@@ -2970,11 +2976,12 @@
 
     PROCESS_SAD_X3_END_AVX512
     RET
+%endif
 
 ;------------------------------------------------------------------------------------------------------------------------------------------
 ; void pixel_sad_x3_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
 ;------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_x3_64x16, 6,7,12
     pxor    m0,  m0
@@ -3214,11 +3221,11 @@
     PROCESS_SAD_X3_64x4_AVX512
     PROCESS_SAD_X3_END_AVX512
     RET
-
+%endif
 ;------------------------------------------------------------------------------------------------------------------------------------------------------------
 ; void pixel_sad_x4_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
 ;------------------------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_x4_32x8, 6,8,10
     pxor    m0,  m0
@@ -3485,10 +3492,11 @@
     PROCESS_SAD_X4_32x4_AVX512
     PROCESS_SAD_X4_END_AVX512
     RET
-
+%endif
 ;------------------------------------------------------------------------------------------------------------------------------------------------------------
 ; void pixel_sad_x4_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
 ;------------------------------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_x4_48x64, 4, 9, 20
     pxor    m0,  m0
@@ -3644,11 +3652,12 @@
 
     PROCESS_SAD_X4_END_AVX512
     RET
+%endif
 
 ;------------------------------------------------------------------------------------------------------------------------------------------------------------
 ; void pixel_sad_x4_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
 ;------------------------------------------------------------------------------------------------------------------------------------------------------------
-
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_sad_x4_64x16, 6,8,15
     pxor    m0,  m0
@@ -3928,3 +3937,4 @@
     PROCESS_SAD_X4_64x4_AVX512
     PROCESS_SAD_X4_END_AVX512
     RET
+%endif
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -141,6 +141,8 @@
 
 ; Function to find ssd for 32x16 block, sse2, 12 bit depth
 ; Defined sepeartely to be called from SSD_ONE_32 macro
+%if ARCH_X86_64
+;This code is written for 64 bit architecture
 INIT_XMM sse2
 cglobal ssd_ss_32x16
     pxor        m8, m8
@@ -180,8 +182,10 @@
     paddq       m4, m5
     paddq       m9, m4
     ret
+%endif
 
 %macro SSD_ONE_32 0
+%if ARCH_X86_64
 cglobal pixel_ssd_ss_32x64, 4,7,10
     add         r1d, r1d
     add         r3d, r3d
@@ -193,7 +197,9 @@
     call        ssd_ss_32x16
     movq        rax, m9
     RET
+%endif
 %endmacro
+
 %macro SSD_ONE_SS_32 0
 cglobal pixel_ssd_ss_32x32, 4,5,8
     add         r1d, r1d
@@ -554,6 +560,7 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 INIT_YMM avx2
 cglobal pixel_ssd_16x16, 4,7,3
     FIX_STRIDES r1, r3
@@ -696,6 +703,7 @@
     paddq           xm3, xm4
     movq            rax, xm3
     RET
+%endif
 
 INIT_MMX mmx2
 SSD_ONE     4,  4
@@ -726,7 +734,9 @@
 %if BIT_DEPTH <= 10
     SSD_ONE    32, 64
     SSD_ONE    32, 32
+%if ARCH_X86_64
     SSD_TWO    64, 64
+%endif
 %else
     SSD_ONE_32
     SSD_ONE_SS_32
@@ -3246,7 +3256,7 @@
     movd    eax, m0
     RET
 
-
+%if ARCH_X86_64 && BIT_DEPTH >= 10
 INIT_XMM sse2
 cglobal pixel_ssd_s_32, 2,3,5
     add     r1, r1
@@ -3287,7 +3297,6 @@
     dec     r2d
     jnz    .loop
 
-%if BIT_DEPTH >= 10
     movu            m1, m0
     pxor            m2, m2
     punpckldq       m0, m2
@@ -3296,13 +3305,56 @@
     movhlps         m1, m0
     paddq           m0, m1
     movq            rax, xm0
-%else
+    RET
+%endif
+
+%if BIT_DEPTH == 8
+INIT_XMM sse2
+cglobal pixel_ssd_s_32, 2,3,5
+    add     r1, r1
+
+    mov     r2d, 16
+    pxor    m0, m0
+.loop:
+    movu    m1, [r0 + 0 * mmsize]
+    movu    m2, [r0 + 1 * mmsize]
+    movu    m3, [r0 + 2 * mmsize]
+    movu    m4, [r0 + 3 * mmsize]
+    add     r0, r1
+
+    pmaddwd m1, m1
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2
+    paddd   m3, m4
+    paddd   m1, m3
+    paddd   m0, m1
+
+    movu    m1, [r0 + 0 * mmsize]
+    movu    m2, [r0 + 1 * mmsize]
+    movu    m3, [r0 + 2 * mmsize]
+    movu    m4, [r0 + 3 * mmsize]
+    add     r0, r1
+
+    pmaddwd m1, m1
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2
+    paddd   m3, m4
+    paddd   m1, m3
+    paddd   m0, m1
+
+    dec     r2d
+    jnz    .loop
     ; calculate sum and return
     HADDD   m0, m1
     movd    eax, m0
+    RET
 %endif
-    RET
-
+
+%if ARCH_X86_64
 INIT_YMM avx2
 cglobal pixel_ssd_s_16, 2,4,5
     add     r1, r1
@@ -3389,7 +3441,7 @@
     movd    eax, xm0
 %endif
     RET
-
+%endif
 ;-----------------------------------------------------------------------------
 ; ssd_s avx512 code start
 ;-----------------------------------------------------------------------------
@@ -3447,6 +3499,7 @@
 ;-----------------------------------------------------------------------------
 ; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_ZMM avx512
 cglobal pixel_ssd_s_32, 2,4,5
     add     r1, r1
@@ -3495,6 +3548,7 @@
     HADDD   m0, m1
     movd    eax, xm0
     RET
+%endif
 ;-----------------------------------------------------------------------------
 ; ssd_s avx512 code end
 ;-----------------------------------------------------------------------------
diff -r dc2d7a2515fd -r 8173d05abf8d source/common/x86/v4-ipfilter16.asm
--- a/source/common/x86/v4-ipfilter16.asm	Fri Oct 06 16:38:18 2017 +0530
+++ b/source/common/x86/v4-ipfilter16.asm	Thu Apr 05 14:56:26 2018 -0700
@@ -2931,6 +2931,7 @@
     RET
 %endmacro
 
+%if ARCH_X86_64
 FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6
 FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS
 FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP
@@ -2939,6 +2940,7 @@
 FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS
 FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6
+%endif
 
 %macro FILTER_VER_CHROMA_AVX2_8x8 3
 INIT_YMM avx2


