<div dir="ltr">Kindly don't consider this patch. I will merge both AVX and AVX2 primitives (to be disabled) and will resend.<div>Apologies for the confusion.</div><div><br></div><div><br></div><div>Thanks,</div><div>Aasaipriya</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Aug 17, 2015 at 11:57 AM,  <span dir="ltr"><<a href="mailto:aasaipriya@multicorewareinc.com" target="_blank">aasaipriya@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Aasaipriya Chandran <<a href="mailto:aasaipriya@multicorewareinc.com">aasaipriya@multicorewareinc.com</a>><br>
# Date 1439792797 -19800<br>
#      Mon Aug 17 11:56:37 2015 +0530<br>
# Node ID 6364dbfbf62a504d2899831ed8832ff7d5f2fa91<br>
# Parent  d56b2466c04459205287e1581d8a36eebf372ba6<br>
asm: disabled 10bpp AVX primitives having less than 3% speed-up over SSE<br>
<br>
These AVX primitives are slower than their SSE counterparts.<br>
<br>
diff -r d56b2466c044 -r 6364dbfbf62a source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp      Wed Aug 12 18:12:20 2015 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp      Mon Aug 17 11:56:37 2015 +0530<br>
@@ -1169,7 +1169,6 @@<br>
     }<br>
     if (cpuMask & X265_CPU_AVX)<br>
     {<br>
-        // p.pu[LUMA_4x4].satd = <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); fails tests<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_avx);<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx);<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_avx);<br>
@@ -1177,32 +1176,40 @@<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_avx);<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_avx);<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_avx);<br>
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);<br>
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);<br>
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);<br>
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);<br>
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx);<br>
-<br>
-        ALL_LUMA_PU(satd, pixel_satd, avx);<br>
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx);<br>
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx);<br>
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx);<br>
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_avx);<br>
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_avx);<br>
         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);<br>
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);<br>
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);<br>
-#if X265_DEPTH <= 10<br>
-        ASSIGN_SA8D(avx);<br>
-#endif<br>
+<br>
+        p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx);<br>
+        p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx);<br>
+        p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx);<br>
+        p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx);<br>
+        p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx);<br>
+        p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx);<br>
+        p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx);<br>
+        p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx);<br>
+        p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx);<br>
+        p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx);<br>
+        p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx);<br>
+        p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx);<br>
+        p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx);<br>
+        p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx);<br>
+        p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx);<br>
+        p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_avx);<br>
+        p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx);<br>
+        p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx);<br>
+        p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx);<br>
+        p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx);<br>
+<br>
         p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);<br>
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);<br>
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);<br>
-        LUMA_VAR(avx);<br>
-        p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);<br>
-        p.ssim_end_4 = PFX(pixel_ssim_end4_avx);<br>
+<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_16x16].var = PFX(pixel_var_16x16_avx);<br>
<br>
         // copy_pp primitives<br>
         // 16 x N<br>
@@ -1299,6 +1306,30 @@<br>
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);<br>
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);<br>
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);<br>
+<br>
+        /* The following primitives have been disabled since performance compared to SSE4 is negligible/negative */<br>
+#if 0<br>
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);<br>
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);<br>
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);<br>
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);<br>
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);<br>
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx);<br>
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx);<br>
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_avx);<br>
+<br>
+        ALL_LUMA_PU(satd, pixel_satd, avx);<br>
+<br>
+        LUMA_VAR(avx);<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_8x8].var   = PFX(pixel_var_8x8_avx);<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_32x32].var = PFX(pixel_var_32x32_avx);<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_64x64].var = PFX(pixel_var_64x64_avx);<br>
+<br>
+        ASSIGN_SA8D(avx);<br>
+<br>
+        p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);<br>
+        p.ssim_end_4 = PFX(pixel_ssim_end4_avx);<br>
+#endif<br>
     }<br>
     if (cpuMask & X265_CPU_XOP)<br>
     {<br>
</blockquote></div><br></div>