[x265] [PATCH] asm: fix Main12 luma_hps avx2

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Thu Oct 1 09:17:21 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1443683779 -19800
#      Thu Oct 01 12:46:19 2015 +0530
# Node ID b2889a2a87f8194fa5587496e8f5752ca13b8d9f
# Parent  6e7761bdfe23addb862483f8407b388800de7d92
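asm: fix Main12 luma_hps avx2

Replace the hard-coded right shift of 2 in the AVX2 luma_hps code in
ipfilter16.asm with INTERP_SHIFT_PS, and remove the X265_DEPTH <= 10 guard in
asm-primitives.cpp so these primitives are also enabled for Main12 builds.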

luma_hps[  4x4] - improved 1182.34c -> 914.36c
luma_hps[  4x8] - improved 1574.37c -> 1215.93c
luma_hps[ 4x16] - improved 2420.23c -> 1869.97c
luma_hps[  8x4] - improved 2271.52c -> 1271.69c
luma_hps[  8x8] - improved 3069.59c -> 1720.41c
luma_hps[ 8x16] - improved 4661.46c -> 2654.03c
luma_hps[ 8x32] - improved 7997.91c -> 4522.86c
luma_hps[12x16] - improved 6861.77c -> 4269.89c
luma_hps[ 16x4] - improved 4477.89c -> 2475.53c
luma_hps[ 16x8] - improved 6068.55c -> 3345.52c
luma_hps[16x12] - improved 7674.40c -> 4256.70c
luma_hps[16x16] - improved 9276.08c -> 5133.52c
luma_hps[16x32] - improved 15345.88c -> 8952.43c
luma_hps[16x64] - improved 28760.98c -> 16185.94c
luma_hps[24x32] - improved 23099.60c -> 12639.93c
luma_hps[ 32x8] - improved 11985.72c -> 6633.61c
luma_hps[32x16] - improved 18333.05c -> 10195.87c
luma_hps[32x24] - improved 24897.07c -> 13669.38c
luma_hps[32x32] - improved 31639.04c -> 17052.17c
luma_hps[32x64] - improved 59536.79c -> 32469.87c
luma_hps[48x64] - improved 85916.74c -> 48233.44c
luma_hps[64x16] - improved 40331.41c -> 20956.46c
luma_hps[64x32] - improved 63250.45c -> 35248.38c
luma_hps[64x48] - improved 95125.35c -> 52035.05c
luma_hps[64x64] - improved 117354.90c -> 66846.09c

diff -r 6e7761bdfe23 -r b2889a2a87f8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 30 14:57:15 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 01 12:46:19 2015 +0530
@@ -1711,7 +1711,6 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
 
-#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
         p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
@@ -1737,7 +1736,6 @@
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
         p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
         p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
-#endif
 
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
         p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
diff -r 6e7761bdfe23 -r b2889a2a87f8 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Wed Sep 30 14:57:15 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Thu Oct 01 12:46:19 2015 +0530
@@ -10342,8 +10342,8 @@
     vpermd                      m3,                m5,                  m3
     paddd                       m3,                m2
     vextracti128                xm4,               m3,                  1
-    psrad                       xm3,               2
-    psrad                       xm4,               2
+    psrad                       xm3,               INTERP_SHIFT_PS
+    psrad                       xm4,               INTERP_SHIFT_PS
     packssdw                    xm3,               xm3
     packssdw                    xm4,               xm4
 
@@ -10375,8 +10375,8 @@
     vpermd                      m3,                m5,                  m3
     paddd                       m3,                m2
     vextracti128                xm4,               m3,                  1
-    psrad                       xm3,               2
-    psrad                       xm4,               2
+    psrad                       xm3,               INTERP_SHIFT_PS
+    psrad                       xm4,               INTERP_SHIFT_PS
     packssdw                    xm3,               xm3
     packssdw                    xm4,               xm4
 
@@ -10441,8 +10441,8 @@
     vpermq              m4, m4, q3120
     paddd               m4, m2
     vextracti128        xm5,m4, 1
-    psrad               xm4, 2
-    psrad               xm5, 2
+    psrad               xm4, INTERP_SHIFT_PS
+    psrad               xm5, INTERP_SHIFT_PS
     packssdw            xm4, xm5
 
     movu                [r2], xm4
@@ -10511,8 +10511,8 @@
     vpermq              m4, m4, q3120
     paddd               m4, m2
     vextracti128        xm5,m4, 1
-    psrad               xm4, 2
-    psrad               xm5, 2
+    psrad               xm4, INTERP_SHIFT_PS
+    psrad               xm5, INTERP_SHIFT_PS
     packssdw            xm4, xm5
 
     movu                [r2 + x], xm4
@@ -10583,8 +10583,8 @@
     vpermq              m4, m4, q3120
     paddd               m4, m2
     vextracti128        xm5,m4, 1
-    psrad               xm4, 2
-    psrad               xm5, 2
+    psrad               xm4, INTERP_SHIFT_PS
+    psrad               xm5, INTERP_SHIFT_PS
     packssdw            xm4, xm5
 
     movu                [r2 + x], xm4
@@ -10609,8 +10609,8 @@
     vpermq              m6, m6, q3120
     paddd               m6, m2
     vextracti128        xm5,m6, 1
-    psrad               xm6, 2
-    psrad               xm5, 2
+    psrad               xm6, INTERP_SHIFT_PS
+    psrad               xm5, INTERP_SHIFT_PS
     packssdw            xm6, xm5
 
     movu                [r2 + 16 + x], xm6
@@ -10690,8 +10690,8 @@
     vpermq              m4, m4, q3120
     paddd               m4, m2
     vextracti128        xm5, m4, 1
-    psrad               xm4, 2
-    psrad               xm5, 2
+    psrad               xm4, INTERP_SHIFT_PS
+    psrad               xm5, INTERP_SHIFT_PS
     packssdw            xm4, xm5
     movu                [r2], xm4
 
@@ -10713,8 +10713,8 @@
     vpermq              m6, m6, q3120
     paddd               m6, m2
     vextracti128        xm5,m6, 1
-    psrad               xm6, 2
-    psrad               xm5, 2
+    psrad               xm6, INTERP_SHIFT_PS
+    psrad               xm5, INTERP_SHIFT_PS
     packssdw            xm6, xm5
     movu                [r2 + 16], xm6
 
@@ -10783,8 +10783,8 @@
     vpermq              m4, m4, q3120
     paddd               m4, m2
     vextracti128        xm5,m4, 1
-    psrad               xm4, 2
-    psrad               xm5, 2
+    psrad               xm4, INTERP_SHIFT_PS
+    psrad               xm5, INTERP_SHIFT_PS
     packssdw            xm4, xm5
     movu                [r2], xm4
 
@@ -10798,7 +10798,7 @@
     phaddd              m6, m6
     vpermq              m6, m6, q3120
     paddd               xm6, xm2
-    psrad               xm6, 2
+    psrad               xm6, INTERP_SHIFT_PS
     packssdw            xm6, xm6
     movq                [r2 + 16], xm6
 


More information about the x265-devel mailing list