[x265] [PATCH] asm: fix main12 avx2 for chroma_vpp/vps/vsp/vss

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Mon Oct 5 14:59:00 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1444041513 -19800
#      Mon Oct 05 16:08:33 2015 +0530
# Node ID 8dc9dfe33c370e5bc09863ab1062568662d46e37
# Parent  5f73ada8caa0c62cc7540799966bde7536861bf7
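asm: fix main12 avx2 for chroma_vpp/vps/vsp/vss

Load the rounding offsets into ymm registers with vbroadcasti128 instead of
mova, replace the hard-coded sp/ps constants (pd_524800, shifts of 10 and 2)
with INTERP_OFFSET_SP, INTERP_SHIFT_SP and INTERP_SHIFT_PS, and remove the
X265_DEPTH <= 10 guard around the AVX2 chroma vertical filter assignments in
asm-primitives.cpp.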

diff -r 5f73ada8caa0 -r 8dc9dfe33c37 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Oct 01 17:53:59 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Oct 05 16:08:33 2015 +0530
@@ -1881,7 +1881,6 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx2);
 
-#if X265_DEPTH <= 10
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vpp = PFX(interp_4tap_vert_pp_4x2_avx2);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vps = PFX(interp_4tap_vert_ps_4x2_avx2);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vsp = PFX(interp_4tap_vert_sp_4x2_avx2);
@@ -2161,7 +2160,6 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
-#endif
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
 
diff -r 5f73ada8caa0 -r 8dc9dfe33c37 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Thu Oct 01 17:53:59 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Mon Oct 05 16:08:33 2015 +0530
@@ -4869,7 +4869,7 @@
 %ifidn %2,pp
     vbroadcasti128  m8, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova            m8, [INTERP_OFFSET_SP]
+    vbroadcasti128  m8, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m8, [INTERP_OFFSET_PS]
 %endif
@@ -5011,11 +5011,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5183,11 +5183,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5325,11 +5325,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5456,11 +5456,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5609,11 +5609,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5732,11 +5732,11 @@
     mov       r4d, 32
 
 %ifidn %1, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %1, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %1, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -11537,7 +11537,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [pd_524800]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -11665,19 +11665,19 @@
     psrad           m4, 6
     psrad           m5, 6
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-    psrad           m4, 10
-    psrad           m5, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
-    psrad           m4, 2
-    psrad           m5, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m5, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m5, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -11736,11 +11736,11 @@
     psrad           m6, 6
     psrad           m7, 6
 %elifidn %1, sp
-    psrad           m6, 10
-    psrad           m7, 10
-%else
-    psrad           m6, 2
-    psrad           m7, 2
+    psrad           m6, INTERP_SHIFT_SP
+    psrad           m7, INTERP_SHIFT_SP
+%else
+    psrad           m6, INTERP_SHIFT_PS
+    psrad           m7, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -11814,23 +11814,23 @@
     psrad           m0, 6
     psrad           m1, 6
 %elifidn %1, sp
-    psrad           m8, 10
-    psrad           m9, 10
-    psrad           m10, 10
-    psrad           m11, 10
-    psrad           m12, 10
-    psrad           m13, 10
-    psrad           m0, 10
-    psrad           m1, 10
-%else
-    psrad           m8, 2
-    psrad           m9, 2
-    psrad           m10, 2
-    psrad           m11, 2
-    psrad           m12, 2
-    psrad           m13, 2
-    psrad           m0, 2
-    psrad           m1, 2
+    psrad           m8, INTERP_SHIFT_SP
+    psrad           m9, INTERP_SHIFT_SP
+    psrad           m10, INTERP_SHIFT_SP
+    psrad           m11, INTERP_SHIFT_SP
+    psrad           m12, INTERP_SHIFT_SP
+    psrad           m13, INTERP_SHIFT_SP
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+%else
+    psrad           m8, INTERP_SHIFT_PS
+    psrad           m9, INTERP_SHIFT_PS
+    psrad           m10, INTERP_SHIFT_PS
+    psrad           m11, INTERP_SHIFT_PS
+    psrad           m12, INTERP_SHIFT_PS
+    psrad           m13, INTERP_SHIFT_PS
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -11954,7 +11954,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -11966,8 +11966,8 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_8x2 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_8x2 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_8x2 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_8x2 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_8x2 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_8x2 ss, 0, 6
 
 %macro FILTER_VER_CHROMA_AVX2_4x2 3
@@ -11991,7 +11991,7 @@
 %ifidn %1,pp
     vbroadcasti128  m6, [pd_32]
 %elifidn %1, sp
-    mova            m6, [pd_524800]
+    vbroadcasti128  m6, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
 %endif
@@ -12033,8 +12033,8 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_4x2 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_4x2 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_4x2 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_4x2 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_4x2 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_4x2 ss, 0, 6
 
 %macro FILTER_VER_CHROMA_AVX2_4x4 3
@@ -12058,7 +12058,7 @@
 %ifidn %1,pp
    vbroadcasti128  m6, [pd_32]
 %elifidn %1, sp
-    mova            m6, [pd_524800]
+   vbroadcasti128  m6, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
 %endif
@@ -12112,8 +12112,8 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_4x4 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_4x4 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_4x4 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_4x4 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_4x4 ss, 0, 6
 
 
@@ -12138,7 +12138,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -12225,8 +12225,8 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_4x8 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_4x8 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_4x8 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_4x8 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_4x8 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_4x8 ss, 0 , 6
 
 %macro PROCESS_LUMA_AVX2_W4_16R_4TAP 3
@@ -12396,7 +12396,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -12410,12 +12410,12 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6
-FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, 2
-FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, 10
+FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_4xN ss, 16, 0, 6
 FILTER_VER_CHROMA_AVX2_4xN pp, 32, 1, 6
-FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, 2
-FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, 10
+FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6
 
 %macro FILTER_VER_CHROMA_AVX2_8x8 3
@@ -12429,7 +12429,7 @@
 
 %ifdef PIC
     lea             r5, [tab_ChromaCoeffVer]
-   add             r5, r4
+    add             r5, r4
 %else
     lea             r5, [tab_ChromaCoeffVer + r4]
 %endif
@@ -12440,7 +12440,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [pd_524800]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -12569,8 +12569,8 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_8x8 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_8x8 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_8x8 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_8x8 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_8x8 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_8x8 ss, 0, 6
 
 %macro FILTER_VER_CHROMA_AVX2_8x6 3
@@ -12595,7 +12595,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [pd_524800]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -12700,8 +12700,8 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_8x6 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_8x6 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_8x6 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_8x6 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_8x6 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_8x6 ss, 0, 6
 
 %macro PROCESS_CHROMA_AVX2 3
@@ -12785,7 +12785,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -12799,8 +12799,8 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_8x4 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_8x4 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_8x4 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_8x4 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_8x4 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_8x4 ss, 0, 6
 
 %macro FILTER_VER_CHROMA_AVX2_8x12 3
@@ -12824,7 +12824,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [pd_524800]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -13002,6 +13002,6 @@
 %endmacro
 
 FILTER_VER_CHROMA_AVX2_8x12 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_8x12 ps, 0, 2
-FILTER_VER_CHROMA_AVX2_8x12 sp, 1, 10
+FILTER_VER_CHROMA_AVX2_8x12 ps, 0, INTERP_SHIFT_PS
+FILTER_VER_CHROMA_AVX2_8x12 sp, 1, INTERP_SHIFT_SP
 FILTER_VER_CHROMA_AVX2_8x12 ss, 0, 6


More information about the x265-devel mailing list