[x265] [PATCH] asm: frameInitLowres avx2 code for 8bpp and 10bpp

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Thu Jul 9 14:45:20 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1436444469 -19800
#      Thu Jul 09 17:51:09 2015 +0530
# Node ID 55c41fad48cf4a3af08cecc55deccbd34aadd252
# Parent  83bc6fac1fb54e9d5241c5c10d8578811a355273
asm: frameInitLowres avx2 code for 8bpp and 10bpp

Testbench results (columns: speedup over C, optimized time, C reference time):

8bpp:
avx2: downscale  30.38x   22659.94        688378.63
avx : downscale  18.92x   33242.29        628884.06

10bpp:
avx2: downscale  13.48x   51288.90        691165.69
avx : downscale  10.84x   64374.10        697631.81

diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Jul 09 17:51:09 2015 +0530
@@ -2116,6 +2116,8 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
 
+        p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
 
@@ -3558,6 +3560,8 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
 
+        p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
     }
diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm	Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/mc-a2.asm	Thu Jul 09 17:51:09 2015 +0530
@@ -692,7 +692,7 @@
 %endmacro
 
 %macro FILT32x4U 4
-    mova      m1, [r0+r5]
+    movu      m1, [r0+r5]
     pavgb     m0, m1, [r0]
     movu      m3, [r0+r5+1]
     pavgb     m2, m3, [r0+1]
@@ -701,7 +701,7 @@
     pavgb     m0, m2
     pavgb     m1, m3
 
-    mova      m3, [r0+r5+mmsize]
+    movu      m3, [r0+r5+mmsize]
     pavgb     m2, m3, [r0+mmsize]
     movu      m5, [r0+r5+1+mmsize]
     pavgb     m4, m5, [r0+1+mmsize]
@@ -722,10 +722,10 @@
     vpermq    m1, m4, q3120
     vpermq    m2, m2, q3120
     vpermq    m3, m5, q3120
-    mova    [%1], m0
-    mova    [%2], m1
-    mova    [%3], m2
-    mova    [%4], m3
+    movu    [%1], m0
+    movu    [%2], m1
+    movu    [%3], m2
+    movu    [%4], m3
 %endmacro
 
 %macro FILT16x2 4
@@ -796,8 +796,8 @@
 %endmacro
 
 %macro FILT8xA 4
-    mova      m3, [r0+%4+mmsize]
-    mova      m2, [r0+%4]
+    movu      m3, [r0+%4+mmsize]
+    movu      m2, [r0+%4]
     pavgw     m3, [r0+%4+r5+mmsize]
     pavgw     m2, [r0+%4+r5]
     PALIGNR   %1, m3, 2, m6
@@ -815,9 +815,13 @@
     packssdw  m3, %1
     packssdw  m5, m4
 %endif
-    mova    [%2], m3
-    mova    [%3], m5
-    mova      %1, m2
+%if cpuflag(avx2)
+    vpermq     m3, m3, q3120
+    vpermq     m5, m5, q3120
+%endif
+    movu    [%2], m3
+    movu    [%3], m5
+    movu      %1, m2
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -871,8 +875,8 @@
 .vloop:
     mov      r6d, r7m
 %ifnidn cpuname, mmx2
-    mova      m0, [r0]
-    mova      m1, [r0+r5]
+    movu      m0, [r0]
+    movu      m1, [r0+r5]
     pavgw     m0, m1
     pavgw     m1, [r0+r5*2]
 %endif
@@ -977,7 +981,7 @@
 FRAME_INIT_LOWRES
 INIT_XMM xop
 FRAME_INIT_LOWRES
-%if HIGH_BIT_DEPTH==0
+%if ARCH_X86_64 == 1
 INIT_YMM avx2
 FRAME_INIT_LOWRES
 %endif
diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/mc.h
--- a/source/common/x86/mc.h	Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/mc.h	Thu Jul 09 17:51:09 2015 +0530
@@ -31,6 +31,7 @@
 LOWRES(sse2)
 LOWRES(ssse3)
 LOWRES(avx)
+LOWRES(avx2)
 LOWRES(xop)
 
 #undef LOWRES
diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/x86util.asm
--- a/source/common/x86/x86util.asm	Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/x86util.asm	Thu Jul 09 17:51:09 2015 +0530
@@ -358,11 +358,11 @@
 %if sizeof%1==32
                                  ; %3 = abcdefgh ijklmnop (lower address)
                                  ; %2 = ABCDEFGH IJKLMNOP (higher address)
-;   vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH
-%if %4 < 16
-    palignr    %1, %5, %3, %4    ; %1 = bcdefghi jklmnopA
+    vperm2i128 %4, %1, %2, q0003 ; %4 = ijklmnop ABCDEFGH
+%if %3 < 16
+    palignr    %1, %4, %2, %3    ; %1 = bcdefghi jklmnopA
 %else
-    palignr    %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO
+    palignr    %1, %2, %4, %3-16 ; %1 = pABCDEFG HIJKLMNO
 %endif
 %elif cpuflag(ssse3)
     %if %0==5


More information about the x265-devel mailing list