[x265] [PATCH] asm: frameInitLowres avx2 code for 8bpp and 10bpp
rajesh at multicorewareinc.com
Thu Jul 9 14:45:20 CEST 2015
# HG changeset patch
# User Rajesh Paulraj <rajesh at multicorewareinc.com>
# Date 1436444469 -19800
# Thu Jul 09 17:51:09 2015 +0530
# Node ID 55c41fad48cf4a3af08cecc55deccbd34aadd252
# Parent 83bc6fac1fb54e9d5241c5c10d8578811a355273
asm: frameInitLowres avx2 code for 8bpp and 10bpp
8bpp:
avx2: downscale  30.38x   22659.94   688378.63
avx:  downscale  18.92x   33242.29   628884.06
10bpp:
avx2: downscale  13.48x   51288.90   691165.69
avx:  downscale  10.84x   64374.10   697631.81
(columns: speedup over the C primitive, optimized time, C reference time;
e.g. 688378.63 / 22659.94 = 30.38x)
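
For readers not familiar with this primitive: frame_init_lowres_core builds the
four quarter-resolution planes (full-pel, half-H, half-V and half-center) that
lookahead uses for its half-pel motion estimation. Below is a minimal C sketch
of the scalar filter the AVX2 kernel accelerates, written in the style of the
x264/x265 C reference; treat names and exact code as illustrative rather than
as the x265 source. The asm replaces the per-pixel averaging with packed
pavgb/pavgw on full ymm registers, which is where the speedups above come from.

    #include <stdint.h>

    typedef uint8_t pixel;   /* 8bpp build; the 10bpp build uses uint16_t */

    /* two-stage rounded average, the same operation pavgb/pavgw perform */
    #define FILTER(a, b, c, d) \
        (((((a) + (b) + 1) >> 1) + (((c) + (d) + 1) >> 1) + 1) >> 1)

    static void frame_init_lowres_ref(const pixel* src0,
                                      pixel* dstf, pixel* dsth,
                                      pixel* dstv, pixel* dstc,
                                      intptr_t src_stride, intptr_t dst_stride,
                                      int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            const pixel* src1 = src0 + src_stride;
            const pixel* src2 = src1 + src_stride;
            for (int x = 0; x < width; x++)
            {
                dstf[x] = FILTER(src0[2*x],   src1[2*x],   src0[2*x+1], src1[2*x+1]);
                dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
                dstv[x] = FILTER(src1[2*x],   src2[2*x],   src1[2*x+1], src2[2*x+1]);
                dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
            }
            src0 += src_stride * 2;
            dstf += dst_stride;
            dsth += dst_stride;
            dstv += dst_stride;
            dstc += dst_stride;
        }
    }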
diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 09 17:51:09 2015 +0530
@@ -2116,6 +2116,8 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
+ p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+
ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes
p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>; // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4]
@@ -3558,6 +3560,8 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx2);
p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
+ p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+
if (cpuMask & X265_CPU_BMI2)
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
}
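
For context, frameInitLowres is a function pointer in the primitives table, and
the two hunks above install the same AVX2 symbol in the two AVX2 setup paths
used by the 8bpp and 10bpp builds of libx265. A rough sketch of the types
involved, reconstructed from memory of common.h/primitives.h rather than quoted
from the tree, so treat the exact signature as an assumption:

    #include <stdint.h>

    /* x265 compiles its sources twice: 8bpp with pixel = uint8_t, and
     * main10 with HIGH_BIT_DEPTH=1 and pixel = uint16_t. */
    #if HIGH_BIT_DEPTH
    typedef uint16_t pixel;   /* 10bpp: the asm takes the pavgw path */
    #else
    typedef uint8_t  pixel;   /* 8bpp:  the asm takes the pavgb path */
    #endif

    /* Assumed shape of the p.frameInitLowres entry that the hunks assign: */
    typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth,
                                pixel* dstv, pixel* dstc,
                                intptr_t src_stride, intptr_t dst_stride,
                                int width, int height);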
diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/mc-a2.asm Thu Jul 09 17:51:09 2015 +0530
@@ -692,7 +692,7 @@
%endmacro
%macro FILT32x4U 4
- mova m1, [r0+r5]
+ movu m1, [r0+r5]
pavgb m0, m1, [r0]
movu m3, [r0+r5+1]
pavgb m2, m3, [r0+1]
@@ -701,7 +701,7 @@
pavgb m0, m2
pavgb m1, m3
- mova m3, [r0+r5+mmsize]
+ movu m3, [r0+r5+mmsize]
pavgb m2, m3, [r0+mmsize]
movu m5, [r0+r5+1+mmsize]
pavgb m4, m5, [r0+1+mmsize]
@@ -722,10 +722,10 @@
vpermq m1, m4, q3120
vpermq m2, m2, q3120
vpermq m3, m5, q3120
- mova [%1], m0
- mova [%2], m1
- mova [%3], m2
- mova [%4], m3
+ movu [%1], m0
+ movu [%2], m1
+ movu [%3], m2
+ movu [%4], m3
%endmacro
%macro FILT16x2 4
@@ -796,8 +796,8 @@
%endmacro
%macro FILT8xA 4
- mova m3, [r0+%4+mmsize]
- mova m2, [r0+%4]
+ movu m3, [r0+%4+mmsize]
+ movu m2, [r0+%4]
pavgw m3, [r0+%4+r5+mmsize]
pavgw m2, [r0+%4+r5]
PALIGNR %1, m3, 2, m6
@@ -815,9 +815,13 @@
packssdw m3, %1
packssdw m5, m4
%endif
- mova [%2], m3
- mova [%3], m5
- mova %1, m2
+%if cpuflag(avx2)
+ vpermq m3, m3, q3120
+ vpermq m5, m5, q3120
+%endif
+ movu [%2], m3
+ movu [%3], m5
+ movu %1, m2
%endmacro
;-----------------------------------------------------------------------------
@@ -871,8 +875,8 @@
.vloop:
mov r6d, r7m
%ifnidn cpuname, mmx2
- mova m0, [r0]
- mova m1, [r0+r5]
+ movu m0, [r0]
+ movu m1, [r0+r5]
pavgw m0, m1
pavgw m1, [r0+r5*2]
%endif
@@ -977,7 +981,7 @@
FRAME_INIT_LOWRES
INIT_XMM xop
FRAME_INIT_LOWRES
-%if HIGH_BIT_DEPTH==0
+%if ARCH_X86_64 == 1
INIT_YMM avx2
FRAME_INIT_LOWRES
%endif
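
Two things worth noting about the mc-a2.asm hunks. First, the aligned mova
loads/stores become unaligned movu because, once FRAME_INIT_LOWRES is
instantiated with INIT_YMM, these are 32-byte accesses, and presumably neither
the source rows at r0+r5 nor the lowres destination planes are guaranteed to be
32-byte aligned (vmovdqa would fault); the vpermq q3120 added to FILT8xA undoes
the per-lane ordering that packssdw leaves when the macro runs on ymm
registers. Second, the build guard changes from 8bpp-only to x86-64-only, so
the AVX2 kernel is now assembled for both bit depths. A small standalone C
intrinsics illustration of the aligned/unaligned distinction (not x265 code):

    #include <immintrin.h>
    #include <stdint.h>

    /* mova on a ymm register assembles to vmovdqa: it requires a 32-byte
     * aligned address and raises #GP otherwise. */
    static inline __m256i load_aligned_32(const uint8_t* p)
    {
        return _mm256_load_si256((const __m256i*)p);
    }

    /* movu assembles to vmovdqu: any address is accepted, and on AVX2-class
     * cores it costs essentially the same as the aligned form when the data
     * happens to be aligned, so the switch does not give up speed. */
    static inline __m256i load_unaligned_32(const uint8_t* p)
    {
        return _mm256_loadu_si256((const __m256i*)p);
    }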
diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/mc.h
--- a/source/common/x86/mc.h Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/mc.h Thu Jul 09 17:51:09 2015 +0530
@@ -31,6 +31,7 @@
LOWRES(sse2)
LOWRES(ssse3)
LOWRES(avx)
+LOWRES(avx2)
LOWRES(xop)
#undef LOWRES
diff -r 83bc6fac1fb5 -r 55c41fad48cf source/common/x86/x86util.asm
--- a/source/common/x86/x86util.asm Wed Jul 08 13:35:39 2015 -0500
+++ b/source/common/x86/x86util.asm Thu Jul 09 17:51:09 2015 +0530
@@ -358,11 +358,11 @@
%if sizeof%1==32
; %3 = abcdefgh ijklmnop (lower address)
; %2 = ABCDEFGH IJKLMNOP (higher address)
-; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH
-%if %4 < 16
- palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA
+ vperm2i128 %4, %1, %2, q0003 ; %4 = ijklmnop ABCDEFGH
+%if %3 < 16
+ palignr %1, %4, %2, %3 ; %1 = bcdefghi jklmnopA
%else
- palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO
+ palignr %1, %2, %4, %3-16 ; %1 = pABCDEFG HIJKLMNO
%endif
%elif cpuflag(ssse3)
%if %0==5
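
The x86util.asm hunk reworks the ymm branch of PALIGNR so that it matches the
four-argument form FILT8xA uses (destination doubles as one source, last
argument is the scratch register). The underlying trick is unchanged: AVX2's
vpalignr only concatenates within each 128-bit lane, so a cross-lane byte shift
is assembled from a vperm2i128 lane swap plus the in-lane vpalignr. A
standalone intrinsics sketch of that idea, shown for the 2-byte shift FILT8xA
requests (illustrative only; the macro's operand roles and its >=16-byte branch
are not reproduced here):

    #include <immintrin.h>

    /* Shift the 64-byte concatenation (hi:lo) right by two bytes and keep the
     * low 32 bytes. vperm2i128 first builds a temporary holding the bytes
     * that straddle the lane boundary. */
    static inline __m256i concat_shift_right_2(__m256i lo, __m256i hi)
    {
        /* t = [ lo.high128 | hi.low128 ] */
        __m256i t = _mm256_permute2x128_si256(lo, hi, 0x21);
        /* per lane: (t_lane:lo_lane) >> 2 bytes  ->  bytes 2..33 of (hi:lo) */
        return _mm256_alignr_epi8(t, lo, 2);
    }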