[x265] [PATCH] asm: fix Main12 luma_hps avx2
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Thu Oct 1 09:17:21 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1443683779 -19800
# Thu Oct 01 12:46:19 2015 +0530
# Node ID b2889a2a87f8194fa5587496e8f5752ca13b8d9f
# Parent 6e7761bdfe23addb862483f8407b388800de7d92
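asm: fix Main12 luma_hps avx2

Replace the hard-coded right shift of 2 in the AVX2 luma_hps kernels with
INTERP_SHIFT_PS, and remove the X265_DEPTH <= 10 guard around the luma_hps
AVX2 assignments so they are no longer restricted to bit depths of 10 or less.
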
luma_hps[ 4x4] - improved 1182.34c -> 914.36c
luma_hps[ 4x8] - improved 1574.37c -> 1215.93c
luma_hps[ 4x16] - improved 2420.23c -> 1869.97c
luma_hps[ 8x4] - improved 2271.52c -> 1271.69c
luma_hps[ 8x8] - improved 3069.59c -> 1720.41c
luma_hps[ 8x16] - improved 4661.46c -> 2654.03c
luma_hps[ 8x32] - improved 7997.91c -> 4522.86c
luma_hps[12x16] - improved 6861.77c -> 4269.89c
luma_hps[ 16x4] - improved 4477.89c -> 2475.53c
luma_hps[ 16x8] - improved 6068.55c -> 3345.52c
luma_hps[16x12] - improved 7674.40c -> 4256.70c
luma_hps[16x16] - improved 9276.08c -> 5133.52c
luma_hps[16x32] - improved 15345.88c -> 8952.43c
luma_hps[16x64] - improved 28760.98c -> 16185.94c
luma_hps[24x32] - improved 23099.60c -> 12639.93c
luma_hps[ 32x8] - improved 11985.72c -> 6633.61c
luma_hps[32x16] - improved 18333.05c -> 10195.87c
luma_hps[32x24] - improved 24897.07c -> 13669.38c
luma_hps[32x32] - improved 31639.04c -> 17052.17c
luma_hps[32x64] - improved 59536.79c -> 32469.87c
luma_hps[48x64] - improved 85916.74c -> 48233.44c
luma_hps[64x16] - improved 40331.41c -> 20956.46c
luma_hps[64x32] - improved 63250.45c -> 35248.38c
luma_hps[64x48] - improved 95125.35c -> 52035.05c
luma_hps[64x64] - improved 117354.90c -> 66846.09c
diff -r 6e7761bdfe23 -r b2889a2a87f8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 30 14:57:15 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 01 12:46:19 2015 +0530
@@ -1711,7 +1711,6 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
-#if X265_DEPTH <= 10
p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
@@ -1737,7 +1736,6 @@
p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
-#endif
p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
diff -r 6e7761bdfe23 -r b2889a2a87f8 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Wed Sep 30 14:57:15 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Thu Oct 01 12:46:19 2015 +0530
@@ -10342,8 +10342,8 @@
vpermd m3, m5, m3
paddd m3, m2
vextracti128 xm4, m3, 1
- psrad xm3, 2
- psrad xm4, 2
+ psrad xm3, INTERP_SHIFT_PS
+ psrad xm4, INTERP_SHIFT_PS
packssdw xm3, xm3
packssdw xm4, xm4
@@ -10375,8 +10375,8 @@
vpermd m3, m5, m3
paddd m3, m2
vextracti128 xm4, m3, 1
- psrad xm3, 2
- psrad xm4, 2
+ psrad xm3, INTERP_SHIFT_PS
+ psrad xm4, INTERP_SHIFT_PS
packssdw xm3, xm3
packssdw xm4, xm4
@@ -10441,8 +10441,8 @@
vpermq m4, m4, q3120
paddd m4, m2
vextracti128 xm5,m4, 1
- psrad xm4, 2
- psrad xm5, 2
+ psrad xm4, INTERP_SHIFT_PS
+ psrad xm5, INTERP_SHIFT_PS
packssdw xm4, xm5
movu [r2], xm4
@@ -10511,8 +10511,8 @@
vpermq m4, m4, q3120
paddd m4, m2
vextracti128 xm5,m4, 1
- psrad xm4, 2
- psrad xm5, 2
+ psrad xm4, INTERP_SHIFT_PS
+ psrad xm5, INTERP_SHIFT_PS
packssdw xm4, xm5
movu [r2 + x], xm4
@@ -10583,8 +10583,8 @@
vpermq m4, m4, q3120
paddd m4, m2
vextracti128 xm5,m4, 1
- psrad xm4, 2
- psrad xm5, 2
+ psrad xm4, INTERP_SHIFT_PS
+ psrad xm5, INTERP_SHIFT_PS
packssdw xm4, xm5
movu [r2 + x], xm4
@@ -10609,8 +10609,8 @@
vpermq m6, m6, q3120
paddd m6, m2
vextracti128 xm5,m6, 1
- psrad xm6, 2
- psrad xm5, 2
+ psrad xm6, INTERP_SHIFT_PS
+ psrad xm5, INTERP_SHIFT_PS
packssdw xm6, xm5
movu [r2 + 16 + x], xm6
@@ -10690,8 +10690,8 @@
vpermq m4, m4, q3120
paddd m4, m2
vextracti128 xm5, m4, 1
- psrad xm4, 2
- psrad xm5, 2
+ psrad xm4, INTERP_SHIFT_PS
+ psrad xm5, INTERP_SHIFT_PS
packssdw xm4, xm5
movu [r2], xm4
@@ -10713,8 +10713,8 @@
vpermq m6, m6, q3120
paddd m6, m2
vextracti128 xm5,m6, 1
- psrad xm6, 2
- psrad xm5, 2
+ psrad xm6, INTERP_SHIFT_PS
+ psrad xm5, INTERP_SHIFT_PS
packssdw xm6, xm5
movu [r2 + 16], xm6
@@ -10783,8 +10783,8 @@
vpermq m4, m4, q3120
paddd m4, m2
vextracti128 xm5,m4, 1
- psrad xm4, 2
- psrad xm5, 2
+ psrad xm4, INTERP_SHIFT_PS
+ psrad xm5, INTERP_SHIFT_PS
packssdw xm4, xm5
movu [r2], xm4
@@ -10798,7 +10798,7 @@
phaddd m6, m6
vpermq m6, m6, q3120
paddd xm6, xm2
- psrad xm6, 2
+ psrad xm6, INTERP_SHIFT_PS
packssdw xm6, xm6
movq [r2 + 16], xm6
More information about the x265-devel mailing list