[x265] [PATCH] asm: avx2 code for sad_x4[16xN] for 10 bpp
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Tue May 19 09:01:19 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1432018871 -19800
# Tue May 19 12:31:11 2015 +0530
# Node ID 7423bf9989d3def6f009a2dc813ac245d9789100
# Parent fd1f061f22290c209560abc5fd02d6401477861a
asm: avx2 code for sad_x4[16xN] for 10 bpp
sse2 (columns: speedup over reference, optimized time, reference time):
sad_x4[ 16x4] 2.80x 976.64 2730.64
sad_x4[ 16x8] 2.97x 1718.50 5111.16
sad_x4[16x12] 3.04x 2475.38 7525.02
sad_x4[16x16] 3.09x 3122.67 9651.31
sad_x4[16x32] 2.83x 6974.52 19741.04
sad_x4[16x64] 3.07x 12935.32 39669.09
avx2 (columns: speedup over reference, optimized time, reference time):
sad_x4[ 16x4] 4.93x 518.46 2555.28
sad_x4[ 16x8] 5.91x 852.26 5038.35
sad_x4[16x12] 6.30x 1185.09 7470.80
sad_x4[16x16] 6.27x 1533.31 9617.03
sad_x4[16x32] 5.82x 3501.26 20373.02
sad_x4[16x64] 6.60x 6106.51 40281.86
diff -r fd1f061f2229 -r 7423bf9989d3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue May 19 10:40:00 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue May 19 12:31:11 2015 +0530
@@ -1346,6 +1346,13 @@
p.pu[LUMA_64x48].sad_x3 = x265_pixel_sad_x3_64x48_avx2;
p.pu[LUMA_64x64].sad_x3 = x265_pixel_sad_x3_64x64_avx2;
+ p.pu[LUMA_16x4].sad_x4 = x265_pixel_sad_x4_16x4_avx2;
+ p.pu[LUMA_16x8].sad_x4 = x265_pixel_sad_x4_16x8_avx2;
+ p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_avx2;
+ p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_avx2;
+ p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_avx2;
+ p.pu[LUMA_16x64].sad_x4 = x265_pixel_sad_x4_16x64_avx2;
+
p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
p.pu[LUMA_16x12].convert_p2s = x265_filterPixelToShort_16x12_avx2;
diff -r fd1f061f2229 -r 7423bf9989d3 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Tue May 19 10:40:00 2015 +0530
+++ b/source/common/x86/sad16-a.asm Tue May 19 12:31:11 2015 +0530
@@ -1502,6 +1502,10 @@
SAD_X 3, 64, 48
SAD_X 3, 64, 64
%define XMM_REGS 9
-SAD_X 4, 16, 16
+SAD_X 4, 16, 4
SAD_X 4, 16, 8
+SAD_X 4, 16, 12
+SAD_X 4, 16, 16
+SAD_X 4, 16, 32
+SAD_X 4, 16, 64
More information about the x265-devel
mailing list