[x265] [PATCH] asm: avx2 code for pixel_add_ps[64x64] for 10 bpp - 178x
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Mon Mar 2 09:55:13 CET 2015
# HG changeset patch
# User Sumalatha Polureddy<sumalatha at multicorewareinc.com>
# Date 1425286505 -19800
# Node ID 8d78c9dc559641fa6140ac44ccc6da8171e7fb14
# Parent 360caa3c6a3d41d2b9adc2c5bc64e2f5d611f52b
asm: avx2 code for pixel_add_ps[64x64] for 10 bpp - 178x
add_ps[64x64] 178.62x 5687.26 1015873.50
diff -r 360caa3c6a3d -r 8d78c9dc5596 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Mar 02 14:22:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 02 14:25:05 2015 +0530
@@ -1072,6 +1072,7 @@
p.cu[BLOCK_8x8].add_ps = x265_pixel_add_ps_8x8_avx2;
p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
+ p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
diff -r 360caa3c6a3d -r 8d78c9dc5596 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Mon Mar 02 14:22:51 2015 +0530
+++ b/source/common/x86/pixeladd8.asm Mon Mar 02 14:25:05 2015 +0530
@@ -952,6 +952,43 @@
jnz .loop
RET
+
+INIT_YMM avx2 ; AVX2 high-bit-depth add_ps, 64 samples wide: dst = clip(src0 + src1), one full row per loop pass
+cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+ mova m5, [pw_pixel_max] ; m5 = pw_pixel_max constant (defined elsewhere); passed to CLIPW2 as the upper bound
+ pxor m4, m4 ; m4 = 0; passed to CLIPW2 as the lower bound
+ mov r6d, %2 ; r6d = loop counter, initialised from macro argument %2 (the height in the function name)
+ add r4, r4 ; double srcStride0 (presumably element stride -> byte stride for 16-bit data; confirm with caller)
+ add r5, r5 ; double srcStride1, same reason
+ add r1, r1 ; double destride, same reason
+.loop:
+ movu m0, [r2] ; words 0..15 of the current src0 row
+ movu m2, [r2 + 32] ; words 16..31 of the current src0 row
+ movu m1, [r3] ; words 0..15 of the current src1 row
+ movu m3, [r3 + 32] ; words 16..31 of the current src1 row
+ paddw m0, m1 ; 16-bit lane-wise src0 + src1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5 ; x86inc-style helper (not shown here); presumably clamps both sums to [m4, m5]
+ movu [r0], m0 ; words 0..15 of the current dst row
+ movu [r0 + 32], m2 ; words 16..31 of the current dst row
+
+ movu m0, [r2 + 64] ; words 32..47 of the SAME src0 row (byte offset 64, not a new row)
+ movu m2, [r2 + 96] ; words 48..63 of the same src0 row
+ movu m1, [r3 + 64] ; words 32..47 of the same src1 row
+ movu m3, [r3 + 96] ; words 48..63 of the same src1 row
+ paddw m0, m1 ; 16-bit lane-wise src0 + src1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5 ; same clamp as above for the second half of the row
+ movu [r0 + 64], m0 ; words 32..47 of the same dst row
+ movu [r0 + 96], m2 ; words 48..63 of the same dst row
+
+ lea r2, [r2 + r4 * 1] ; src0 += doubled srcStride0 (advance exactly one row)
+ lea r3, [r3 + r5 * 1] ; src1 += doubled srcStride1
+ lea r0, [r0 + r1 * 1] ; dst += doubled destride
+
+ dec r6d ; one row finished
+ jnz .loop ; repeat until all %2 rows are processed
+ RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
More information about the x265-devel
mailing list