[x265] [PATCH] asm: avx2 10bit code for scale2D_64to32
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Wed Apr 22 16:23:12 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1429712526 -19800
# Wed Apr 22 19:52:06 2015 +0530
# Node ID eb7ba0bea5b6bac28ea39030062c31b5ed504487
# Parent 86268e498680951069c48b681eef830b0aa37873
asm: avx2 10bit code for scale2D_64to32
AVX2:
scale2D_64to32 17.07x 3873.16 44301.99
SSSE3:
scale2D_64to32 2.75x 14407.30 39553.04
diff -r 86268e498680 -r eb7ba0bea5b6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 22 19:52:06 2015 +0530
@@ -1196,6 +1196,7 @@
p.dequant_normal = x265_dequant_normal_avx2;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+ p.scale2D_64to32 = x265_scale2D_64to32_avx2;
// p.weight_pp = x265_weight_pp_avx2; fails tests
p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
diff -r 86268e498680 -r eb7ba0bea5b6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Wed Apr 22 19:52:06 2015 +0530
@@ -4022,6 +4022,50 @@
RET
%endif
+;-----------------------------------------------------------------
+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+;-----------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal scale2D_64to32, 3, 4, 5, dest, src, stride
+ mov r3d, 32 ; r3d = loop counter: 32 passes, one dst row per pass
+ add r2, r2 ; stride: pixels -> bytes (16-bit words); full-width add so an intptr_t stride keeps its sign/upper bits
+ mova m4, [pw_2000] ; pw_2000 is defined elsewhere; presumably words of 0x2000, so pmulhrsw yields (x + 2) >> 2 - confirm
+ ; each pass folds src rows [r1] and [r1 + r2] (4 ymm = 64 words each) into one dst row of 32 words
+.loop:
+ movu m0, [r1] ; upper row, words 0..15
+ movu m1, [r1 + 1 * mmsize] ; upper row, words 16..31
+ movu m2, [r1 + r2] ; lower row, words 0..15
+ movu m3, [r1 + r2 + 1 * mmsize] ; lower row, words 16..31
+
+ paddw m0, m2 ; vertical sums of the two rows
+ paddw m1, m3
+ phaddw m0, m1 ; add horizontal pairs -> 2x2 sums; each 128-bit lane holds m0 sums then m1 sums
+
+ pmulhrsw m0, m4 ; scale the 2x2 sums by the pw_2000 factor (rounded)
+ vpermq m0, m0, q3120 ; swap the two middle qwords to undo phaddw's per-lane interleave
+ movu [r0], m0 ; dst words 0..15
+ ; second half of the row pair: same steps for src words 32..63 -> dst words 16..31
+ movu m0, [r1 + 2 * mmsize]
+ movu m1, [r1 + 3 * mmsize]
+ movu m2, [r1 + r2 + 2 * mmsize]
+ movu m3, [r1 + r2 + 3 * mmsize]
+
+ paddw m0, m2
+ paddw m1, m3
+ phaddw m0, m1
+
+ pmulhrsw m0, m4
+ vpermq m0, m0, q3120
+ movu [r0 + mmsize], m0 ; dst words 16..31
+
+ add r0, 64 ; next dst row: 32 words stored back to back (no dst stride is used)
+ lea r1, [r1 + 2 * r2] ; skip the two src rows just consumed
+ dec r3d
+ jnz .loop
+ RET
+%else
+
INIT_YMM avx2
cglobal scale2D_64to32, 3, 5, 8, dest, src, stride
mov r3d, 16
@@ -4121,6 +4165,7 @@
dec r3d
jnz .loop
RET
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_4x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
More information about the x265-devel
mailing list