[x265] [PATCH] asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Thu Jun 25 10:50:32 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1435220155 -19800
# Thu Jun 25 13:45:55 2015 +0530
# Node ID 26e8eff8eb5abc1c2fa5dd94f59f620c6040caf9
# Parent 430625004ef81ba9e9e398d4cf12a68a1cd4b664
asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
avx2:
planecopy_cp 19.36x 5685.80 110052.08
sse4:
planecopy_cp 9.65x 10660.20 102850.27
diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jun 25 13:34:17 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 13:45:55 2015 +0530
@@ -1497,6 +1497,7 @@
p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
p.weight_pp = PFX(weight_pp_avx2);
p.sign = x265_calculateSign_avx2;
+ p.planecopy_cp = PFX(upShift_8_avx2);
p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Jun 25 13:34:17 2015 +0530
+++ b/source/common/x86/pixel-a.asm Thu Jun 25 13:45:55 2015 +0530
@@ -7388,6 +7388,96 @@
.end:
RET
+;---------------------------------------------------------------------------------------------------------------------
+;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+;
+; Widens every 8-bit source sample to a 16-bit pixel and shifts it left by 'shift'.
+;
+; Register roles (x86inc numbering):
+;   r0  = src         r1  = srcStride (bytes)
+;   r2  = dst         r3  = dstStride (pixels on entry, bytes after doubling)
+;   r4d = width       r5d = height
+;   r6d = shift on entry, scratch afterwards
+;   r7  = column offset inside the current row
+;   xm2 = shift count for psllw
+;
+; Rows 0 .. height-2 are converted 32 pixels at a time and may run up to 31 pixels
+; past 'width'; this assumes the overrun stays inside the row stride / the next
+; row (positive strides >= width) -- TODO confirm against callers.
+; The last row has nothing behind it, so it is converted with exact-size accesses.
+;---------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal upShift_8, 7,8,3
+    movd        xm2, r6d                    ; xm2 = shift (upper bits zeroed)
+    add         r3, r3                      ; dstStride: pixels -> bytes
+
+    ; nothing to do for an empty plane
+    test        r4d, r4d
+    jle         .end
+    test        r5d, r5d
+    jle         .end
+
+    ; keep the last row out of the over-running loop
+    dec         r5d
+    jz          .lastRow
+
+.loopH:
+    xor         r7d, r7d                    ; column offset = 0
+    mov         r6d, r4d                    ; r6d = pixels left in this row
+.loopW:
+    pmovzxbw    m0, [r0 + r7]
+    pmovzxbw    m1, [r0 + r7 + mmsize/2]
+    psllw       m0, xm2
+    psllw       m1, xm2
+    movu        [r2 + r7 * 2], m0
+    movu        [r2 + r7 * 2 + mmsize], m1
+
+    add         r7d, mmsize                 ; 32 source pixels consumed
+    sub         r6d, mmsize
+    jg          .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz         .loopH
+
+; last row of the plane: exact-size accesses only, so a width that is not a
+; multiple of 16 never reads or writes past the end of the buffers
+.lastRow:
+    sub         r4d, 16
+    jl          .tail                       ; fewer than 16 pixels in the row
+
+.loop16:
+    pmovzxbw    m0, [r0]                    ; 16 bytes -> 16 words
+    psllw       m0, xm2
+    movu        [r2], m0
+
+    add         r0, mmsize/2                ; 16 source bytes
+    add         r2, mmsize                  ; 16 destination pixels
+    sub         r4d, 16
+    jge         .loop16
+
+.tail:
+    ; r4d = remaining - 16 here, so its low four bits are the remaining count (0..15)
+    and         r4d, 15
+    jz          .end
+
+    test        r4d, 8
+    jz          .process4
+    movq        xm0, [r0]                   ; exactly 8 source bytes
+    pmovzxbw    xm0, xm0
+    psllw       xm0, xm2
+    movu        [r2], xm0                   ; exactly 8 pixels
+
+    add         r0, 8
+    add         r2, 16
+
+.process4:
+    test        r4d, 4
+    jz          .process2
+    movd        xm0, [r0]                   ; exactly 4 source bytes
+    pmovzxbw    xm0, xm0
+    psllw       xm0, xm2
+    movq        [r2], xm0                   ; exactly 4 pixels
+
+    add         r0, 4
+    add         r2, 8
+
+.process2:
+    test        r4d, 2
+    jz          .process1
+    movzx       r6d, word [r0]              ; exactly 2 source bytes
+    movd        xm0, r6d
+    pmovzxbw    xm0, xm0
+    psllw       xm0, xm2                    ; use the caller's shift, not a constant
+    movd        [r2], xm0                   ; exactly 2 pixels
+
+    add         r0, 2
+    add         r2, 4
+
+.process1:
+    test        r4d, 1
+    jz          .end
+    movzx       r6d, byte [r0]
+    movd        xm0, r6d
+    psllw       xm0, xm2                    ; use the caller's shift, not a constant
+    movd        r6d, xm0
+    mov         [r2], r6w                   ; exactly 1 pixel
+.end:
+    RET
+%endif
+
%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
pabsd %1, %3
diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Thu Jun 25 13:34:17 2015 +0530
+++ b/source/common/x86/pixel.h Thu Jun 25 13:45:55 2015 +0530
@@ -31,6 +31,7 @@
void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
#define DECL_PIXELS(cpu) \
FUNCDEF_PU(int, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
More information about the x265-devel
mailing list