[x265] [PATCH] asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Fri Jun 26 11:44:39 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1435311677 -19800
#      Fri Jun 26 15:11:17 2015 +0530
# Node ID 818b70b015513a01993af0c48e4714cf4fd8dc84
# Parent  956401f1a679f1e71181b704d64e4acdb6f1a93f
asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)

avx2:
planecopy_cp  19.36x   5685.80         110052.08

sse4:
planecopy_cp  9.65x    10660.20        102850.27

diff -r 956401f1a679 -r 818b70b01551 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jun 26 15:01:16 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jun 26 15:11:17 2015 +0530
@@ -1522,6 +1522,7 @@
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
         p.weight_pp = PFX(weight_pp_avx2);
         p.sign = PFX(calSign_avx2);
+        p.planecopy_cp = PFX(upShift_8_avx2);
 
         p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
         p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Fri Jun 26 15:01:16 2015 +0530
+++ b/source/common/x86/pixel-a.asm	Fri Jun 26 15:11:17 2015 +0530
@@ -7388,6 +7388,96 @@
 .end:
     RET
 
+;---------------------------------------------------------------------------------------------------------------------
+;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+;
+; dst[x] = src[x] << shift for each pixel of a width x height plane; src holds bytes, dst is written as words.
+; In:  r0 = src, r1 = srcStride (bytes), r2 = dst, r3 = dstStride (pixels), r4d = width, r5d = height, r6d = shift
+; Rows 0..height-2 are copied 32 pixels per step and may read/write up to 31 pixels past 'width', which relies on
+; the following row (or row padding) being addressable. The last row never touches anything beyond 'width'.
+;---------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal upShift_8, 7,8,3
+    movd        xm2, r6d                    ; xm2 = shift count (r6 is reused as a counter below)
+    add         r3, r3                      ; dstStride: pixels -> bytes
+    dec         r5d                         ; keep the last row out of the bulk loop
+    jl          .end                        ; height <= 0: nothing to copy
+    jz          .lastRow                    ; height == 1: only the exact-width path runs
+
+.loopH:
+    xor         r7d, r7d                    ; r7 = pixel offset inside the current row
+    mov         r6d, r4d                    ; r6d = pixels still to convert in this row
+.loopW:
+    pmovzxbw    m0, [r0 + r7]
+    pmovzxbw    m1, [r0 + r7 + mmsize / 2]
+    psllw       m0, xm2
+    psllw       m1, xm2
+    movu        [r2 + r7 * 2], m0
+    movu        [r2 + r7 * 2 + mmsize], m1
+    add         r7d, mmsize
+    sub         r6d, mmsize
+    jg          .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz         .loopH
+
+; last row of the plane: split 'width' into 16/8/4/2/1 pixel pieces so no access goes past the end of the row
+.lastRow:
+    sub         r4d, 16                     ; bias by 16 so the sign tells whether a full group is left
+    jl          .tail
+.loop16:
+    pmovzxbw    m0, [r0]                    ; 16 bytes -> 16 words
+    psllw       m0, xm2
+    movu        [r2], m0
+    add         r0, 16
+    add         r2, 32
+    sub         r4d, 16
+    jge         .loop16
+.tail:
+    add         r4d, 16                     ; undo the bias: r4d = pixels left (0..15 when width >= 0)
+    jle         .end
+    test        r4d, 8
+    jz          .process4
+    pmovzxbw    xm0, [r0]                   ; 8 bytes -> 8 words
+    psllw       xm0, xm2
+    movu        [r2], xm0
+    add         r0, 8
+    add         r2, 16
+.process4:
+    test        r4d, 4
+    jz          .process2
+    movd        xm0, [r0]                   ; load exactly 4 source bytes
+    pmovzxbw    xm0, xm0
+    psllw       xm0, xm2
+    movq        [r2], xm0
+    add         r0, 4
+    add         r2, 8
+.process2:
+    test        r4d, 2
+    jz          .process1
+    movzx       r3d, word [r0]              ; r3 (dstStride) is dead in the last row, use it as scratch
+    movd        xm0, r3d
+    pmovzxbw    xm0, xm0
+    psllw       xm0, xm2                    ; shift by the 'shift' argument, not a hard-coded 2
+    movd        [r2], xm0
+    add         r0, 2
+    add         r2, 4
+.process1:
+    test        r4d, 1
+    jz          .end
+    movzx       r3d, byte [r0]
+    movd        xm0, r3d
+    psllw       xm0, xm2
+    movd        r3d, xm0
+    mov         [r2], r3w
+.end:
+    RET
+%endif
+
 %macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
 %if cpuflag(ssse3)
     pabsd   %1, %3
diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Fri Jun 26 15:01:16 2015 +0530
+++ b/source/common/x86/pixel.h	Fri Jun 26 15:11:17 2015 +0530
@@ -31,6 +31,7 @@
 void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 
 #define DECL_PIXELS(cpu) \
     FUNCDEF_PU(int, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \


More information about the x265-devel mailing list