[x265] [PATCH] asm: avx2 code for planecopy_sp
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Apr 10 07:27:03 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1428643601 -19800
# Fri Apr 10 10:56:41 2015 +0530
# Node ID 7421d139835b88615cd939fbc55bd24eaf87970c
# Parent 984e254f93f7cedc5a9b00851d2e14b49dc94e91
asm: avx2 code for planecopy_sp
AVX2:
planecopy_sp 22.19x 5337.07 118407.46
SSE2:
planecopy_sp 14.83x 8106.54 120242.02
diff -r 984e254f93f7 -r 7421d139835b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 09 11:48:08 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 10 10:56:41 2015 +0530
@@ -1552,6 +1552,8 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.planecopy_sp = x265_downShift_16_avx2;
+
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
diff -r 984e254f93f7 -r 7421d139835b source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Apr 09 11:48:08 2015 -0500
+++ b/source/common/x86/pixel-a.asm Fri Apr 10 10:56:41 2015 +0530
@@ -7078,6 +7078,134 @@
.end:
RET
+; Input 16bpp, Output 8bpp
+;-------------------------------------------------------------------------------------------------------------------------------------
+;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;-------------------------------------------------------------------------------------------------------------------------------------
+; Register use: r0 = src, r1 = srcStride (uint16_t units, doubled to bytes below), r2 = dst,
+; r3 = dstStride, r4d = width, r5d = height, r6 = column index / scratch, xm0 = shift count.
+; The 'mask' argument is never read; shifted words are narrowed with packuswb, which saturates.
+; Every row except the last is converted in whole 32-pixel chunks and may therefore read and
+; write up to the next multiple of 32 pixels; only the last row is converted exactly 'width' wide.
+INIT_YMM avx2
+cglobal downShift_16, 6,7,3
+    movd        xm0, r6m                ; xm0 = shift
+    test        r4d, r4d
+    jle         .end                    ; width <= 0: nothing to convert
+    add         r1, r1                  ; srcStride in bytes (full register: stride is intptr_t)
+    dec         r5d                     ; r5d = number of rows handled by the chunked loop
+    jl          .end                    ; height <= 0: nothing to convert
+    jz          .lastRow                ; height == 1: only the exact-width row remains
+
+.loopH:
+    xor         r6d, r6d                ; r6 = column index in pixels
+.loopW:
+    movu        m1, [r0 + r6 * 2 + 0]
+    movu        m2, [r0 + r6 * 2 + 32]
+    psrlw       m1, xm0
+    psrlw       m2, xm0
+    packuswb    m1, m2                  ; packs within each 128-bit lane ...
+    vpermq      m1, m1, 11011000b       ; ... so swap the middle qwords to restore pixel order
+    movu        [r2 + r6], m1
+
+    add         r6d, mmsize
+    cmp         r6d, r4d
+    jl          .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz         .loopH
+
+; processing last row of every frame [to handle a width which is not a multiple of 32]
+.lastRow:
+    mov         r6d, r4d
+    and         r4d, 31                 ; r4d = pixels left over after the 32-pixel chunks
+    shr         r6d, 5                  ; r6d = number of full 32-pixel chunks
+    jz          .tail                   ; width < 32: skip the chunk loop (ZF is set by shr)
+
+.loop32:
+    movu        m1, [r0]
+    movu        m2, [r0 + 32]
+    psrlw       m1, xm0
+    psrlw       m2, xm0
+    packuswb    m1, m2
+    vpermq      m1, m1, 11011000b
+    movu        [r2], m1
+
+    add         r0, 2*mmsize
+    add         r2, mmsize
+    dec         r6d
+    jnz         .loop32
+
+.tail:
+    test        r4d, r4d
+    jz          .end                    ; width was a multiple of 32: the row is complete
+
+    cmp         r4d, 16
+    jl          .process8
+    movu        m1, [r0]                ; 16 pixels
+    psrlw       m1, xm0
+    packuswb    m1, m1
+    vpermq      m1, m1, 10001000b       ; gather qwords 0 and 2 into the low 128 bits
+    movu        [r2], xm1
+
+    add         r0, mmsize
+    add         r2, 16
+    sub         r4d, 16
+    jz          .end
+
+.process8:
+    cmp         r4d, 8
+    jl          .process4
+    movu        xm1, [r0]               ; 8 pixels = 16 bytes; do not read a full ymm here
+    psrlw       m1, xm0
+    packuswb    m1, m1
+    movq        [r2], xm1
+
+    add         r0, 16
+    add         r2, 8
+    sub         r4d, 8
+    jz          .end
+
+.process4:
+    cmp         r4d, 4
+    jl          .process2
+    movq        xm1, [r0]               ; 4 pixels
+    psrlw       m1, xm0
+    packuswb    m1, m1
+    movd        [r2], xm1
+
+    add         r0, 8
+    add         r2, 4
+    sub         r4d, 4
+    jz          .end
+
+.process2:
+    cmp         r4d, 2
+    jl          .process1
+    movd        xm1, [r0]               ; 2 pixels
+    psrlw       m1, xm0
+    packuswb    m1, m1
+    movd        r6d, xm1
+    mov         [r2], r6w
+
+    add         r0, 4
+    add         r2, 2
+    sub         r4d, 2
+    jz          .end
+
+.process1:
+    movzx       r3d, word [r0]          ; exactly one pixel is left (r3/dstStride is dead here)
+    movd        xm1, r3d
+    psrlw       m1, xm0
+    packuswb    m1, m1
+    movd        r3d, xm1
+    mov         [r2], r3b
+.end:
+    RET
+
; Input 8bpp, Output 16bpp
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
diff -r 984e254f93f7 -r 7421d139835b source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Thu Apr 09 11:48:08 2015 -0500
+++ b/source/common/x86/pixel.h Fri Apr 10 10:56:41 2015 +0530
@@ -226,6 +226,7 @@
ADDAVG(addAvg_32x48)
void x265_downShift_16_sse2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void x265_downShift_16_avx2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void x265_upShift_8_sse4(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
int x265_psyCost_pp_4x4_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
int x265_psyCost_pp_8x8_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
diff -r 984e254f93f7 -r 7421d139835b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Apr 09 11:48:08 2015 -0500
+++ b/source/test/pixelharness.cpp Fri Apr 10 10:56:41 2015 +0530
@@ -1018,9 +1018,8 @@
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
-
- int width = 16 + rand() % 48;
- int height = 16 + rand() % 48;
+ int width = 32 + rand() % 32;
+ int height = 32 + rand() % 32;
intptr_t srcStride = 64;
intptr_t dstStride = width;
int j = 0;
More information about the x265-devel
mailing list