[x265] [PATCH] asm: avx2 code for weight_sp() for 8bpp
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Thu Apr 2 11:51:08 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1427968258 -19800
# Thu Apr 02 15:20:58 2015 +0530
# Node ID 7f976e1e89c5940a8bb2f5b965ebd9ed6e6948a6
# Parent ac85c775620f1dcb0df056874633cbf916098bd2
asm: avx2 code for weight_sp() for 8bpp
sse4
weight_sp 16.40x 7768.71 127369.20
avx2
weight_sp 25.83x 4918.74 127040.17
diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 02 15:20:58 2015 +0530
@@ -1604,6 +1604,7 @@
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
p.weight_pp = x265_weight_pp_avx2;
+ p.weight_sp = x265_weight_sp_avx2;
// intra_pred functions
p.cu[BLOCK_8x8].intra_pred[3] = x265_intra_pred_ang8_3_avx2;
diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Apr 02 15:20:58 2015 +0530
@@ -1492,6 +1492,95 @@
dec r5d
jnz .loopH
RET
+
+;-----------------------------------------------------------------------------
+; void weight_sp(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride,
+;                int width, int height, int w0, int round, int shift, int offset)
+;
+; Per sample: dst[x] = sat_u8((((src[x] + pw_2000) * w0 + round) >> shift) + offset)
+; In:  r0 = src, r1 = dst, r2 = srcStride, r3 = dstStride, r4d = width, r5d = height;
+;      w0, round, shift and offset are read from r6m, r7m, r8m and r9m
+; Each .loopW pass loads 16 source words (32 bytes) even when fewer than 16
+; columns remain; only the store is narrowed for the tail (8/4/2 bytes).
+;-----------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal weight_sp, 6, 9, 7
+    mov             r7d, r7m
+    shl             r7d, 16
+    or              r7d, r6m            ; r7d = (round << 16) | w0; w0 must fit in 16 bits or it overlaps round
+    movd            xm0, r7d            ; AVX2 vpbroadcastd cannot take a GPR source, go through xmm
+    vpbroadcastd    m0, xm0             ; m0 = times 8 dw w0, round
+    movd            xm1, r8m            ; m1 = [shift]
+    vpbroadcastd    m2, r9m             ; m2 = times 8 dd offset
+    vpbroadcastw    m3, [pw_1]
+    vpbroadcastw    m4, [pw_2000]
+
+    add             r2, r2              ; 2 * srcstride (int16_t elements -> bytes), full width since stride is intptr_t
+
+    mov             r7, r0              ; r7 = src row base
+    mov             r8, r1              ; r8 = dst row base
+.loopH:
+    mov             r6d, r4d            ; width
+
+    ; restart both cursors at the beginning of the current row
+    mov             r0, r7              ; src
+    mov             r1, r8              ; dst
+.loopW:
+    movu            m5, [r0]
+    paddw           m5, m4
+
+    punpcklwd       m6, m5, m3          ; pair each sample with 1 so pmaddwd yields sample * w0 + round
+    pmaddwd         m6, m0
+    psrad           m6, xm1
+    paddd           m6, m2
+
+    punpckhwd       m5, m3
+    pmaddwd         m5, m0
+    psrad           m5, xm1
+    paddd           m5, m2
+
+    packssdw        m6, m5
+    packuswb        m6, m6
+    vpermq          m6, m6, 10001000b   ; pull qword 0 and qword 2 together: xm6 = 16 output bytes in order
+
+    sub             r6d, 16
+    jl              .width8
+    movu            [r1], xm6           ; the store leaves the flags from sub intact for je
+    je              .nextH
+    add             r0, 32
+    add             r1, 16
+    jmp             .loopW
+
+.width8:
+    add             r6d, 16             ; undo the sub: r6d = remaining columns (< 16)
+    cmp             r6d, 8
+    jl              .width4
+    movq            [r1], xm6
+    je              .nextH
+    psrldq          m6, 8
+    sub             r6d, 8
+    add             r1, 8
+
+.width4:
+    cmp             r6d, 4
+    jl              .width2
+    movd            [r1], xm6
+    je              .nextH
+    add             r1, 4
+    pshufd          m6, m6, 1
+
+.width2:
+    pextrw          [r1], xm6, 0
+
+.nextH:
+    lea             r7, [r7 + r2]
+    lea             r8, [r8 + r3]
+
+    dec             r5d
+    jnz             .loopH
+    RET
+%endif
%endif ; end of (HIGH_BIT_DEPTH == 0)
diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/pixel.h Thu Apr 02 15:20:58 2015 +0530
@@ -272,6 +272,7 @@
int x265_psyCost_ss_16x16_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
int x265_psyCost_ss_32x32_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
int x265_psyCost_ss_64x64_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+void x265_weight_sp_avx2(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
More information about the x265-devel
mailing list