[x265] [PATCH] asm: scale2D_64to32 avx2 code
dnyaneshwar at multicorewareinc.com
Fri Mar 27 06:57:59 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1427435844 -19800
# Fri Mar 27 11:27:24 2015 +0530
# Branch stable
# Node ID 694ce8986d7e83c8fca7e7a421c29b21b423ffbf
# Parent 3d0f23cb0e58585e490362587022e67cfded143a
asm: scale2D_64to32 avx2 code
AVX2:
scale2D_64to32 10.41x 3861.30 40192.99
scale2D_64to32 10.35x 3880.97 40175.66
SSSE3:
scale2D_64to32 5.44x 7454.44 40576.51
scale2D_64to32 5.45x 7445.73 40613.14
diff -r 3d0f23cb0e58 -r 694ce8986d7e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 26 15:09:51 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Mar 27 11:27:24 2015 +0530
@@ -1447,6 +1447,8 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.scale2D_64to32 = x265_scale2D_64to32_avx2;
+
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
diff -r 3d0f23cb0e58 -r 694ce8986d7e source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Thu Mar 26 15:09:51 2015 -0500
+++ b/source/common/x86/pixel-util.h Fri Mar 27 11:27:24 2015 +0530
@@ -76,6 +76,7 @@
void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t);
void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
+void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
diff -r 3d0f23cb0e58 -r 694ce8986d7e source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Mar 26 15:09:51 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Fri Mar 27 11:27:24 2015 +0530
@@ -40,16 +40,17 @@
ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
%endif
-mask_ff: times 16 db 0xff
- times 16 db 0
-deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
-deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
-hmul_16p: times 16 db 1
- times 8 db 1, -1
-hmulw_16p: times 8 dw 1
- times 4 dw 1, -1
-
-trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+
+mask_ff: times 16 db 0xff
+ times 16 db 0
+deinterleave_shuf: times 2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+deinterleave_word_shuf: times 2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+hmul_16p: times 16 db 1
+ times 8 db 1, -1
+hmulw_16p: times 8 dw 1
+ times 4 dw 1, -1
+
+trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
SECTION .text
@@ -3944,6 +3945,105 @@
RET
%endif
+INIT_YMM avx2
+cglobal scale2D_64to32, 3, 5, 8, dest, src, stride
+ mov r3d, 16
+ mova m7, [deinterleave_shuf]
+.loop:
+ movu m0, [r1] ; i
+ lea r4, [r1 + r2 * 2]
+ psrlw m1, m0, 8 ; j
+ movu m2, [r1 + r2] ; k
+ psrlw m3, m2, 8 ; l
+
+ pxor m4, m0, m1 ; i^j
+ pxor m5, m2, m3 ; k^l
+ por m4, m5 ; ij|kl
+
+ pavgb m0, m1 ; s
+ pavgb m2, m3 ; t
+ mova m5, m0
+ pavgb m0, m2 ; (s+t+1)/2
+ pxor m5, m2 ; s^t
+ pand m4, m5 ; (ij|kl)&st
+ pand m4, [pb_1]
+ psubb m0, m4 ; Result
+
+ movu m1, [r1 + 32] ; i
+ psrlw m2, m1, 8 ; j
+ movu m3, [r1 + r2 + 32] ; k
+ psrlw m4, m3, 8 ; l
+
+ pxor m5, m1, m2 ; i^j
+ pxor m6, m3, m4 ; k^l
+ por m5, m6 ; ij|kl
+
+ pavgb m1, m2 ; s
+ pavgb m3, m4 ; t
+ mova m6, m1
+ pavgb m1, m3 ; (s+t+1)/2
+ pxor m6, m3 ; s^t
+ pand m5, m6 ; (ij|kl)&st
+ pand m5, [pb_1]
+ psubb m1, m5 ; Result
+
+ pshufb m0, m0, m7
+ pshufb m1, m1, m7
+
+ punpcklqdq m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r0], m0
+
+ add r0, 32
+
+ movu m0, [r4] ; i
+ psrlw m1, m0, 8 ; j
+ movu m2, [r4 + r2] ; k
+ psrlw m3, m2, 8 ; l
+
+ pxor m4, m0, m1 ; i^j
+ pxor m5, m2, m3 ; k^l
+ por m4, m5 ; ij|kl
+
+ pavgb m0, m1 ; s
+ pavgb m2, m3 ; t
+ mova m5, m0
+ pavgb m0, m2 ; (s+t+1)/2
+ pxor m5, m2 ; s^t
+ pand m4, m5 ; (ij|kl)&st
+ pand m4, [pb_1]
+ psubb m0, m4 ; Result
+
+ movu m1, [r4 + 32] ; i
+ psrlw m2, m1, 8 ; j
+ movu m3, [r4 + r2 + 32] ; k
+ psrlw m4, m3, 8 ; l
+
+ pxor m5, m1, m2 ; i^j
+ pxor m6, m3, m4 ; k^l
+ por m5, m6 ; ij|kl
+
+ pavgb m1, m2 ; s
+ pavgb m3, m4 ; t
+ mova m6, m1
+ pavgb m1, m3 ; (s+t+1)/2
+ pxor m6, m3 ; s^t
+ pand m5, m6 ; (ij|kl)&st
+ pand m5, [pb_1]
+ psubb m1, m5 ; Result
+
+ pshufb m0, m0, m7
+ pshufb m1, m1, m7
+
+ punpcklqdq m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r0], m0
+
+ lea r1, [r1 + 4 * r2]
+ add r0, 32
+ dec r3d
+ jnz .loop
+ RET
;-----------------------------------------------------------------------------
; void pixel_sub_ps_4x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
More information about the x265-devel mailing list